From 6afe1a1fe8ff83f6ac2726b04665e76ba7b14f3e Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 13 Mar 2008 23:52:49 +0100 Subject: PM: Remove legacy PM AFAICT pm_send_all is a nop when noone uses pm_register... Hmm.. can we just force CONFIG_PM_LEGACY=n, and see what happens? Or maybe this is better idea? It may break build somewhere, but it should be easy to fix... (it builds here, i386 and x86-64). Signed-off-by: Pavel Machek Acked-by: Ralf Baechle Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/Kconfig | 10 --- kernel/power/Makefile | 1 - kernel/power/pm.c | 205 -------------------------------------------------- 3 files changed, 216 deletions(-) delete mode 100644 kernel/power/pm.c (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6233f3b4ae6..b45da40e8d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -19,16 +19,6 @@ config PM will issue the hlt instruction if nothing is to be done, thereby sending the processor to sleep and saving power. -config PM_LEGACY - bool "Legacy Power Management API (DEPRECATED)" - depends on PM - default n - ---help--- - Support for pm_register() and friends. This old API is obsoleted - by the driver model. - - If unsure, say N. - config PM_DEBUG bool "Power Management Debug Support" depends on PM diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f7dfff28ecd..597823b5b70 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o -obj-$(CONFIG_PM_LEGACY) += pm.o obj-$(CONFIG_PM_SLEEP) += process.o console.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o diff --git a/kernel/power/pm.c b/kernel/power/pm.c deleted file mode 100644 index 60c73fa670d..00000000000 --- a/kernel/power/pm.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * pm.c - Power management interface - * - * Copyright (C) 2000 Andrew Henroid - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Locking notes: - * pm_devs_lock can be a semaphore providing pm ops are not called - * from an interrupt handler (already a bad idea so no change here). Each - * change must be protected so that an unlink of an entry doesn't clash - * with a pm send - which is permitted to sleep in the current architecture - * - * Module unloads clashing with pm events now work out safely, the module - * unload path will block until the event has been sent. It may well block - * until a resume but that will be fine. 
- */ - -static DEFINE_MUTEX(pm_devs_lock); -static LIST_HEAD(pm_devs); - -/** - * pm_register - register a device with power management - * @type: device type - * @id: device ID - * @callback: callback function - * - * Add a device to the list of devices that wish to be notified about - * power management events. A &pm_dev structure is returned on success, - * on failure the return is %NULL. - * - * The callback function will be called in process context and - * it may sleep. - */ - -struct pm_dev *pm_register(pm_dev_t type, - unsigned long id, - pm_callback callback) -{ - struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); - if (dev) { - dev->type = type; - dev->id = id; - dev->callback = callback; - - mutex_lock(&pm_devs_lock); - list_add(&dev->entry, &pm_devs); - mutex_unlock(&pm_devs_lock); - } - return dev; -} - -/** - * pm_send - send request to a single device - * @dev: device to send to - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a given device. The - * %PM_SUSPEND and %PM_RESUME events are handled specially. The - * data field must hold the intended next state. No call is made - * if the state matches. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - * - * WARNING: Calling pm_send directly is not generally recommended, in - * particular there is no locking against the pm_dev going away. The - * caller must maintain all needed locking or have 'inside knowledge' - * on the safety. Also remember that this function is not locked against - * pm_unregister. This means that you must handle SMP races on callback - * execution and unload yourself. - */ - -static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) -{ - int status = 0; - unsigned long prev_state, next_state; - - if (in_interrupt()) - BUG(); - - switch (rqst) { - case PM_SUSPEND: - case PM_RESUME: - prev_state = dev->state; - next_state = (unsigned long) data; - if (prev_state != next_state) { - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - if (!status) { - dev->state = next_state; - dev->prev_state = prev_state; - } - } - else { - dev->prev_state = prev_state; - } - break; - default: - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - break; - } - return status; -} - -/* - * Undo incomplete request - */ -static void pm_undo_all(struct pm_dev *last) -{ - struct list_head *entry = last->entry.prev; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->state != dev->prev_state) { - /* previous state was zero (running) resume or - * previous state was non-zero (suspended) suspend - */ - pm_request_t undo = (dev->prev_state - ? PM_SUSPEND:PM_RESUME); - pm_send(dev, undo, (void*) dev->prev_state); - } - entry = entry->prev; - } -} - -/** - * pm_send_all - send request to all managed devices - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a all devices. The - * %PM_SUSPEND events are handled specially. Any device is - * permitted to fail a suspend by returning a non zero (error) - * value from its callback function. If any device vetoes a - * suspend request then all other devices that have suspended - * during the processing of this request are restored to their - * previous state. - * - * WARNING: This function takes the pm_devs_lock. The lock is not dropped until - * the callbacks have completed. 
This prevents races against pm locking - * functions, races against module unload pm_unregister code. It does - * mean however that you must not issue pm_ functions within the callback - * or you will deadlock and users will hate you. - * - * Zero is returned on success. If a suspend fails then the status - * from the device that vetoes the suspend is returned. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - */ - -int pm_send_all(pm_request_t rqst, void *data) -{ - struct list_head *entry; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->callback) { - int status = pm_send(dev, rqst, data); - if (status) { - /* return devices to previous state on - * failed suspend request - */ - if (rqst == PM_SUSPEND) - pm_undo_all(dev); - mutex_unlock(&pm_devs_lock); - return status; - } - } - entry = entry->next; - } - mutex_unlock(&pm_devs_lock); - return 0; -} - -EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_send_all); - -- cgit v1.2.3-70-g09d2 From d7b906897e9caae452947e33674df0a2d6f7e10f Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 17 Apr 2008 07:46:24 +0200 Subject: [S390] genirq/clockevents: move irq affinity prototypes/inlines to interrupt.h > Generic code is not supposed to include irq.h. Replace this include > by linux/hardirq.h instead and add/replace an include of linux/irq.h > in asm header files where necessary. > This change should only matter for architectures that make use of > GENERIC_CLOCKEVENTS. > Architectures in question are mips, x86, arm, sh, powerpc, uml and sparc64. > > I did some cross compile tests for mips, x86_64, arm, powerpc and sparc64. > This patch fixes also build breakages caused by the include replacement in > tick-common.h. I generally dislike adding optional linux/* includes in asm/* includes - I'm nervous about this causing include loops. However, there's a separate point to be discussed here. That is, what interfaces are expected of every architecture in the kernel. If generic code wants to be able to set the affinity of interrupts, then that needs to become part of the interfaces listed in linux/interrupt.h rather than linux/irq.h. So what I suggest is this approach instead (against Linus' tree of a couple of days ago) - we move irq_set_affinity() and irq_can_set_affinity() to linux/interrupt.h, change the linux/irq.h includes to linux/interrupt.h and include asm/irq_regs.h where needed (asm/irq_regs.h is supposed to be rarely used include since not much touches the stacked parent context registers.) Build tested on ARM PXA family kernels and ARM's Realview platform kernels which both use genirq. 
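[Editorial sketch, not part of the patch: with the prototypes in <linux/interrupt.h>, generic code can pin an interrupt to a CPU without pulling in <linux/irq.h>. The helper name and the way the irq number is obtained below are hypothetical; only irq_can_set_affinity() and irq_set_affinity() come from the patch.]

#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/* Hypothetical helper: bind an already-requested irq to one CPU. */
static int example_bind_irq_to_cpu(unsigned int irq, int cpu)
{
	/* Stubbed to 0 on !SMP / !GENERIC_HARDIRQS builds, see above. */
	if (!irq_can_set_affinity(irq))
		return -EIO;

	/* cpumask_of_cpu() builds a cpumask_t containing only 'cpu'. */
	return irq_set_affinity(irq, cpumask_of_cpu(cpu));
}
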
[ tglx@linutronix.de: add GENERIC_HARDIRQ dependencies ] Signed-off-by: Russell King Signed-off-by: Thomas Gleixner Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- include/linux/interrupt.h | 19 +++++++++++++++++++ include/linux/irq.h | 10 ---------- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 4 +++- kernel/time/tick-oneshot.c | 2 +- 5 files changed, 24 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f8ab4ce7056..b5fef13148b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -102,6 +102,25 @@ extern void disable_irq_nosync(unsigned int irq); extern void disable_irq(unsigned int irq); extern void enable_irq(unsigned int irq); +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) + +extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); +extern int irq_can_set_affinity(unsigned int irq); + +#else /* CONFIG_SMP */ + +static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +{ + return -EINVAL; +} + +static inline int irq_can_set_affinity(unsigned int irq) +{ + return 0; +} + +#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */ + #ifdef CONFIG_GENERIC_HARDIRQS /* * Special lockdep variants of irq disabling/enabling. diff --git a/include/linux/irq.h b/include/linux/irq.h index 176e5e790a4..1883a85625d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -228,21 +228,11 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask) #endif /* CONFIG_GENERIC_PENDING_IRQ */ -extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); -extern int irq_can_set_affinity(unsigned int irq); - #else /* CONFIG_SMP */ #define move_native_irq(x) #define move_masked_irq(x) -static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask) -{ - return -EINVAL; -} - -static inline int irq_can_set_affinity(unsigned int irq) { return 0; } - #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQBALANCE diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e1bd50cbbf5..fdfa0c745bb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1bea399a9ef..4f3886562b8 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -14,12 +14,14 @@ #include #include #include -#include +#include #include #include #include #include +#include + #include "tick-internal.h" /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0258d3115d5..450c04935b6 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 029a07e0311c7fef968d44b50beca53969cee40b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 10 Feb 2008 09:17:43 +0100 Subject: hrtimer: use nanosleep specific restart_block fields Convert all the nanosleep related users of restart_block to the new nanosleep specific restart_block fields. 
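[Editorial note, assumed context: the companion change to <linux/thread_info.h> is not in this diff. The accesses below (restart->nanosleep.index / .rmtp / .compat_rmtp / .expires) imply a nanosleep member of struct restart_block roughly like this sketch; exact field types may differ from the authoritative definition.]

/* Assumed layout, inferred from kernel/hrtimer.c and kernel/compat.c usage. */
struct restart_block {
	long (*fn)(struct restart_block *);
	union {
		/* For hrtimer_nanosleep() and clock_nanosleep() restarts */
		struct {
			clockid_t index;		/* clock base of the timer */
			struct timespec __user *rmtp;	/* where to put remaining time */
#ifdef CONFIG_COMPAT
			struct compat_timespec __user *compat_rmtp;
#endif
			u64 expires;			/* absolute expiry (ktime .tv64) */
		} nanosleep;
		/* ... other restart users (futex, legacy arg0..arg3) elided ... */
	};
};
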
Signed-off-by: Thomas Gleixner --- kernel/compat.c | 15 +++++++-------- kernel/hrtimer.c | 13 ++++++------- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 5f0e201bcfd..9c48abfcd4a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart) mm_segment_t oldfs; long ret; - rmtp = (struct compat_timespec __user *)(restart->arg1); - restart->arg1 = (unsigned long)&rmt; + restart->nanosleep.rmtp = (struct timespec __user *) &rmt; oldfs = get_fs(); set_fs(KERNEL_DS); ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); if (ret) { - restart->arg1 = (unsigned long)rmtp; + rmtp = restart->nanosleep.compat_rmtp; if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; @@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, = ¤t_thread_info()->restart_block; restart->fn = compat_nanosleep_restart; - restart->arg1 = (unsigned long)rmtp; + restart->nanosleep.compat_rmtp = rmtp; if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; @@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); + struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; - restart->arg1 = (unsigned long) &tu; + restart->nanosleep.rmtp = (struct timespec __user *) &tu; oldfs = get_fs(); set_fs(KERNEL_DS); err = clock_nanosleep_restart(restart); @@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) if (err == -ERESTART_RESTARTBLOCK) { restart->fn = compat_clock_nanosleep_restart; - restart->arg1 = (unsigned long) rmtp; + restart->nanosleep.compat_rmtp = rmtp; } return err; } @@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, if (err == -ERESTART_RESTARTBLOCK) { restart = ¤t_thread_info()->restart_block; restart->fn = compat_clock_nanosleep_restart; - restart->arg1 = (unsigned long) rmtp; + restart->nanosleep.compat_rmtp = rmtp; } return err; } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 98bee013f71..911e87d0440 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1354,13 +1354,13 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; struct timespec __user *rmtp; - hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); - t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; + hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); + t.timer.expires.tv64 = restart->nanosleep.expires; if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; - rmtp = (struct timespec __user *)restart->arg1; + rmtp = restart->nanosleep.rmtp; if (rmtp) { int ret = update_rmtp(&t.timer, rmtp); if (ret <= 0) @@ -1394,10 +1394,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, restart = ¤t_thread_info()->restart_block; restart->fn = hrtimer_nanosleep_restart; - restart->arg0 = (unsigned long) t.timer.base->index; - restart->arg1 = (unsigned long) rmtp; - restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; - restart->arg3 = t.timer.expires.tv64 >> 32; + restart->nanosleep.index = t.timer.base->index; + restart->nanosleep.rmtp = rmtp; + restart->nanosleep.expires = t.timer.expires.tv64; return -ERESTART_RESTARTBLOCK; } -- cgit v1.2.3-70-g09d2 From d59b949f771eb3cbe50865c72e13e2a0a8d4d781 Mon Sep 17 
00:00:00 2001 From: Pavel Machek Date: Tue, 5 Feb 2008 00:48:13 +0100 Subject: timer_list: add annotations to workqueue.c Add timer list annotations to workqueue.c so we can see the call site in the timer stats. Signed-off-by: Pavel Machek Signed-off-by: Thomas Gleixner --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ff06611655a..00ff4d08e37 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -219,6 +219,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + timer_stats_timer_set_start_info(&dwork->timer); if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); @@ -580,6 +581,7 @@ EXPORT_SYMBOL(schedule_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { + timer_stats_timer_set_start_info(&dwork->timer); return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); -- cgit v1.2.3-70-g09d2 From ee7dd205b5cdbc3231d48e38641efd05f572c52a Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 4 Apr 2008 20:54:10 +0200 Subject: posix-timers: fix shadowed variables Fix sparse warnings like this: kernel/posix-cpu-timers.c:1090:25: warning: symbol 't' shadows an earlier one kernel/posix-cpu-timers.c:1058:21: originally declared here Signed-off-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2eae91f954c..ae5c6c147c4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1087,45 +1087,45 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { - prof_expires = t->expires.cpu; + if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { + prof_expires = tl->expires.cpu; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } ++timers; maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { - virt_expires = t->expires.cpu; + if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { + virt_expires = tl->expires.cpu; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } ++timers; maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < t->expires.sched) { - sched_expires = t->expires.sched; + if (!--maxfire || sum_sched_runtime < tl->expires.sched) { + sched_expires = tl->expires.sched; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); 
} /* -- cgit v1.2.3-70-g09d2 From 0d180406f2914aea3a78ddb880e2fe9ac78a9372 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Apr 2008 20:54:10 +0200 Subject: timers: simplify lockdep handling In order to avoid the false positive from lockdep, each per-cpu base->lock has the separate lock class and migrate_timers() uses double_spin_lock(). This all is overcomplicated: except for migrate_timers() we never take 2 locks at once, and migrate_timers() can use spin_lock_nested(). Signed-off-by: Oleg Nesterov Cc: Arjan van de Ven Cc: Heiko Carstens Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index b024106daa7..f3d35d4ea42 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1228,13 +1228,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) return 0; } -/* - * lockdep: we want to track each per-CPU base as a separate lock-class, - * but timer-bases are kmalloc()-ed, so we need to attach separate - * keys to them: - */ -static struct lock_class_key base_lock_keys[NR_CPUS]; - static int __cpuinit init_timers_cpu(int cpu) { int j; @@ -1277,7 +1270,6 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); - lockdep_set_class(&base->lock, base_lock_keys + cpu); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1316,8 +1308,8 @@ static void __cpuinit migrate_timers(int cpu) new_base = get_cpu_var(tvec_bases); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_lock(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1330,8 +1322,8 @@ static void __cpuinit migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } - double_spin_unlock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(tvec_bases); } -- cgit v1.2.3-70-g09d2 From 8e60e05fdc7344415fa69a3883b11f65db967b47 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Apr 2008 20:54:10 +0200 Subject: hrtimers: simplify lockdep handling In order to avoid the false positive from lockdep, each per-cpu base->lock has the separate lock class and migrate_hrtimers() uses double_spin_lock(). This is overcomplicated: except for migrate_hrtimers() we never take 2 locks at once, and migrate_hrtimers() can use spin_lock_nested(). 
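[Editorial sketch of the pattern, with hypothetical structure and function names: when two locks of the same lock class must be held together, the inner acquisition is annotated with spin_lock_nested() so lockdep treats the nesting as deliberate. Real deadlock avoidance still comes from the surrounding code, which in the patch runs with local interrupts disabled and only against a dead CPU's base.]

#include <linux/spinlock.h>

struct tbase {
	spinlock_t lock;
	/* ... per-CPU timer lists ... */
};

/* Caller is expected to have local interrupts disabled, as in the patch. */
static void tbase_migrate(struct tbase *new_base, struct tbase *old_base)
{
	spin_lock(&new_base->lock);
	/* Same lock class as new_base->lock: tell lockdep this nesting is OK. */
	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

	/* ... move entries from old_base to new_base ... */

	spin_unlock(&old_base->lock);
	spin_unlock(&new_base->lock);
}
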
Signed-off-by: Oleg Nesterov Cc: Arjan van de Ven Cc: Heiko Carstens Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 -- kernel/hrtimer.c | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1ad56a7b2f7..56f3236da82 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -173,7 +173,6 @@ struct hrtimer_clock_base { * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers - * @lock_key: the lock_class_key for use with lockdep * @clock_base: array of clock bases for this cpu * @curr_timer: the timer which is executing a callback right now * @expires_next: absolute time of the next event which was scheduled @@ -189,7 +188,6 @@ struct hrtimer_clock_base { */ struct hrtimer_cpu_base { spinlock_t lock; - struct lock_class_key lock_key; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; struct list_head cb_pending; #ifdef CONFIG_HIGH_RES_TIMERS diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 911e87d0440..c642ef75069 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1424,7 +1424,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) int i; spin_lock_init(&cpu_base->lock); - lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; @@ -1465,16 +1464,16 @@ static void migrate_hrtimers(int cpu) tick_cancel_sched_timer(cpu); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_lock(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - double_spin_unlock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); } -- cgit v1.2.3-70-g09d2 From 903b8a8d4835a796f582033802c83283886f4a3d Mon Sep 17 00:00:00 2001 From: Karsten Wiese Date: Thu, 28 Feb 2008 15:10:50 +0100 Subject: clockevents: optimise tick_nohz_stop_sched_tick() a bit Call ts = &per_cpu(tick_cpu_sched, cpu); and cpu = smp_processor_id(); once instead of twice. No functional change done, as changed code runs with local irq off. Reduces source lines and text size (20bytes on x86_64). 
[ akpm@linux-foundation.org: Build fix ] Signed-off-by: Karsten Wiese Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 686da821d37..69dba0c7172 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu) } } -static ktime_t tick_nohz_start_idle(int cpu) +static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, delta; now = ktime_get(); @@ -201,8 +200,8 @@ void tick_nohz_stop_sched_tick(void) local_irq_save(flags); cpu = smp_processor_id(); - now = tick_nohz_start_idle(cpu); ts = &per_cpu(tick_cpu_sched, cpu); + now = tick_nohz_start_idle(ts); /* * If this cpu is offline and it is the one which updates @@ -222,7 +221,6 @@ void tick_nohz_stop_sched_tick(void) if (need_resched()) goto end; - cpu = smp_processor_id(); if (unlikely(local_softirq_pending())) { static int ratelimit; -- cgit v1.2.3-70-g09d2 From 6993fc5bbc5d63ccd55985b39c34417e430e75e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 30 Jan 2008 13:30:02 +0100 Subject: clocksource: make clocksource watchdog cycle through online CPUs This way it checks if the clocks are synchronized between CPUs too. This might be able to detect slowly drifting TSCs which only go wrong over longer time. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7f60097d443..912156dd600 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,8 +141,16 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - __mod_timer(&watchdog_timer, - watchdog_timer.expires + WATCHDOG_INTERVAL); + /* + * Cycle through CPUs to check if the CPUs stay + * synchronized to each other. + */ + int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); + + if (next_cpu >= NR_CPUS) + next_cpu = first_cpu(cpu_online_map); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); } spin_unlock(&watchdog_lock); } @@ -164,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer(&watchdog_timer); + add_timer_on(&watchdog_timer, + first_cpu(cpu_online_map)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -185,7 +194,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer(&watchdog_timer); + add_timer_on(&watchdog_timer, + first_cpu(cpu_online_map)); } } } -- cgit v1.2.3-70-g09d2 From 64ac24e738823161693bf791f87adc802cf529ff Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 7 Mar 2008 21:55:58 -0500 Subject: Generic semaphore implementation Semaphores are no longer performance-critical, so a generic C implementation is better for maintainability, debuggability and extensibility. Thanks to Peter Zijlstra for fixing the lockdep warning. Thanks to Harvey Harrison for pointing out that the unlikely() was unnecessary. 
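[Editorial usage sketch, not part of the patch: all architectures now share one counting-semaphore API from the new <linux/semaphore.h>. The device structure and function names below are made up; only sema_init(), down_interruptible() and up() are assumed from the generic implementation.]

#include <linux/semaphore.h>
#include <linux/errno.h>

struct foo_dev {
	struct semaphore sem;
	/* ... */
};

static void foo_dev_init(struct foo_dev *dev)
{
	sema_init(&dev->sem, 1);		/* count of 1: mutex-style use */
}

static int foo_dev_work(struct foo_dev *dev)
{
	if (down_interruptible(&dev->sem))	/* sleeps; non-zero if signalled */
		return -ERESTARTSYS;

	/* ... section protected by the semaphore ... */

	up(&dev->sem);
	return 0;
}
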
Signed-off-by: Matthew Wilcox Acked-by: Ingo Molnar --- arch/alpha/kernel/Makefile | 2 +- arch/alpha/kernel/alpha_ksyms.c | 9 -- arch/alpha/kernel/semaphore.c | 224 --------------------------- arch/arm/kernel/Makefile | 2 +- arch/arm/kernel/semaphore.c | 221 --------------------------- arch/avr32/kernel/Makefile | 2 +- arch/avr32/kernel/semaphore.c | 148 ------------------ arch/blackfin/Kconfig | 4 - arch/blackfin/kernel/bfin_ksyms.c | 5 - arch/cris/kernel/Makefile | 3 +- arch/cris/kernel/crisksyms.c | 7 - arch/cris/kernel/semaphore.c | 129 ---------------- arch/frv/kernel/Makefile | 2 +- arch/frv/kernel/frv_ksyms.c | 1 - arch/frv/kernel/semaphore.c | 155 ------------------- arch/h8300/kernel/Makefile | 2 +- arch/h8300/kernel/h8300_ksyms.c | 1 - arch/h8300/kernel/semaphore.c | 132 ---------------- arch/ia64/kernel/Makefile | 2 +- arch/ia64/kernel/ia64_ksyms.c | 6 - arch/ia64/kernel/semaphore.c | 165 -------------------- arch/m32r/kernel/Makefile | 2 +- arch/m32r/kernel/m32r_ksyms.c | 5 - arch/m32r/kernel/semaphore.c | 185 ---------------------- arch/m68k/kernel/Makefile | 2 +- arch/m68k/kernel/m68k_ksyms.c | 6 - arch/m68k/kernel/semaphore.c | 132 ---------------- arch/m68k/lib/Makefile | 2 +- arch/m68k/lib/semaphore.S | 53 ------- arch/m68knommu/kernel/Makefile | 2 +- arch/m68knommu/kernel/m68k_ksyms.c | 6 - arch/m68knommu/kernel/semaphore.c | 133 ---------------- arch/m68knommu/lib/Makefile | 2 +- arch/m68knommu/lib/semaphore.S | 66 -------- arch/mips/kernel/Makefile | 2 +- arch/mips/kernel/semaphore.c | 168 -------------------- arch/mn10300/kernel/Makefile | 2 +- arch/mn10300/kernel/semaphore.c | 149 ------------------ arch/parisc/kernel/Makefile | 2 +- arch/parisc/kernel/parisc_ksyms.c | 5 - arch/parisc/kernel/semaphore.c | 102 ------------- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 1 - arch/powerpc/kernel/semaphore.c | 135 ---------------- arch/ppc/kernel/semaphore.c | 131 ---------------- arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/s390_ksyms.c | 7 - arch/s390/kernel/semaphore.c | 108 ------------- arch/sh/kernel/Makefile_32 | 2 +- arch/sh/kernel/Makefile_64 | 2 +- arch/sh/kernel/semaphore.c | 139 ----------------- arch/sh/kernel/sh_ksyms_32.c | 7 - arch/sh/kernel/sh_ksyms_64.c | 4 - arch/sparc/kernel/Makefile | 2 +- arch/sparc/kernel/semaphore.c | 155 ------------------- arch/sparc/kernel/sparc_ksyms.c | 5 - arch/sparc64/kernel/Makefile | 2 +- arch/sparc64/kernel/semaphore.c | 254 ------------------------------- arch/sparc64/kernel/sparc64_ksyms.c | 6 - arch/um/Kconfig.i386 | 4 - arch/um/Kconfig.x86_64 | 4 - arch/um/sys-i386/ksyms.c | 12 -- arch/um/sys-ppc/Makefile | 8 +- arch/um/sys-x86_64/ksyms.c | 13 +- arch/v850/kernel/Makefile | 2 +- arch/v850/kernel/semaphore.c | 166 -------------------- arch/v850/kernel/v850_ksyms.c | 7 - arch/x86/Kconfig | 3 - arch/x86/kernel/i386_ksyms_32.c | 5 - arch/x86/kernel/x8664_ksyms_64.c | 6 - arch/x86/lib/semaphore_32.S | 83 ---------- arch/x86/lib/thunk_64.S | 5 - arch/xtensa/kernel/Makefile | 2 +- arch/xtensa/kernel/semaphore.c | 226 --------------------------- arch/xtensa/kernel/xtensa_ksyms.c | 9 -- include/asm-alpha/semaphore.h | 150 +----------------- include/asm-arm/semaphore-helper.h | 84 ---------- include/asm-arm/semaphore.h | 99 +----------- include/asm-avr32/semaphore.h | 109 +------------ include/asm-blackfin/semaphore-helper.h | 82 ---------- include/asm-blackfin/semaphore.h | 106 +------------ include/asm-cris/semaphore-helper.h | 78 ---------- include/asm-cris/semaphore.h | 134 +--------------- 
include/asm-frv/semaphore.h | 156 +------------------ include/asm-h8300/semaphore-helper.h | 85 ----------- include/asm-h8300/semaphore.h | 191 +---------------------- include/asm-ia64/semaphore.h | 100 +----------- include/asm-m32r/semaphore.h | 145 +----------------- include/asm-m68k/semaphore-helper.h | 142 ----------------- include/asm-m68k/semaphore.h | 164 +------------------- include/asm-m68knommu/semaphore-helper.h | 82 ---------- include/asm-m68knommu/semaphore.h | 154 +------------------ include/asm-mips/semaphore.h | 109 +------------ include/asm-mn10300/semaphore.h | 170 +-------------------- include/asm-parisc/semaphore-helper.h | 89 ----------- include/asm-parisc/semaphore.h | 146 +----------------- include/asm-powerpc/semaphore.h | 95 +----------- include/asm-s390/semaphore.h | 108 +------------ include/asm-sh/semaphore-helper.h | 89 ----------- include/asm-sh/semaphore.h | 116 +------------- include/asm-sparc/semaphore.h | 193 +---------------------- include/asm-sparc64/semaphore.h | 54 +------ include/asm-um/semaphore.h | 7 +- include/asm-v850/semaphore.h | 85 +---------- include/asm-x86/semaphore.h | 6 +- include/asm-x86/semaphore_32.h | 175 --------------------- include/asm-x86/semaphore_64.h | 180 ---------------------- include/asm-xtensa/semaphore.h | 100 +----------- include/linux/semaphore.h | 77 ++++++++++ kernel/Makefile | 2 +- kernel/semaphore.c | 187 +++++++++++++++++++++++ lib/Makefile | 1 - lib/semaphore-sleepers.c | 176 --------------------- 113 files changed, 314 insertions(+), 7679 deletions(-) delete mode 100644 arch/alpha/kernel/semaphore.c delete mode 100644 arch/arm/kernel/semaphore.c delete mode 100644 arch/avr32/kernel/semaphore.c delete mode 100644 arch/cris/kernel/semaphore.c delete mode 100644 arch/frv/kernel/semaphore.c delete mode 100644 arch/h8300/kernel/semaphore.c delete mode 100644 arch/ia64/kernel/semaphore.c delete mode 100644 arch/m32r/kernel/semaphore.c delete mode 100644 arch/m68k/kernel/semaphore.c delete mode 100644 arch/m68k/lib/semaphore.S delete mode 100644 arch/m68knommu/kernel/semaphore.c delete mode 100644 arch/m68knommu/lib/semaphore.S delete mode 100644 arch/mips/kernel/semaphore.c delete mode 100644 arch/mn10300/kernel/semaphore.c delete mode 100644 arch/parisc/kernel/semaphore.c delete mode 100644 arch/powerpc/kernel/semaphore.c delete mode 100644 arch/ppc/kernel/semaphore.c delete mode 100644 arch/s390/kernel/semaphore.c delete mode 100644 arch/sh/kernel/semaphore.c delete mode 100644 arch/sparc/kernel/semaphore.c delete mode 100644 arch/sparc64/kernel/semaphore.c delete mode 100644 arch/v850/kernel/semaphore.c delete mode 100644 arch/xtensa/kernel/semaphore.c delete mode 100644 include/asm-arm/semaphore-helper.h delete mode 100644 include/asm-blackfin/semaphore-helper.h delete mode 100644 include/asm-cris/semaphore-helper.h delete mode 100644 include/asm-h8300/semaphore-helper.h delete mode 100644 include/asm-m68k/semaphore-helper.h delete mode 100644 include/asm-m68knommu/semaphore-helper.h delete mode 100644 include/asm-parisc/semaphore-helper.h delete mode 100644 include/asm-sh/semaphore-helper.h delete mode 100644 include/asm-x86/semaphore_32.h delete mode 100644 include/asm-x86/semaphore_64.h create mode 100644 include/linux/semaphore.h create mode 100644 kernel/semaphore.c delete mode 100644 lib/semaphore-sleepers.c (limited to 'kernel') diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile index dccf05245d4..ac706c1d7ad 100644 --- a/arch/alpha/kernel/Makefile +++ b/arch/alpha/kernel/Makefile @@ -7,7 
+7,7 @@ EXTRA_AFLAGS := $(KBUILD_CFLAGS) EXTRA_CFLAGS := -Werror -Wno-sign-compare obj-y := entry.o traps.o process.o init_task.o osf_sys.o irq.o \ - irq_alpha.o signal.o setup.o ptrace.o time.o semaphore.o \ + irq_alpha.o signal.o setup.o ptrace.o time.o \ alpha_ksyms.o systbls.o err_common.o io.o obj-$(CONFIG_VGA_HOSE) += console.o diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c index e9762a33b04..d96e742d4dc 100644 --- a/arch/alpha/kernel/alpha_ksyms.c +++ b/arch/alpha/kernel/alpha_ksyms.c @@ -77,15 +77,6 @@ EXPORT_SYMBOL(__do_clear_user); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__strnlen_user); -/* Semaphore helper functions. */ -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__up_wakeup); -EXPORT_SYMBOL(down); -EXPORT_SYMBOL(down_interruptible); -EXPORT_SYMBOL(down_trylock); -EXPORT_SYMBOL(up); - /* * SMP-specific symbols. */ diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c deleted file mode 100644 index 8d2982aa1b8..00000000000 --- a/arch/alpha/kernel/semaphore.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Alpha semaphore implementation. - * - * (C) Copyright 1996 Linus Torvalds - * (C) Copyright 1999, 2000 Richard Henderson - */ - -#include -#include -#include - -/* - * This is basically the PPC semaphore scheme ported to use - * the Alpha ll/sc sequences, so see the PPC code for - * credits. - */ - -/* - * Atomically update sem->count. - * This does the equivalent of the following: - * - * old_count = sem->count; - * tmp = MAX(old_count, 0) + incr; - * sem->count = tmp; - * return old_count; - */ -static inline int __sem_update_count(struct semaphore *sem, int incr) -{ - long old_count, tmp = 0; - - __asm__ __volatile__( - "1: ldl_l %0,%2\n" - " cmovgt %0,%0,%1\n" - " addl %1,%3,%1\n" - " stl_c %1,%2\n" - " beq %1,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) - : "Ir" (incr), "1" (tmp), "m" (sem->count)); - - return old_count; -} - -/* - * Perform the "down" function. Return zero for semaphore acquired, - * return negative for signalled out of the function. - * - * If called from down, the return is ignored and the wait loop is - * not interruptible. This means that a task waiting on a semaphore - * using "down()" cannot be killed until someone does an "up()" on - * the semaphore. - * - * If called from down_interruptible, the return value gets checked - * upon return. If the return value is negative then the task continues - * with the negative value in the return register (it can be tested by - * the caller). - * - * Either form may be used in conjunction with "up()". - */ - -void __sched -__down_failed(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): down failed(%p)\n", - tsk->comm, task_pid_nr(tsk), sem); -#endif - - tsk->state = TASK_UNINTERRUPTIBLE; - wmb(); - add_wait_queue_exclusive(&sem->wait, &wait); - - /* - * Try to get the semaphore. If the count is > 0, then we've - * got the semaphore; we decrement count and exit the loop. - * If the count is 0 or negative, we set it to -1, indicating - * that we are asleep, and then sleep. 
- */ - while (__sem_update_count(sem, -1) <= 0) { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - tsk->state = TASK_RUNNING; - - /* - * If there are any more sleepers, wake one of them up so - * that it can either get the semaphore, or set count to -1 - * indicating that there are still processes sleeping. - */ - wake_up(&sem->wait); - -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): down acquired(%p)\n", - tsk->comm, task_pid_nr(tsk), sem); -#endif -} - -int __sched -__down_failed_interruptible(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - long ret = 0; - -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): down failed(%p)\n", - tsk->comm, task_pid_nr(tsk), sem); -#endif - - tsk->state = TASK_INTERRUPTIBLE; - wmb(); - add_wait_queue_exclusive(&sem->wait, &wait); - - while (__sem_update_count(sem, -1) <= 0) { - if (signal_pending(current)) { - /* - * A signal is pending - give up trying. - * Set sem->count to 0 if it is negative, - * since we are no longer sleeping. - */ - __sem_update_count(sem, 0); - ret = -EINTR; - break; - } - schedule(); - set_task_state(tsk, TASK_INTERRUPTIBLE); - } - - remove_wait_queue(&sem->wait, &wait); - tsk->state = TASK_RUNNING; - wake_up(&sem->wait); - -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): down %s(%p)\n", - current->comm, task_pid_nr(current), - (ret < 0 ? "interrupted" : "acquired"), sem); -#endif - return ret; -} - -void -__up_wakeup(struct semaphore *sem) -{ - /* - * Note that we incremented count in up() before we came here, - * but that was ineffective since the result was <= 0, and - * any negative value of count is equivalent to 0. - * This ends up setting count to 1, unless count is now > 0 - * (i.e. because some other cpu has called up() in the meantime), - * in which case we just increment count. - */ - __sem_update_count(sem, 1); - wake_up(&sem->wait); -} - -void __sched -down(struct semaphore *sem) -{ -#ifdef WAITQUEUE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): down(%p) from %p\n", - current->comm, task_pid_nr(current), sem, - atomic_read(&sem->count), __builtin_return_address(0)); -#endif - __down(sem); -} - -int __sched -down_interruptible(struct semaphore *sem) -{ -#ifdef WAITQUEUE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): down(%p) from %p\n", - current->comm, task_pid_nr(current), sem, - atomic_read(&sem->count), __builtin_return_address(0)); -#endif - return __down_interruptible(sem); -} - -int -down_trylock(struct semaphore *sem) -{ - int ret; - -#ifdef WAITQUEUE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif - - ret = __down_trylock(sem); - -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): down_trylock %s from %p\n", - current->comm, task_pid_nr(current), - ret ? "failed" : "acquired", - __builtin_return_address(0)); -#endif - - return ret; -} - -void -up(struct semaphore *sem) -{ -#ifdef WAITQUEUE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif -#ifdef CONFIG_DEBUG_SEMAPHORE - printk("%s(%d): up(%p) from %p\n", - current->comm, task_pid_nr(current), sem, - atomic_read(&sem->count), __builtin_return_address(0)); -#endif - __up(sem); -} diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 00d44c6fbfe..6235f72a14f 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -7,7 +7,7 @@ AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) # Object file lists. 
obj-y := compat.o entry-armv.o entry-common.o irq.o \ - process.o ptrace.o semaphore.o setup.o signal.o \ + process.o ptrace.o setup.o signal.o \ sys_arm.o stacktrace.o time.o traps.o obj-$(CONFIG_ISA_DMA_API) += dma.o diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c deleted file mode 100644 index 981fe5c6ccb..00000000000 --- a/arch/arm/kernel/semaphore.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * ARM semaphore implementation, taken from - * - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Modified for ARM by Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include -#include -#include -#include - -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is - * protected by the semaphore spinlock. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -static DEFINE_SPINLOCK(semaphore_lock); - -void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - tsk->state = TASK_UNINTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - spin_lock_irq(&semaphore_lock); - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irq(&semaphore_lock); - - schedule(); - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irq(&semaphore_lock); - } - spin_unlock_irq(&semaphore_lock); - remove_wait_queue(&sem->wait, &wait); - tsk->state = TASK_RUNNING; - wake_up(&sem->wait); -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - spin_lock_irq(&semaphore_lock); - sem->sleepers ++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. 
- */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock. The - * "-1" is because we're still hoping to get - * the lock. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irq(&semaphore_lock); - - schedule(); - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irq(&semaphore_lock); - } - spin_unlock_irq(&semaphore_lock); - tsk->state = TASK_RUNNING; - remove_wait_queue(&sem->wait, &wait); - wake_up(&sem->wait); - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. - */ -int __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&semaphore_lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock. - */ - if (!atomic_add_negative(sleepers, &sem->count)) - wake_up(&sem->wait); - - spin_unlock_irqrestore(&semaphore_lock, flags); - return 1; -} - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * ip contains the semaphore pointer on entry. Save the C-clobbered - * registers (r0 to r3 and lr), but not ip, as we use it as a return - * value in some cases.. - * To remain AAPCS compliant (64-bit stack align) we save r4 as well. 
- */ -asm(" .section .sched.text,\"ax\",%progbits \n\ - .align 5 \n\ - .globl __down_failed \n\ -__down_failed: \n\ - stmfd sp!, {r0 - r4, lr} \n\ - mov r0, ip \n\ - bl __down \n\ - ldmfd sp!, {r0 - r4, pc} \n\ - \n\ - .align 5 \n\ - .globl __down_interruptible_failed \n\ -__down_interruptible_failed: \n\ - stmfd sp!, {r0 - r4, lr} \n\ - mov r0, ip \n\ - bl __down_interruptible \n\ - mov ip, r0 \n\ - ldmfd sp!, {r0 - r4, pc} \n\ - \n\ - .align 5 \n\ - .globl __down_trylock_failed \n\ -__down_trylock_failed: \n\ - stmfd sp!, {r0 - r4, lr} \n\ - mov r0, ip \n\ - bl __down_trylock \n\ - mov ip, r0 \n\ - ldmfd sp!, {r0 - r4, pc} \n\ - \n\ - .align 5 \n\ - .globl __up_wakeup \n\ -__up_wakeup: \n\ - stmfd sp!, {r0 - r4, lr} \n\ - mov r0, ip \n\ - bl __up \n\ - ldmfd sp!, {r0 - r4, pc} \n\ - "); - -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_interruptible_failed); -EXPORT_SYMBOL(__down_trylock_failed); -EXPORT_SYMBOL(__up_wakeup); diff --git a/arch/avr32/kernel/Makefile b/arch/avr32/kernel/Makefile index e4b6d122b03..18229d0d186 100644 --- a/arch/avr32/kernel/Makefile +++ b/arch/avr32/kernel/Makefile @@ -6,7 +6,7 @@ extra-y := head.o vmlinux.lds obj-$(CONFIG_SUBARCH_AVR32B) += entry-avr32b.o obj-y += syscall_table.o syscall-stubs.o irq.o -obj-y += setup.o traps.o semaphore.o ocd.o ptrace.o +obj-y += setup.o traps.o ocd.o ptrace.o obj-y += signal.o sys_avr32.o process.o time.o obj-y += init_task.o switch_to.o cpu.o obj-$(CONFIG_MODULES) += module.o avr32_ksyms.o diff --git a/arch/avr32/kernel/semaphore.c b/arch/avr32/kernel/semaphore.c deleted file mode 100644 index 1e2705a0501..00000000000 --- a/arch/avr32/kernel/semaphore.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * AVR32 sempahore implementation. - * - * Copyright (C) 2004-2006 Atmel Corporation - * - * Based on linux/arch/i386/kernel/semaphore.c - * Copyright (C) 1999 Linus Torvalds - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include - -#include -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. 
- */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} -EXPORT_SYMBOL(__up); - -void __sched __down(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (atomic_add_return(sleepers - 1, &sem->count) >= 0) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} -EXPORT_SYMBOL(__down); - -int __sched __down_interruptible(struct semaphore *sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into the trylock - * failure case - we won't be sleeping, and we can't - * get the lock as it has contention. Just correct the - * count and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. 
- */ - if (atomic_add_return(sleepers - 1, &sem->count) >= 0) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} -EXPORT_SYMBOL(__down_interruptible); diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 589c6aca480..2dd1f300a5c 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -31,10 +31,6 @@ config ZONE_DMA bool default y -config SEMAPHORE_SLEEPERS - bool - default y - config GENERIC_FIND_NEXT_BIT bool default y diff --git a/arch/blackfin/kernel/bfin_ksyms.c b/arch/blackfin/kernel/bfin_ksyms.c index 0bfbb269e35..053edff6c0d 100644 --- a/arch/blackfin/kernel/bfin_ksyms.c +++ b/arch/blackfin/kernel/bfin_ksyms.c @@ -42,11 +42,6 @@ EXPORT_SYMBOL(ip_fast_csum); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_trylock); -EXPORT_SYMBOL(__down_interruptible); - EXPORT_SYMBOL(is_in_rom); EXPORT_SYMBOL(bfin_return_from_exception); diff --git a/arch/cris/kernel/Makefile b/arch/cris/kernel/Makefile index c8e8ea57098..ee7bcd4d20b 100644 --- a/arch/cris/kernel/Makefile +++ b/arch/cris/kernel/Makefile @@ -5,8 +5,7 @@ extra-y := vmlinux.lds -obj-y := process.o traps.o irq.o ptrace.o setup.o \ - time.o sys_cris.o semaphore.o +obj-y := process.o traps.o irq.o ptrace.o setup.o time.o sys_cris.o obj-$(CONFIG_MODULES) += crisksyms.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/cris/kernel/crisksyms.c b/arch/cris/kernel/crisksyms.c index 62f0e752915..7ac000f6a88 100644 --- a/arch/cris/kernel/crisksyms.c +++ b/arch/cris/kernel/crisksyms.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -49,12 +48,6 @@ EXPORT_SYMBOL(__negdi2); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); -/* Semaphore functions */ -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__down_trylock); - /* Userspace access functions */ EXPORT_SYMBOL(__copy_user_zeroing); EXPORT_SYMBOL(__copy_user); diff --git a/arch/cris/kernel/semaphore.c b/arch/cris/kernel/semaphore.c deleted file mode 100644 index f137a439041..00000000000 --- a/arch/cris/kernel/semaphore.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Generic semaphore code. Buyer beware. Do your own - * specific changes in - */ - -#include -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to sleep, while the "waking" variable is - * incremented when the "up()" code goes to wake up waiting - * processes. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * waking_non_zero() (from asm/semaphore.h) must execute - * atomically. - * - * When __up() is called, the count was negative before - * incrementing it, and we need to wake up somebody. - * - * This routine adds one to the count of processes that need to - * wake up and exit. ALL waiting processes actually wake up but - * only the one that gets to the "waking" field first will gate - * through and acquire the semaphore. The others will go back - * to sleep. 
- * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ -void __up(struct semaphore *sem) -{ - wake_one_more(sem); - wake_up(&sem->wait); -} - -/* - * Perform the "down" function. Return zero for semaphore acquired, - * return negative for signalled out of the function. - * - * If called from __down, the return is ignored and the wait loop is - * not interruptible. This means that a task waiting on a semaphore - * using "down()" cannot be killed until someone does an "up()" on - * the semaphore. - * - * If called from __down_interruptible, the return value gets checked - * upon return. If the return value is negative then the task continues - * with the negative value in the return register (it can be tested by - * the caller). - * - * Either form may be used in conjunction with "up()". - * - */ - -#define DOWN_VAR \ - struct task_struct *tsk = current; \ - wait_queue_t wait; \ - init_waitqueue_entry(&wait, tsk); - -#define DOWN_HEAD(task_state) \ - \ - \ - tsk->state = (task_state); \ - add_wait_queue(&sem->wait, &wait); \ - \ - /* \ - * Ok, we're set up. sem->count is known to be less than zero \ - * so we must wait. \ - * \ - * We can let go the lock for purposes of waiting. \ - * We re-acquire it after awaking so as to protect \ - * all semaphore operations. \ - * \ - * If "up()" is called before we call waking_non_zero() then \ - * we will catch it right away. If it is called later then \ - * we will have to go through a wakeup cycle to catch it. \ - * \ - * Multiple waiters contend for the semaphore lock to see \ - * who gets to gate through and who has to wait some more. 
\ - */ \ - for (;;) { - -#define DOWN_TAIL(task_state) \ - tsk->state = (task_state); \ - } \ - tsk->state = TASK_RUNNING; \ - remove_wait_queue(&sem->wait, &wait); - -void __sched __down(struct semaphore * sem) -{ - DOWN_VAR - DOWN_HEAD(TASK_UNINTERRUPTIBLE) - if (waking_non_zero(sem)) - break; - schedule(); - DOWN_TAIL(TASK_UNINTERRUPTIBLE) -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int ret = 0; - DOWN_VAR - DOWN_HEAD(TASK_INTERRUPTIBLE) - - ret = waking_non_zero_interruptible(sem, tsk); - if (ret) - { - if (ret == 1) - /* ret != 0 only if we get interrupted -arca */ - ret = 0; - break; - } - schedule(); - DOWN_TAIL(TASK_INTERRUPTIBLE) - return ret; -} - -int __down_trylock(struct semaphore * sem) -{ - return waking_non_zero_trylock(sem); -} diff --git a/arch/frv/kernel/Makefile b/arch/frv/kernel/Makefile index e8f73ed28b5..c36f70b6699 100644 --- a/arch/frv/kernel/Makefile +++ b/arch/frv/kernel/Makefile @@ -9,7 +9,7 @@ extra-y:= head.o init_task.o vmlinux.lds obj-y := $(heads-y) entry.o entry-table.o break.o switch_to.o kernel_thread.o \ kernel_execve.o process.o traps.o ptrace.o signal.o dma.o \ - sys_frv.o time.o semaphore.o setup.o frv_ksyms.o \ + sys_frv.o time.o setup.o frv_ksyms.o \ debug-stub.o irq.o sleep.o uaccess.o obj-$(CONFIG_GDBSTUB) += gdb-stub.o gdb-io.o diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c index f772704b3d2..0316b3c50ef 100644 --- a/arch/frv/kernel/frv_ksyms.c +++ b/arch/frv/kernel/frv_ksyms.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/frv/kernel/semaphore.c b/arch/frv/kernel/semaphore.c deleted file mode 100644 index 7ee3a147b47..00000000000 --- a/arch/frv/kernel/semaphore.c +++ /dev/null @@ -1,155 +0,0 @@ -/* semaphore.c: FR-V semaphores - * - * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - Derived from lib/rwsem-spinlock.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include - -struct sem_waiter { - struct list_head list; - struct task_struct *task; -}; - -#ifdef CONFIG_DEBUG_SEMAPHORE -void semtrace(struct semaphore *sem, const char *str) -{ - if (sem->debug) - printk("[%d] %s({%d,%d})\n", - current->pid, - str, - sem->counter, - list_empty(&sem->wait_list) ? 
0 : 1); -} -#else -#define semtrace(SEM,STR) do { } while(0) -#endif - -/* - * wait for a token to be granted from a semaphore - * - entered with lock held and interrupts disabled - */ -void __down(struct semaphore *sem, unsigned long flags) -{ - struct task_struct *tsk = current; - struct sem_waiter waiter; - - semtrace(sem, "Entering __down"); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the semaphore */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - for (;;) { - if (list_empty(&waiter.list)) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; - semtrace(sem, "Leaving __down"); -} - -EXPORT_SYMBOL(__down); - -/* - * interruptibly wait for a token to be granted from a semaphore - * - entered with lock held and interrupts disabled - */ -int __down_interruptible(struct semaphore *sem, unsigned long flags) -{ - struct task_struct *tsk = current; - struct sem_waiter waiter; - int ret; - - semtrace(sem,"Entering __down_interruptible"); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - set_task_state(tsk, TASK_INTERRUPTIBLE); - - spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the semaphore */ - ret = 0; - for (;;) { - if (list_empty(&waiter.list)) - break; - if (unlikely(signal_pending(current))) - goto interrupted; - schedule(); - set_task_state(tsk, TASK_INTERRUPTIBLE); - } - - out: - tsk->state = TASK_RUNNING; - semtrace(sem, "Leaving __down_interruptible"); - return ret; - - interrupted: - spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&waiter.list)) { - list_del(&waiter.list); - ret = -EINTR; - } - - spin_unlock_irqrestore(&sem->wait_lock, flags); - if (ret == -EINTR) - put_task_struct(current); - goto out; -} - -EXPORT_SYMBOL(__down_interruptible); - -/* - * release a single token back to a semaphore - * - entered with lock held and interrupts disabled - */ -void __up(struct semaphore *sem) -{ - struct task_struct *tsk; - struct sem_waiter *waiter; - - semtrace(sem,"Entering __up"); - - /* grant the token to the process at the front of the queue */ - waiter = list_entry(sem->wait_list.next, struct sem_waiter, list); - - /* We must be careful not to touch 'waiter' after we set ->task = NULL. - * It is allocated on the waiter's stack and may become invalid at - * any time after that point (due to a wakeup from another source). 
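The warning just above is the heart of this handoff scheme: the waiter's bookkeeping lives on the sleeping task's stack, so once __up() publishes waiter->task = NULL the sleeper may return from schedule() and unwind that stack at any moment. A condensed userspace model of the waker-side ordering (illustrative only; wake_task() and struct task_model are stand-ins, and the real code also pins the task with get/put_task_struct()):

#include <stdatomic.h>

struct task_model;
extern void wake_task(struct task_model *t);	/* stands in for wake_up_process() */

struct waiter_model {
	_Atomic(struct task_model *) task;	/* NULL means "token granted" */
};

static void grant_token(struct waiter_model *w)
{
	struct task_model *t = atomic_load(&w->task);

	/* Publish the grant last: after this store the sleeper may run
	 * and free its stack frame, so *w must not be touched again. */
	atomic_store(&w->task, NULL);
	wake_task(t);
}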
- */ - list_del_init(&waiter->list); - tsk = waiter->task; - mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - - semtrace(sem,"Leaving __up"); -} - -EXPORT_SYMBOL(__up); diff --git a/arch/h8300/kernel/Makefile b/arch/h8300/kernel/Makefile index 874f6aefee6..6c248c3c5c3 100644 --- a/arch/h8300/kernel/Makefile +++ b/arch/h8300/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := vmlinux.lds obj-y := process.o traps.o ptrace.o irq.o \ - sys_h8300.o time.o semaphore.o signal.o \ + sys_h8300.o time.o signal.o \ setup.o gpio.o init_task.o syscalls.o \ entry.o diff --git a/arch/h8300/kernel/h8300_ksyms.c b/arch/h8300/kernel/h8300_ksyms.c index d1b15267ac8..6866bd9c7fb 100644 --- a/arch/h8300/kernel/h8300_ksyms.c +++ b/arch/h8300/kernel/h8300_ksyms.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/h8300/kernel/semaphore.c b/arch/h8300/kernel/semaphore.c deleted file mode 100644 index d12cbbfe6eb..00000000000 --- a/arch/h8300/kernel/semaphore.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Generic semaphore code. Buyer beware. Do your own - * specific changes in - */ - -#include -#include -#include - -#ifndef CONFIG_RMW_INSNS -spinlock_t semaphore_wake_lock; -#endif - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to sleep, while the "waking" variable is - * incremented when the "up()" code goes to wake up waiting - * processes. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * waking_non_zero() (from asm/semaphore.h) must execute - * atomically. - * - * When __up() is called, the count was negative before - * incrementing it, and we need to wake up somebody. - * - * This routine adds one to the count of processes that need to - * wake up and exit. ALL waiting processes actually wake up but - * only the one that gets to the "waking" field first will gate - * through and acquire the semaphore. The others will go back - * to sleep. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ -void __up(struct semaphore *sem) -{ - wake_one_more(sem); - wake_up(&sem->wait); -} - -/* - * Perform the "down" function. Return zero for semaphore acquired, - * return negative for signalled out of the function. - * - * If called from __down, the return is ignored and the wait loop is - * not interruptible. This means that a task waiting on a semaphore - * using "down()" cannot be killed until someone does an "up()" on - * the semaphore. - * - * If called from __down_interruptible, the return value gets checked - * upon return. If the return value is negative then the task continues - * with the negative value in the return register (it can be tested by - * the caller). - * - * Either form may be used in conjunction with "up()". - * - */ - - -#define DOWN_HEAD(task_state) \ - \ - \ - current->state = (task_state); \ - add_wait_queue(&sem->wait, &wait); \ - \ - /* \ - * Ok, we're set up. sem->count is known to be less than zero \ - * so we must wait. \ - * \ - * We can let go the lock for purposes of waiting. 
\ - * We re-acquire it after awaking so as to protect \ - * all semaphore operations. \ - * \ - * If "up()" is called before we call waking_non_zero() then \ - * we will catch it right away. If it is called later then \ - * we will have to go through a wakeup cycle to catch it. \ - * \ - * Multiple waiters contend for the semaphore lock to see \ - * who gets to gate through and who has to wait some more. \ - */ \ - for (;;) { - -#define DOWN_TAIL(task_state) \ - current->state = (task_state); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&sem->wait, &wait); - -void __sched __down(struct semaphore * sem) -{ - DECLARE_WAITQUEUE(wait, current); - - DOWN_HEAD(TASK_UNINTERRUPTIBLE) - if (waking_non_zero(sem)) - break; - schedule(); - DOWN_TAIL(TASK_UNINTERRUPTIBLE) -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - DOWN_HEAD(TASK_INTERRUPTIBLE) - - ret = waking_non_zero_interruptible(sem, current); - if (ret) - { - if (ret == 1) - /* ret != 0 only if we get interrupted -arca */ - ret = 0; - break; - } - schedule(); - DOWN_TAIL(TASK_INTERRUPTIBLE) - return ret; -} - -int __down_trylock(struct semaphore * sem) -{ - return waking_non_zero_trylock(sem); -} diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 33e5a598672..13fd10e8699 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ - salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ + salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 8e7193d5552..6da1f20d737 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -19,12 +19,6 @@ EXPORT_SYMBOL_GPL(empty_zero_page); EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ EXPORT_SYMBOL(csum_ipv6_magic); -#include -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__down_trylock); -EXPORT_SYMBOL(__up); - #include EXPORT_SYMBOL(clear_page); diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c deleted file mode 100644 index 2724ef3fbae..00000000000 --- a/arch/ia64/kernel/semaphore.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * IA-64 semaphore implementation (derived from x86 version). - * - * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co - * David Mosberger-Tang - */ - -/* - * Semaphores are implemented using a two-way counter: The "count" - * variable is decremented for each process that tries to acquire the - * semaphore, while the "sleepers" variable is a count of such - * acquires. - * - * Notably, the inline "up()" and "down()" functions can efficiently - * test if they need to do any extra work (up needs to do something - * only if count was negative before the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is contention - * on the lock, and as such all this is the "non-critical" part of the - * whole semaphore business. The critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. 
- */ -#include -#include - -#include -#include - -/* - * Logic: - * - Only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - When we go from a non-negative count to a negative do we - * (a) synchronize with the "sleepers" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void -__up (struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -void __sched __down (struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -int __sched __down_interruptible (struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers ++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for having decremented the - * count. - */ -int -__down_trylock (struct semaphore *sem) -{ - unsigned long flags; - int sleepers; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. 
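The 'sleepers' bookkeeping used by __down() and __down_interruptible() above is easier to see in isolation. Each task entering the slow path has already taken 1 off count; inside the loop it folds "everybody else" (sleepers - 1) back into count under the waitqueue spinlock, and only the boundary matters: a non-negative result means a wakeup token was available. A plain-C model of that single step (no atomicity, purely illustrative):

/* Models one pass of the loop above: returns 1 if the semaphore was
 * acquired, 0 if the caller should go (back) to sleep. */
static int sleepers_collapse(int *count, int *sleepers)
{
	int s = *sleepers;

	*count += s - 1;	/* models atomic_add_negative(sleepers - 1, &sem->count) */
	if (*count >= 0) {
		*sleepers = 0;
		return 1;	/* got it */
	}
	*sleepers = 1;		/* just us -- see the -1 above */
	return 0;		/* sleep and retry after the next wakeup */
}

For a single sleeping waiter the state is count == -1, sleepers == 1; up() raises count to 0 and wakes it, and the next pass adds sleepers - 1 == 0, sees a non-negative count, and takes the semaphore with sleepers reset to 0.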
- */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} diff --git a/arch/m32r/kernel/Makefile b/arch/m32r/kernel/Makefile index e97e26e87c9..09200d4886e 100644 --- a/arch/m32r/kernel/Makefile +++ b/arch/m32r/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o entry.o traps.o align.o irq.o setup.o time.o \ - m32r_ksyms.o sys_m32r.o semaphore.o signal.o ptrace.o + m32r_ksyms.o sys_m32r.o signal.o ptrace.o obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c index 41a4c95e06d..e6709fe950b 100644 --- a/arch/m32r/kernel/m32r_ksyms.c +++ b/arch/m32r/kernel/m32r_ksyms.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include @@ -22,10 +21,6 @@ EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down_trylock); /* Networking helper routines. */ /* Delay loops */ diff --git a/arch/m32r/kernel/semaphore.c b/arch/m32r/kernel/semaphore.c deleted file mode 100644 index 940c2d37cfd..00000000000 --- a/arch/m32r/kernel/semaphore.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * linux/arch/m32r/semaphore.c - * orig : i386 2.6.4 - * - * M32R semaphore implementation. - * - * Copyright (c) 2002 - 2004 Hitoshi Yamamoto - */ - -/* - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ -#include -#include -#include -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. 
- */ - -asmlinkage void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -asmlinkage void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -asmlinkage int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. - */ -asmlinkage int __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. 
- */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} diff --git a/arch/m68k/kernel/Makefile b/arch/m68k/kernel/Makefile index a806208c7fb..7a62a718143 100644 --- a/arch/m68k/kernel/Makefile +++ b/arch/m68k/kernel/Makefile @@ -10,7 +10,7 @@ endif extra-y += vmlinux.lds obj-y := entry.o process.o traps.o ints.o signal.o ptrace.o module.o \ - sys_m68k.o time.o semaphore.o setup.o m68k_ksyms.o devres.o + sys_m68k.o time.o setup.o m68k_ksyms.o devres.o devres-y = ../../../kernel/irq/devres.o diff --git a/arch/m68k/kernel/m68k_ksyms.c b/arch/m68k/kernel/m68k_ksyms.c index 6fc69c74fe2..d900e77e536 100644 --- a/arch/m68k/kernel/m68k_ksyms.c +++ b/arch/m68k/kernel/m68k_ksyms.c @@ -1,5 +1,4 @@ #include -#include asmlinkage long long __ashldi3 (long long, int); asmlinkage long long __ashrdi3 (long long, int); @@ -15,8 +14,3 @@ EXPORT_SYMBOL(__ashrdi3); EXPORT_SYMBOL(__lshrdi3); EXPORT_SYMBOL(__muldi3); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); - diff --git a/arch/m68k/kernel/semaphore.c b/arch/m68k/kernel/semaphore.c deleted file mode 100644 index d12cbbfe6eb..00000000000 --- a/arch/m68k/kernel/semaphore.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Generic semaphore code. Buyer beware. Do your own - * specific changes in - */ - -#include -#include -#include - -#ifndef CONFIG_RMW_INSNS -spinlock_t semaphore_wake_lock; -#endif - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to sleep, while the "waking" variable is - * incremented when the "up()" code goes to wake up waiting - * processes. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * waking_non_zero() (from asm/semaphore.h) must execute - * atomically. - * - * When __up() is called, the count was negative before - * incrementing it, and we need to wake up somebody. - * - * This routine adds one to the count of processes that need to - * wake up and exit. ALL waiting processes actually wake up but - * only the one that gets to the "waking" field first will gate - * through and acquire the semaphore. The others will go back - * to sleep. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ -void __up(struct semaphore *sem) -{ - wake_one_more(sem); - wake_up(&sem->wait); -} - -/* - * Perform the "down" function. Return zero for semaphore acquired, - * return negative for signalled out of the function. - * - * If called from __down, the return is ignored and the wait loop is - * not interruptible. This means that a task waiting on a semaphore - * using "down()" cannot be killed until someone does an "up()" on - * the semaphore. - * - * If called from __down_interruptible, the return value gets checked - * upon return. If the return value is negative then the task continues - * with the negative value in the return register (it can be tested by - * the caller). - * - * Either form may be used in conjunction with "up()". 
- * - */ - - -#define DOWN_HEAD(task_state) \ - \ - \ - current->state = (task_state); \ - add_wait_queue(&sem->wait, &wait); \ - \ - /* \ - * Ok, we're set up. sem->count is known to be less than zero \ - * so we must wait. \ - * \ - * We can let go the lock for purposes of waiting. \ - * We re-acquire it after awaking so as to protect \ - * all semaphore operations. \ - * \ - * If "up()" is called before we call waking_non_zero() then \ - * we will catch it right away. If it is called later then \ - * we will have to go through a wakeup cycle to catch it. \ - * \ - * Multiple waiters contend for the semaphore lock to see \ - * who gets to gate through and who has to wait some more. \ - */ \ - for (;;) { - -#define DOWN_TAIL(task_state) \ - current->state = (task_state); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&sem->wait, &wait); - -void __sched __down(struct semaphore * sem) -{ - DECLARE_WAITQUEUE(wait, current); - - DOWN_HEAD(TASK_UNINTERRUPTIBLE) - if (waking_non_zero(sem)) - break; - schedule(); - DOWN_TAIL(TASK_UNINTERRUPTIBLE) -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - DOWN_HEAD(TASK_INTERRUPTIBLE) - - ret = waking_non_zero_interruptible(sem, current); - if (ret) - { - if (ret == 1) - /* ret != 0 only if we get interrupted -arca */ - ret = 0; - break; - } - schedule(); - DOWN_TAIL(TASK_INTERRUPTIBLE) - return ret; -} - -int __down_trylock(struct semaphore * sem) -{ - return waking_non_zero_trylock(sem); -} diff --git a/arch/m68k/lib/Makefile b/arch/m68k/lib/Makefile index 6bbf19f9600..a18af095cd7 100644 --- a/arch/m68k/lib/Makefile +++ b/arch/m68k/lib/Makefile @@ -5,4 +5,4 @@ EXTRA_AFLAGS := -traditional lib-y := ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ - checksum.o string.o semaphore.o uaccess.o + checksum.o string.o uaccess.o diff --git a/arch/m68k/lib/semaphore.S b/arch/m68k/lib/semaphore.S deleted file mode 100644 index 0215624c160..00000000000 --- a/arch/m68k/lib/semaphore.S +++ /dev/null @@ -1,53 +0,0 @@ -/* - * linux/arch/m68k/lib/semaphore.S - * - * Copyright (C) 1996 Linus Torvalds - * - * m68k version by Andreas Schwab - */ - -#include -#include - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. 
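For context, the "special calling sequence" mentioned above is the uncontended fast path: the arch header open-codes the count decrement inline and only branches into these thunks when the count goes negative, and the thunks exist purely to save the scratch registers the C ABI would let __down() and friends clobber. Roughly, and only as a sketch (the real fast path is inline assembly in the arch header and is not shown in this patch):

struct sem_sketch { int count; };	/* stand-in; the real count is atomic */

void __down_failed_sketch(struct sem_sketch *sem);	/* asm thunk: saves regs, calls __down() */

static inline void down_sketch(struct sem_sketch *sem)
{
	/* the real code performs an atomic decrement here */
	if (--sem->count < 0)
		__down_failed_sketch(sem);
}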
- */ -ENTRY(__down_failed) - moveml %a0/%d0/%d1,-(%sp) - movel %a1,-(%sp) - jbsr __down - movel (%sp)+,%a1 - moveml (%sp)+,%a0/%d0/%d1 - rts - -ENTRY(__down_failed_interruptible) - movel %a0,-(%sp) - movel %d1,-(%sp) - movel %a1,-(%sp) - jbsr __down_interruptible - movel (%sp)+,%a1 - movel (%sp)+,%d1 - movel (%sp)+,%a0 - rts - -ENTRY(__down_failed_trylock) - movel %a0,-(%sp) - movel %d1,-(%sp) - movel %a1,-(%sp) - jbsr __down_trylock - movel (%sp)+,%a1 - movel (%sp)+,%d1 - movel (%sp)+,%a0 - rts - -ENTRY(__up_wakeup) - moveml %a0/%d0/%d1,-(%sp) - movel %a1,-(%sp) - jbsr __up - movel (%sp)+,%a1 - moveml (%sp)+,%a0/%d0/%d1 - rts - diff --git a/arch/m68knommu/kernel/Makefile b/arch/m68knommu/kernel/Makefile index 1524b39ad63..f0eab3dedb5 100644 --- a/arch/m68knommu/kernel/Makefile +++ b/arch/m68knommu/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := vmlinux.lds obj-y += dma.o entry.o init_task.o irq.o m68k_ksyms.o process.o ptrace.o \ - semaphore.o setup.o signal.o syscalltable.o sys_m68k.o time.o traps.o + setup.o signal.o syscalltable.o sys_m68k.o time.o traps.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_COMEMPCI) += comempci.o diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c index 53fad149028..39fe0a7aec3 100644 --- a/arch/m68knommu/kernel/m68k_ksyms.c +++ b/arch/m68knommu/kernel/m68k_ksyms.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -39,11 +38,6 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); - /* * libgcc functions - functions that are used internally by the * compiler... (prototypes are not correct though, but that diff --git a/arch/m68knommu/kernel/semaphore.c b/arch/m68knommu/kernel/semaphore.c deleted file mode 100644 index bce2bc7d87c..00000000000 --- a/arch/m68knommu/kernel/semaphore.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Generic semaphore code. Buyer beware. Do your own - * specific changes in - */ - -#include -#include -#include -#include - -#ifndef CONFIG_RMW_INSNS -spinlock_t semaphore_wake_lock; -#endif - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to sleep, while the "waking" variable is - * incremented when the "up()" code goes to wake up waiting - * processes. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * waking_non_zero() (from asm/semaphore.h) must execute - * atomically. - * - * When __up() is called, the count was negative before - * incrementing it, and we need to wake up somebody. - * - * This routine adds one to the count of processes that need to - * wake up and exit. ALL waiting processes actually wake up but - * only the one that gets to the "waking" field first will gate - * through and acquire the semaphore. The others will go back - * to sleep. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ -void __up(struct semaphore *sem) -{ - wake_one_more(sem); - wake_up(&sem->wait); -} - -/* - * Perform the "down" function. 
Return zero for semaphore acquired, - * return negative for signalled out of the function. - * - * If called from __down, the return is ignored and the wait loop is - * not interruptible. This means that a task waiting on a semaphore - * using "down()" cannot be killed until someone does an "up()" on - * the semaphore. - * - * If called from __down_interruptible, the return value gets checked - * upon return. If the return value is negative then the task continues - * with the negative value in the return register (it can be tested by - * the caller). - * - * Either form may be used in conjunction with "up()". - * - */ - - -#define DOWN_HEAD(task_state) \ - \ - \ - current->state = (task_state); \ - add_wait_queue(&sem->wait, &wait); \ - \ - /* \ - * Ok, we're set up. sem->count is known to be less than zero \ - * so we must wait. \ - * \ - * We can let go the lock for purposes of waiting. \ - * We re-acquire it after awaking so as to protect \ - * all semaphore operations. \ - * \ - * If "up()" is called before we call waking_non_zero() then \ - * we will catch it right away. If it is called later then \ - * we will have to go through a wakeup cycle to catch it. \ - * \ - * Multiple waiters contend for the semaphore lock to see \ - * who gets to gate through and who has to wait some more. \ - */ \ - for (;;) { - -#define DOWN_TAIL(task_state) \ - current->state = (task_state); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&sem->wait, &wait); - -void __sched __down(struct semaphore * sem) -{ - DECLARE_WAITQUEUE(wait, current); - - DOWN_HEAD(TASK_UNINTERRUPTIBLE) - if (waking_non_zero(sem)) - break; - schedule(); - DOWN_TAIL(TASK_UNINTERRUPTIBLE) -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - DOWN_HEAD(TASK_INTERRUPTIBLE) - - ret = waking_non_zero_interruptible(sem, current); - if (ret) - { - if (ret == 1) - /* ret != 0 only if we get interrupted -arca */ - ret = 0; - break; - } - schedule(); - DOWN_TAIL(TASK_INTERRUPTIBLE) - return ret; -} - -int __down_trylock(struct semaphore * sem) -{ - return waking_non_zero_trylock(sem); -} diff --git a/arch/m68knommu/lib/Makefile b/arch/m68knommu/lib/Makefile index e051a791398..d94d709665a 100644 --- a/arch/m68knommu/lib/Makefile +++ b/arch/m68knommu/lib/Makefile @@ -4,4 +4,4 @@ lib-y := ashldi3.o ashrdi3.o lshrdi3.o \ muldi3.o mulsi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o \ - checksum.o semaphore.o memcpy.o memset.o delay.o + checksum.o memcpy.o memset.o delay.o diff --git a/arch/m68knommu/lib/semaphore.S b/arch/m68knommu/lib/semaphore.S deleted file mode 100644 index 87c74603437..00000000000 --- a/arch/m68knommu/lib/semaphore.S +++ /dev/null @@ -1,66 +0,0 @@ -/* - * linux/arch/m68k/lib/semaphore.S - * - * Copyright (C) 1996 Linus Torvalds - * - * m68k version by Andreas Schwab - * - * MAR/1999 -- modified to support ColdFire (gerg@snapgear.com) - */ - -#include -#include - -/* - * "down_failed" is called with the eventual return address - * in %a0, and the address of the semaphore in %a1. We need - * to increment the number of waiters on the semaphore, - * call "__down()", and then eventually return to try again. 
- */ -ENTRY(__down_failed) -#ifdef CONFIG_COLDFIRE - subl #12,%sp - moveml %a0/%d0/%d1,(%sp) -#else - moveml %a0/%d0/%d1,-(%sp) -#endif - movel %a1,-(%sp) - jbsr __down - movel (%sp)+,%a1 - movel (%sp)+,%d0 - movel (%sp)+,%d1 - rts - -ENTRY(__down_failed_interruptible) - movel %a0,-(%sp) - movel %d1,-(%sp) - movel %a1,-(%sp) - jbsr __down_interruptible - movel (%sp)+,%a1 - movel (%sp)+,%d1 - rts - -ENTRY(__up_wakeup) -#ifdef CONFIG_COLDFIRE - subl #12,%sp - moveml %a0/%d0/%d1,(%sp) -#else - moveml %a0/%d0/%d1,-(%sp) -#endif - movel %a1,-(%sp) - jbsr __up - movel (%sp)+,%a1 - movel (%sp)+,%d0 - movel (%sp)+,%d1 - rts - -ENTRY(__down_failed_trylock) - movel %a0,-(%sp) - movel %d1,-(%sp) - movel %a1,-(%sp) - jbsr __down_trylock - movel (%sp)+,%a1 - movel (%sp)+,%d1 - movel (%sp)+,%a0 - rts - diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 9e78e1a4ca1..6fcdb6fda2e 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o topology.o traps.o unaligned.o obj-$(CONFIG_CEVT_BCM1480) += cevt-bcm1480.o diff --git a/arch/mips/kernel/semaphore.c b/arch/mips/kernel/semaphore.c deleted file mode 100644 index 1265358cdca..00000000000 --- a/arch/mips/kernel/semaphore.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * MIPS-specific semaphore code. - * - * Copyright (C) 1999 Cort Dougan - * Copyright (C) 2004 Ralf Baechle - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * April 2001 - Reworked by Paul Mackerras - * to eliminate the SMP races in the old version between the updates - * of `count' and `waking'. Now we use negative `count' values to - * indicate that some process(es) are waiting for the semaphore. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -/* - * Atomically update sem->count. - * This does the equivalent of the following: - * - * old_count = sem->count; - * tmp = MAX(old_count, 0) + incr; - * sem->count = tmp; - * return old_count; - * - * On machines without lld/scd we need a spinlock to make the manipulation of - * sem->count and sem->waking atomic. Scalability isn't an issue because - * this lock is used on UP only so it's just an empty variable. 
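The ll/sc sequences below implement exactly the update spelled out in the comment above. In plain C (single-threaded model with no atomicity; the spinlocked fallback further down is the real non-LL/SC equivalent) it reads:

/* Plain-C restatement of __sem_update_count(), illustrative only.
 * incr is +1 from __up(), -1 from the down paths, and 0 when a
 * pending signal resets the count. */
static int sem_update_count_model(int *count, int incr)
{
	int old_count = *count;
	int tmp = (old_count > 0 ? old_count : 0) + incr;

	*count = tmp;
	return old_count;
}

Because the pre-clamp value is what gets returned, a task in __down() knows it really acquired the semaphore only when the value it saw was greater than zero, which is what the "while (__sem_update_count(sem, -1) <= 0)" loops below test.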
- */ -static inline int __sem_update_count(struct semaphore *sem, int incr) -{ - int old_count, tmp; - - if (cpu_has_llsc && R10000_LLSC_WAR) { - __asm__ __volatile__( - " .set mips3 \n" - "1: ll %0, %2 # __sem_update_count \n" - " sra %1, %0, 31 \n" - " not %1 \n" - " and %1, %0, %1 \n" - " addu %1, %1, %3 \n" - " sc %1, %2 \n" - " beqzl %1, 1b \n" - " .set mips0 \n" - : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) - : "r" (incr), "m" (sem->count)); - } else if (cpu_has_llsc) { - __asm__ __volatile__( - " .set mips3 \n" - "1: ll %0, %2 # __sem_update_count \n" - " sra %1, %0, 31 \n" - " not %1 \n" - " and %1, %0, %1 \n" - " addu %1, %1, %3 \n" - " sc %1, %2 \n" - " beqz %1, 1b \n" - " .set mips0 \n" - : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) - : "r" (incr), "m" (sem->count)); - } else { - static DEFINE_SPINLOCK(semaphore_lock); - unsigned long flags; - - spin_lock_irqsave(&semaphore_lock, flags); - old_count = atomic_read(&sem->count); - tmp = max_t(int, old_count, 0) + incr; - atomic_set(&sem->count, tmp); - spin_unlock_irqrestore(&semaphore_lock, flags); - } - - return old_count; -} - -void __up(struct semaphore *sem) -{ - /* - * Note that we incremented count in up() before we came here, - * but that was ineffective since the result was <= 0, and - * any negative value of count is equivalent to 0. - * This ends up setting count to 1, unless count is now > 0 - * (i.e. because some other cpu has called up() in the meantime), - * in which case we just increment count. - */ - __sem_update_count(sem, 1); - wake_up(&sem->wait); -} - -EXPORT_SYMBOL(__up); - -/* - * Note that when we come in to __down or __down_interruptible, - * we have already decremented count, but that decrement was - * ineffective since the result was < 0, and any negative value - * of count is equivalent to 0. - * Thus it is only when we decrement count from some value > 0 - * that we have actually got the semaphore. - */ -void __sched __down(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - - /* - * Try to get the semaphore. If the count is > 0, then we've - * got the semaphore; we decrement count and exit the loop. - * If the count is 0 or negative, we set it to -1, indicating - * that we are asleep, and then sleep. - */ - while (__sem_update_count(sem, -1) <= 0) { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - - /* - * If there are any more sleepers, wake one of them up so - * that it can either get the semaphore, or set count to -1 - * indicating that there are still processes sleeping. - */ - wake_up(&sem->wait); -} - -EXPORT_SYMBOL(__down); - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - - while (__sem_update_count(sem, -1) <= 0) { - if (signal_pending(current)) { - /* - * A signal is pending - give up trying. - * Set sem->count to 0 if it is negative, - * since we are no longer sleeping. 
- */ - __sem_update_count(sem, 0); - retval = -EINTR; - break; - } - schedule(); - set_task_state(tsk, TASK_INTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - - wake_up(&sem->wait); - return retval; -} - -EXPORT_SYMBOL(__down_interruptible); diff --git a/arch/mn10300/kernel/Makefile b/arch/mn10300/kernel/Makefile index ef07c956170..23f2ab67574 100644 --- a/arch/mn10300/kernel/Makefile +++ b/arch/mn10300/kernel/Makefile @@ -3,7 +3,7 @@ # extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o fpu.o traps.o irq.o \ +obj-y := process.o signal.o entry.o fpu.o traps.o irq.o \ ptrace.o setup.o time.o sys_mn10300.o io.o kthread.o \ switch_to.o mn10300_ksyms.o kernel_execve.o diff --git a/arch/mn10300/kernel/semaphore.c b/arch/mn10300/kernel/semaphore.c deleted file mode 100644 index 9153c4039fd..00000000000 --- a/arch/mn10300/kernel/semaphore.c +++ /dev/null @@ -1,149 +0,0 @@ -/* MN10300 Semaphore implementation - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#include -#include -#include - -struct sem_waiter { - struct list_head list; - struct task_struct *task; -}; - -#if SEMAPHORE_DEBUG -void semtrace(struct semaphore *sem, const char *str) -{ - if (sem->debug) - printk(KERN_DEBUG "[%d] %s({%d,%d})\n", - current->pid, - str, - atomic_read(&sem->count), - list_empty(&sem->wait_list) ? 0 : 1); -} -#else -#define semtrace(SEM, STR) do { } while (0) -#endif - -/* - * wait for a token to be granted from a semaphore - * - entered with lock held and interrupts disabled - */ -void __down(struct semaphore *sem, unsigned long flags) -{ - struct task_struct *tsk = current; - struct sem_waiter waiter; - - semtrace(sem, "Entering __down"); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the semaphore */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - for (;;) { - if (!waiter.task) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; - semtrace(sem, "Leaving __down"); -} -EXPORT_SYMBOL(__down); - -/* - * interruptibly wait for a token to be granted from a semaphore - * - entered with lock held and interrupts disabled - */ -int __down_interruptible(struct semaphore *sem, unsigned long flags) -{ - struct task_struct *tsk = current; - struct sem_waiter waiter; - int ret; - - semtrace(sem, "Entering __down_interruptible"); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - set_task_state(tsk, TASK_INTERRUPTIBLE); - - spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the semaphore */ - ret = 0; - for (;;) { - if (!waiter.task) - break; - if (unlikely(signal_pending(current))) - goto interrupted; - schedule(); - set_task_state(tsk, TASK_INTERRUPTIBLE); - } - - out: - tsk->state = TASK_RUNNING; - semtrace(sem, "Leaving __down_interruptible"); - return ret; - - 
interrupted: - spin_lock_irqsave(&sem->wait_lock, flags); - list_del(&waiter.list); - spin_unlock_irqrestore(&sem->wait_lock, flags); - - ret = 0; - if (!waiter.task) { - put_task_struct(current); - ret = -EINTR; - } - goto out; -} -EXPORT_SYMBOL(__down_interruptible); - -/* - * release a single token back to a semaphore - * - entered with lock held and interrupts disabled - */ -void __up(struct semaphore *sem) -{ - struct task_struct *tsk; - struct sem_waiter *waiter; - - semtrace(sem, "Entering __up"); - - /* grant the token to the process at the front of the queue */ - waiter = list_entry(sem->wait_list.next, struct sem_waiter, list); - - /* We must be careful not to touch 'waiter' after we set ->task = NULL. - * It is an allocated on the waiter's stack and may become invalid at - * any time after that point (due to a wakeup from another source). - */ - list_del_init(&waiter->list); - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - - semtrace(sem, "Leaving __up"); -} -EXPORT_SYMBOL(__up); diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index 27827bc3717..1f6585a56f9 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -9,7 +9,7 @@ AFLAGS_pacache.o := -traditional obj-y := cache.o pacache.o setup.o traps.o time.o irq.o \ pa7300lc.o syscall.o entry.o sys_parisc.o firmware.o \ - ptrace.o hardware.o inventory.o drivers.o semaphore.o \ + ptrace.o hardware.o inventory.o drivers.o \ signal.o hpmc.o real2.o parisc_ksyms.o unaligned.o \ process.o processor.o pdc_cons.o pdc_chassis.o unwind.o \ topology.o diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index 7aca704e96f..5b7fc4aa044 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -69,11 +69,6 @@ EXPORT_SYMBOL(memcpy_toio); EXPORT_SYMBOL(memcpy_fromio); EXPORT_SYMBOL(memset_io); -#include -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__down); - extern void $$divI(void); extern void $$divU(void); extern void $$remI(void); diff --git a/arch/parisc/kernel/semaphore.c b/arch/parisc/kernel/semaphore.c deleted file mode 100644 index ee806bcc372..00000000000 --- a/arch/parisc/kernel/semaphore.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Semaphore implementation Copyright (c) 2001 Matthew Wilcox, Hewlett-Packard - */ - -#include -#include -#include -#include - -/* - * Semaphores are complex as we wish to avoid using two variables. - * `count' has multiple roles, depending on its value. If it is positive - * or zero, there are no waiters. The functions here will never be - * called; see - * - * When count is -1 it indicates there is at least one task waiting - * for the semaphore. - * - * When count is less than that, there are '- count - 1' wakeups - * pending. ie if it has value -3, there are 2 wakeups pending. - * - * Note that these functions are only called when there is contention - * on the lock, and as such all this is the "non-critical" part of the - * whole semaphore business. The critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. 
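The multi-role 'count' described above is compact enough that a worked check helps: the wakers() macro defined just below recovers the number of pending wakeups from a negative count. A minimal standalone check of the convention (the parenthesised macro argument is this sketch's own hygiene, not the original):

#include <assert.h>

#define wakers(count)	(-1 - (count))

int main(void)
{
	assert(wakers(-1) == 0);	/* waiters exist, no wakeup pending */
	assert(wakers(-3) == 2);	/* "value -3, 2 wakeups pending" */
	return 0;
}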
- */ -void __up(struct semaphore *sem) -{ - sem->count--; - wake_up(&sem->wait); -} - -#define wakers(count) (-1 - count) - -#define DOWN_HEAD \ - int ret = 0; \ - DECLARE_WAITQUEUE(wait, current); \ - \ - /* Note that someone is waiting */ \ - if (sem->count == 0) \ - sem->count = -1; \ - \ - /* protected by the sentry still -- use unlocked version */ \ - wait.flags = WQ_FLAG_EXCLUSIVE; \ - __add_wait_queue_tail(&sem->wait, &wait); \ - lost_race: \ - spin_unlock_irq(&sem->sentry); \ - -#define DOWN_TAIL \ - spin_lock_irq(&sem->sentry); \ - if (wakers(sem->count) == 0 && ret == 0) \ - goto lost_race; /* Someone stole our wakeup */ \ - __remove_wait_queue(&sem->wait, &wait); \ - current->state = TASK_RUNNING; \ - if (!waitqueue_active(&sem->wait) && (sem->count < 0)) \ - sem->count = wakers(sem->count); - -#define UPDATE_COUNT \ - sem->count += (sem->count < 0) ? 1 : - 1; - - -void __sched __down(struct semaphore * sem) -{ - DOWN_HEAD - - for(;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); - /* we can _read_ this without the sentry */ - if (sem->count != -1) - break; - schedule(); - } - - DOWN_TAIL - UPDATE_COUNT -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - DOWN_HEAD - - for(;;) { - set_task_state(current, TASK_INTERRUPTIBLE); - /* we can _read_ this without the sentry */ - if (sem->count != -1) - break; - - if (signal_pending(current)) { - ret = -EINTR; - break; - } - schedule(); - } - - DOWN_TAIL - - if (!ret) { - UPDATE_COUNT - } - - return ret; -} diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index c1baf9d5903..b9dbfff9afe 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -12,7 +12,7 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o \ signal.o diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 9c98424277a..65d14e6ddc3 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -15,7 +15,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/semaphore.c b/arch/powerpc/kernel/semaphore.c deleted file mode 100644 index 2f8c3c95139..00000000000 --- a/arch/powerpc/kernel/semaphore.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * PowerPC-specific semaphore code. - * - * Copyright (C) 1999 Cort Dougan - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * April 2001 - Reworked by Paul Mackerras - * to eliminate the SMP races in the old version between the updates - * of `count' and `waking'. Now we use negative `count' values to - * indicate that some process(es) are waiting for the semaphore. - */ - -#include -#include -#include - -#include -#include -#include - -/* - * Atomically update sem->count. - * This does the equivalent of the following: - * - * old_count = sem->count; - * tmp = MAX(old_count, 0) + incr; - * sem->count = tmp; - * return old_count; - */ -static inline int __sem_update_count(struct semaphore *sem, int incr) -{ - int old_count, tmp; - - __asm__ __volatile__("\n" -"1: lwarx %0,0,%3\n" -" srawi %1,%0,31\n" -" andc %1,%0,%1\n" -" add %1,%1,%4\n" - PPC405_ERR77(0,%3) -" stwcx. 
%1,0,%3\n" -" bne 1b" - : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) - : "r" (&sem->count), "r" (incr), "m" (sem->count) - : "cc"); - - return old_count; -} - -void __up(struct semaphore *sem) -{ - /* - * Note that we incremented count in up() before we came here, - * but that was ineffective since the result was <= 0, and - * any negative value of count is equivalent to 0. - * This ends up setting count to 1, unless count is now > 0 - * (i.e. because some other cpu has called up() in the meantime), - * in which case we just increment count. - */ - __sem_update_count(sem, 1); - wake_up(&sem->wait); -} -EXPORT_SYMBOL(__up); - -/* - * Note that when we come in to __down or __down_interruptible, - * we have already decremented count, but that decrement was - * ineffective since the result was < 0, and any negative value - * of count is equivalent to 0. - * Thus it is only when we decrement count from some value > 0 - * that we have actually got the semaphore. - */ -void __sched __down(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - - /* - * Try to get the semaphore. If the count is > 0, then we've - * got the semaphore; we decrement count and exit the loop. - * If the count is 0 or negative, we set it to -1, indicating - * that we are asleep, and then sleep. - */ - while (__sem_update_count(sem, -1) <= 0) { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - - /* - * If there are any more sleepers, wake one of them up so - * that it can either get the semaphore, or set count to -1 - * indicating that there are still processes sleeping. - */ - wake_up(&sem->wait); -} -EXPORT_SYMBOL(__down); - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - - while (__sem_update_count(sem, -1) <= 0) { - if (signal_pending(current)) { - /* - * A signal is pending - give up trying. - * Set sem->count to 0 if it is negative, - * since we are no longer sleeping. - */ - __sem_update_count(sem, 0); - retval = -EINTR; - break; - } - schedule(); - set_task_state(tsk, TASK_INTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - - wake_up(&sem->wait); - return retval; -} -EXPORT_SYMBOL(__down_interruptible); diff --git a/arch/ppc/kernel/semaphore.c b/arch/ppc/kernel/semaphore.c deleted file mode 100644 index 2fe429b27c1..00000000000 --- a/arch/ppc/kernel/semaphore.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * PowerPC-specific semaphore code. - * - * Copyright (C) 1999 Cort Dougan - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * April 2001 - Reworked by Paul Mackerras - * to eliminate the SMP races in the old version between the updates - * of `count' and `waking'. Now we use negative `count' values to - * indicate that some process(es) are waiting for the semaphore. - */ - -#include -#include -#include -#include -#include - -/* - * Atomically update sem->count. 
- * This does the equivalent of the following: - * - * old_count = sem->count; - * tmp = MAX(old_count, 0) + incr; - * sem->count = tmp; - * return old_count; - */ -static inline int __sem_update_count(struct semaphore *sem, int incr) -{ - int old_count, tmp; - - __asm__ __volatile__("\n" -"1: lwarx %0,0,%3\n" -" srawi %1,%0,31\n" -" andc %1,%0,%1\n" -" add %1,%1,%4\n" - PPC405_ERR77(0,%3) -" stwcx. %1,0,%3\n" -" bne 1b" - : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) - : "r" (&sem->count), "r" (incr), "m" (sem->count) - : "cc"); - - return old_count; -} - -void __up(struct semaphore *sem) -{ - /* - * Note that we incremented count in up() before we came here, - * but that was ineffective since the result was <= 0, and - * any negative value of count is equivalent to 0. - * This ends up setting count to 1, unless count is now > 0 - * (i.e. because some other cpu has called up() in the meantime), - * in which case we just increment count. - */ - __sem_update_count(sem, 1); - wake_up(&sem->wait); -} - -/* - * Note that when we come in to __down or __down_interruptible, - * we have already decremented count, but that decrement was - * ineffective since the result was < 0, and any negative value - * of count is equivalent to 0. - * Thus it is only when we decrement count from some value > 0 - * that we have actually got the semaphore. - */ -void __sched __down(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - tsk->state = TASK_UNINTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - smp_wmb(); - - /* - * Try to get the semaphore. If the count is > 0, then we've - * got the semaphore; we decrement count and exit the loop. - * If the count is 0 or negative, we set it to -1, indicating - * that we are asleep, and then sleep. - */ - while (__sem_update_count(sem, -1) <= 0) { - schedule(); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue(&sem->wait, &wait); - tsk->state = TASK_RUNNING; - - /* - * If there are any more sleepers, wake one of them up so - * that it can either get the semaphore, or set count to -1 - * indicating that there are still processes sleeping. - */ - wake_up(&sem->wait); -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - smp_wmb(); - - while (__sem_update_count(sem, -1) <= 0) { - if (signal_pending(current)) { - /* - * A signal is pending - give up trying. - * Set sem->count to 0 if it is negative, - * since we are no longer sleeping. 
- */ - __sem_update_count(sem, 0); - retval = -EINTR; - break; - } - schedule(); - tsk->state = TASK_INTERRUPTIBLE; - } - tsk->state = TASK_RUNNING; - remove_wait_queue(&sem->wait, &wait); - wake_up(&sem->wait); - return retval; -} diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 4d3e38392cb..ce144b67f06 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -11,7 +11,7 @@ CFLAGS_smp.o := -Wno-nonnull obj-y := bitmap.o traps.o time.o process.o base.o early.o \ setup.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \ - semaphore.o s390_ext.o debug.o irq.o ipl.o dis.o diag.o + s390_ext.o debug.o irq.o ipl.o dis.o diag.o obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o) obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o) diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c index 7234c737f82..48238a114ce 100644 --- a/arch/s390/kernel/s390_ksyms.c +++ b/arch/s390/kernel/s390_ksyms.c @@ -26,13 +26,6 @@ EXPORT_SYMBOL(_ni_bitmap); EXPORT_SYMBOL(_zb_findmap); EXPORT_SYMBOL(_sb_findmap); -/* - * semaphore ops - */ -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); - /* * binfmt_elf loader */ diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c deleted file mode 100644 index 191303f6c1d..00000000000 --- a/arch/s390/kernel/semaphore.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * linux/arch/s390/kernel/semaphore.c - * - * S390 version - * Copyright (C) 1998-2000 IBM Corporation - * Author(s): Martin Schwidefsky - * - * Derived from "linux/arch/i386/kernel/semaphore.c - * Copyright (C) 1999, Linus Torvalds - * - */ -#include -#include -#include - -#include - -/* - * Atomically update sem->count. Equivalent to: - * old_val = sem->count.counter; - * new_val = ((old_val >= 0) ? old_val : 0) + incr; - * sem->count.counter = new_val; - * return old_val; - */ -static inline int __sem_update_count(struct semaphore *sem, int incr) -{ - int old_val, new_val; - - asm volatile( - " l %0,0(%3)\n" - "0: ltr %1,%0\n" - " jhe 1f\n" - " lhi %1,0\n" - "1: ar %1,%4\n" - " cs %0,%1,0(%3)\n" - " jl 0b\n" - : "=&d" (old_val), "=&d" (new_val), "=m" (sem->count) - : "a" (&sem->count), "d" (incr), "m" (sem->count) - : "cc"); - return old_val; -} - -/* - * The inline function up() incremented count but the result - * was <= 0. This indicates that some process is waiting on - * the semaphore. The semaphore is free and we'll wake the - * first sleeping process, so we set count to 1 unless some - * other cpu has called up in the meantime in which case - * we just increment count by 1. - */ -void __up(struct semaphore *sem) -{ - __sem_update_count(sem, 1); - wake_up(&sem->wait); -} - -/* - * The inline function down() decremented count and the result - * was < 0. The wait loop will atomically test and update the - * semaphore counter following the rules: - * count > 0: decrement count, wake up queue and exit. - * count <= 0: set count to -1, go to sleep. - */ -void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - while (__sem_update_count(sem, -1) <= 0) { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - wake_up(&sem->wait); -} - -/* - * Same as __down() with an additional test for signals. 
- * If a signal is pending the count is updated as follows: - * count > 0: wake up queue and exit. - * count <= 0: set count to 0, wake up queue and exit. - */ -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - __set_task_state(tsk, TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&sem->wait, &wait); - while (__sem_update_count(sem, -1) <= 0) { - if (signal_pending(current)) { - __sem_update_count(sem, 0); - retval = -EINTR; - break; - } - schedule(); - set_task_state(tsk, TASK_INTERRUPTIBLE); - } - remove_wait_queue(&sem->wait, &wait); - __set_task_state(tsk, TASK_RUNNING); - wake_up(&sem->wait); - return retval; -} - diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32 index 62bf373266f..4bbdce36b92 100644 --- a/arch/sh/kernel/Makefile_32 +++ b/arch/sh/kernel/Makefile_32 @@ -5,7 +5,7 @@ extra-y := head_32.o init_task.o vmlinux.lds obj-y := debugtraps.o io.o io_generic.o irq.o machvec.o process_32.o \ - ptrace_32.o semaphore.o setup.o signal_32.o sys_sh.o sys_sh32.o \ + ptrace_32.o setup.o signal_32.o sys_sh.o sys_sh32.o \ syscalls_32.o time_32.o topology.o traps.o traps_32.o obj-y += cpu/ timers/ diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64 index e01283d49cb..6edf53b93d9 100644 --- a/arch/sh/kernel/Makefile_64 +++ b/arch/sh/kernel/Makefile_64 @@ -1,7 +1,7 @@ extra-y := head_64.o init_task.o vmlinux.lds obj-y := debugtraps.o io.o io_generic.o irq.o machvec.o process_64.o \ - ptrace_64.o semaphore.o setup.o signal_64.o sys_sh.o sys_sh64.o \ + ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \ syscalls_64.o time_64.o topology.o traps.o traps_64.o obj-y += cpu/ timers/ diff --git a/arch/sh/kernel/semaphore.c b/arch/sh/kernel/semaphore.c deleted file mode 100644 index 184119eeae5..00000000000 --- a/arch/sh/kernel/semaphore.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Just taken from alpha implementation. - * This can't work well, perhaps. - */ -/* - * Generic semaphore code. Buyer beware. Do your own - * specific changes in - */ - -#include -#include -#include -#include -#include -#include - -DEFINE_SPINLOCK(semaphore_wake_lock); - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to sleep, while the "waking" variable is - * incremented when the "up()" code goes to wake up waiting - * processes. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * waking_non_zero() (from asm/semaphore.h) must execute - * atomically. - * - * When __up() is called, the count was negative before - * incrementing it, and we need to wake up somebody. - * - * This routine adds one to the count of processes that need to - * wake up and exit. ALL waiting processes actually wake up but - * only the one that gets to the "waking" field first will gate - * through and acquire the semaphore. The others will go back - * to sleep. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ -void __up(struct semaphore *sem) -{ - wake_one_more(sem); - wake_up(&sem->wait); -} - -/* - * Perform the "down" function. 
Return zero for semaphore acquired, - * return negative for signalled out of the function. - * - * If called from __down, the return is ignored and the wait loop is - * not interruptible. This means that a task waiting on a semaphore - * using "down()" cannot be killed until someone does an "up()" on - * the semaphore. - * - * If called from __down_interruptible, the return value gets checked - * upon return. If the return value is negative then the task continues - * with the negative value in the return register (it can be tested by - * the caller). - * - * Either form may be used in conjunction with "up()". - * - */ - -#define DOWN_VAR \ - struct task_struct *tsk = current; \ - wait_queue_t wait; \ - init_waitqueue_entry(&wait, tsk); - -#define DOWN_HEAD(task_state) \ - \ - \ - tsk->state = (task_state); \ - add_wait_queue(&sem->wait, &wait); \ - \ - /* \ - * Ok, we're set up. sem->count is known to be less than zero \ - * so we must wait. \ - * \ - * We can let go the lock for purposes of waiting. \ - * We re-acquire it after awaking so as to protect \ - * all semaphore operations. \ - * \ - * If "up()" is called before we call waking_non_zero() then \ - * we will catch it right away. If it is called later then \ - * we will have to go through a wakeup cycle to catch it. \ - * \ - * Multiple waiters contend for the semaphore lock to see \ - * who gets to gate through and who has to wait some more. \ - */ \ - for (;;) { - -#define DOWN_TAIL(task_state) \ - tsk->state = (task_state); \ - } \ - tsk->state = TASK_RUNNING; \ - remove_wait_queue(&sem->wait, &wait); - -void __sched __down(struct semaphore * sem) -{ - DOWN_VAR - DOWN_HEAD(TASK_UNINTERRUPTIBLE) - if (waking_non_zero(sem)) - break; - schedule(); - DOWN_TAIL(TASK_UNINTERRUPTIBLE) -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int ret = 0; - DOWN_VAR - DOWN_HEAD(TASK_INTERRUPTIBLE) - - ret = waking_non_zero_interruptible(sem, tsk); - if (ret) - { - if (ret == 1) - /* ret != 0 only if we get interrupted -arca */ - ret = 0; - break; - } - schedule(); - DOWN_TAIL(TASK_INTERRUPTIBLE) - return ret; -} - -int __down_trylock(struct semaphore * sem) -{ - return waking_non_zero_trylock(sem); -} diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c index 45bb333fd9e..6d405462cee 100644 --- a/arch/sh/kernel/sh_ksyms_32.c +++ b/arch/sh/kernel/sh_ksyms_32.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -48,12 +47,6 @@ EXPORT_SYMBOL(__copy_user); EXPORT_SYMBOL(get_vm_area); #endif -/* semaphore exports */ -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__down_trylock); - EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__const_udelay); diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c index b6410ce4bd1..a310c9707f0 100644 --- a/arch/sh/kernel/sh_ksyms_64.c +++ b/arch/sh/kernel/sh_ksyms_64.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -37,9 +36,6 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck); EXPORT_SYMBOL(screen_info); #endif -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_trylock); -EXPORT_SYMBOL(__up); EXPORT_SYMBOL(__put_user_asm_l); EXPORT_SYMBOL(__get_user_asm_l); EXPORT_SYMBOL(copy_page); diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index bf1b15d3f6f..2712bb166f6 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -12,7 +12,7 @@ obj-y := entry.o wof.o wuf.o etrap.o rtrap.o traps.o $(IRQ_OBJS) 
\ sys_sparc.o sunos_asm.o systbls.o \ time.o windows.o cpu.o devices.o sclow.o \ tadpole.o tick14.o ptrace.o sys_solaris.o \ - unaligned.o una_asm.o muldiv.o semaphore.o \ + unaligned.o una_asm.o muldiv.o \ prom.o of_device.o devres.o devres-y = ../../../kernel/irq/devres.o diff --git a/arch/sparc/kernel/semaphore.c b/arch/sparc/kernel/semaphore.c deleted file mode 100644 index 0c37c1a7cd7..00000000000 --- a/arch/sparc/kernel/semaphore.c +++ /dev/null @@ -1,155 +0,0 @@ -/* $Id: semaphore.c,v 1.7 2001/04/18 21:06:05 davem Exp $ */ - -/* sparc32 semaphore implementation, based on i386 version */ - -#include -#include -#include - -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is - * protected by the semaphore spinlock. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -static DEFINE_SPINLOCK(semaphore_lock); - -void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - tsk->state = TASK_UNINTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - spin_lock_irq(&semaphore_lock); - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock. - */ - if (!atomic24_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irq(&semaphore_lock); - - schedule(); - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irq(&semaphore_lock); - } - spin_unlock_irq(&semaphore_lock); - remove_wait_queue(&sem->wait, &wait); - tsk->state = TASK_RUNNING; - wake_up(&sem->wait); -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - spin_lock_irq(&semaphore_lock); - sem->sleepers ++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic24_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock. 
The - * "-1" is because we're still hoping to get - * the lock. - */ - if (!atomic24_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irq(&semaphore_lock); - - schedule(); - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irq(&semaphore_lock); - } - spin_unlock_irq(&semaphore_lock); - tsk->state = TASK_RUNNING; - remove_wait_queue(&sem->wait, &wait); - wake_up(&sem->wait); - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - */ -int __down_trylock(struct semaphore * sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&semaphore_lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock. - */ - if (!atomic24_add_negative(sleepers, &sem->count)) - wake_up(&sem->wait); - - spin_unlock_irqrestore(&semaphore_lock, flags); - return 1; -} diff --git a/arch/sparc/kernel/sparc_ksyms.c b/arch/sparc/kernel/sparc_ksyms.c index c1025e55165..97b1de0e909 100644 --- a/arch/sparc/kernel/sparc_ksyms.c +++ b/arch/sparc/kernel/sparc_ksyms.c @@ -107,11 +107,6 @@ EXPORT_SYMBOL(___rw_read_try); EXPORT_SYMBOL(___rw_read_exit); EXPORT_SYMBOL(___rw_write_enter); #endif -/* semaphores */ -EXPORT_SYMBOL(__up); -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_trylock); -EXPORT_SYMBOL(__down_interruptible); EXPORT_SYMBOL(sparc_valid_addr_bitmap); EXPORT_SYMBOL(phys_base); diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile index 1bf5b187de4..459462e80a1 100644 --- a/arch/sparc64/kernel/Makefile +++ b/arch/sparc64/kernel/Makefile @@ -10,7 +10,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o setup.o cpu.o idprom.o \ traps.o auxio.o una_asm.o sysfs.o iommu.o \ irq.o ptrace.o time.o sys_sparc.o signal.o \ - unaligned.o central.o pci.o starfire.o semaphore.o \ + unaligned.o central.o pci.o starfire.o \ power.o sbus.o sparc64_ksyms.o chmc.o \ visemul.o prom.o of_device.o hvapi.o sstate.o mdesc.o diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c deleted file mode 100644 index 9974a689955..00000000000 --- a/arch/sparc64/kernel/semaphore.c +++ /dev/null @@ -1,254 +0,0 @@ -/* semaphore.c: Sparc64 semaphore implementation. - * - * This is basically the PPC semaphore scheme ported to use - * the sparc64 atomic instructions, so see the PPC code for - * credits. - */ - -#include -#include -#include - -/* - * Atomically update sem->count. - * This does the equivalent of the following: - * - * old_count = sem->count; - * tmp = MAX(old_count, 0) + incr; - * sem->count = tmp; - * return old_count; - */ -static inline int __sem_update_count(struct semaphore *sem, int incr) -{ - int old_count, tmp; - - __asm__ __volatile__("\n" -" ! 
__sem_update_count old_count(%0) tmp(%1) incr(%4) &sem->count(%3)\n" -"1: ldsw [%3], %0\n" -" mov %0, %1\n" -" cmp %0, 0\n" -" movl %%icc, 0, %1\n" -" add %1, %4, %1\n" -" cas [%3], %0, %1\n" -" cmp %0, %1\n" -" membar #StoreLoad | #StoreStore\n" -" bne,pn %%icc, 1b\n" -" nop\n" - : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) - : "r" (&sem->count), "r" (incr), "m" (sem->count) - : "cc"); - - return old_count; -} - -static void __up(struct semaphore *sem) -{ - __sem_update_count(sem, 1); - wake_up(&sem->wait); -} - -void up(struct semaphore *sem) -{ - /* This atomically does: - * old_val = sem->count; - * new_val = sem->count + 1; - * sem->count = new_val; - * if (old_val < 0) - * __up(sem); - * - * The (old_val < 0) test is equivalent to - * the more straightforward (new_val <= 0), - * but it is easier to test the former because - * of how the CAS instruction works. - */ - - __asm__ __volatile__("\n" -" ! up sem(%0)\n" -" membar #StoreLoad | #LoadLoad\n" -"1: lduw [%0], %%g1\n" -" add %%g1, 1, %%g7\n" -" cas [%0], %%g1, %%g7\n" -" cmp %%g1, %%g7\n" -" bne,pn %%icc, 1b\n" -" addcc %%g7, 1, %%g0\n" -" membar #StoreLoad | #StoreStore\n" -" ble,pn %%icc, 3f\n" -" nop\n" -"2:\n" -" .subsection 2\n" -"3: mov %0, %%g1\n" -" save %%sp, -160, %%sp\n" -" call %1\n" -" mov %%g1, %%o0\n" -" ba,pt %%xcc, 2b\n" -" restore\n" -" .previous\n" - : : "r" (sem), "i" (__up) - : "g1", "g2", "g3", "g7", "memory", "cc"); -} - -static void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - tsk->state = TASK_UNINTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - while (__sem_update_count(sem, -1) <= 0) { - schedule(); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue(&sem->wait, &wait); - tsk->state = TASK_RUNNING; - - wake_up(&sem->wait); -} - -void __sched down(struct semaphore *sem) -{ - might_sleep(); - /* This atomically does: - * old_val = sem->count; - * new_val = sem->count - 1; - * sem->count = new_val; - * if (old_val < 1) - * __down(sem); - * - * The (old_val < 1) test is equivalent to - * the more straightforward (new_val < 0), - * but it is easier to test the former because - * of how the CAS instruction works. - */ - - __asm__ __volatile__("\n" -" ! down sem(%0)\n" -"1: lduw [%0], %%g1\n" -" sub %%g1, 1, %%g7\n" -" cas [%0], %%g1, %%g7\n" -" cmp %%g1, %%g7\n" -" bne,pn %%icc, 1b\n" -" cmp %%g7, 1\n" -" membar #StoreLoad | #StoreStore\n" -" bl,pn %%icc, 3f\n" -" nop\n" -"2:\n" -" .subsection 2\n" -"3: mov %0, %%g1\n" -" save %%sp, -160, %%sp\n" -" call %1\n" -" mov %%g1, %%o0\n" -" ba,pt %%xcc, 2b\n" -" restore\n" -" .previous\n" - : : "r" (sem), "i" (__down) - : "g1", "g2", "g3", "g7", "memory", "cc"); -} - -int down_trylock(struct semaphore *sem) -{ - int ret; - - /* This atomically does: - * old_val = sem->count; - * new_val = sem->count - 1; - * if (old_val < 1) { - * ret = 1; - * } else { - * sem->count = new_val; - * ret = 0; - * } - * - * The (old_val < 1) test is equivalent to - * the more straightforward (new_val < 0), - * but it is easier to test the former because - * of how the CAS instruction works. - */ - - __asm__ __volatile__("\n" -" ! 
down_trylock sem(%1) ret(%0)\n" -"1: lduw [%1], %%g1\n" -" sub %%g1, 1, %%g7\n" -" cmp %%g1, 1\n" -" bl,pn %%icc, 2f\n" -" mov 1, %0\n" -" cas [%1], %%g1, %%g7\n" -" cmp %%g1, %%g7\n" -" bne,pn %%icc, 1b\n" -" mov 0, %0\n" -" membar #StoreLoad | #StoreStore\n" -"2:\n" - : "=&r" (ret) - : "r" (sem) - : "g1", "g7", "memory", "cc"); - - return ret; -} - -static int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - while (__sem_update_count(sem, -1) <= 0) { - if (signal_pending(current)) { - __sem_update_count(sem, 0); - retval = -EINTR; - break; - } - schedule(); - tsk->state = TASK_INTERRUPTIBLE; - } - tsk->state = TASK_RUNNING; - remove_wait_queue(&sem->wait, &wait); - wake_up(&sem->wait); - return retval; -} - -int __sched down_interruptible(struct semaphore *sem) -{ - int ret = 0; - - might_sleep(); - /* This atomically does: - * old_val = sem->count; - * new_val = sem->count - 1; - * sem->count = new_val; - * if (old_val < 1) - * ret = __down_interruptible(sem); - * - * The (old_val < 1) test is equivalent to - * the more straightforward (new_val < 0), - * but it is easier to test the former because - * of how the CAS instruction works. - */ - - __asm__ __volatile__("\n" -" ! down_interruptible sem(%2) ret(%0)\n" -"1: lduw [%2], %%g1\n" -" sub %%g1, 1, %%g7\n" -" cas [%2], %%g1, %%g7\n" -" cmp %%g1, %%g7\n" -" bne,pn %%icc, 1b\n" -" cmp %%g7, 1\n" -" membar #StoreLoad | #StoreStore\n" -" bl,pn %%icc, 3f\n" -" nop\n" -"2:\n" -" .subsection 2\n" -"3: mov %2, %%g1\n" -" save %%sp, -160, %%sp\n" -" call %3\n" -" mov %%g1, %%o0\n" -" ba,pt %%xcc, 2b\n" -" restore\n" -" .previous\n" - : "=r" (ret) - : "0" (ret), "r" (sem), "i" (__down_interruptible) - : "g1", "g2", "g3", "g7", "memory", "cc"); - return ret; -} diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 51fa773f38c..051b8d9cb98 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -130,12 +130,6 @@ EXPORT_SYMBOL(_mcount); EXPORT_SYMBOL(sparc64_get_clock_tick); -/* semaphores */ -EXPORT_SYMBOL(down); -EXPORT_SYMBOL(down_trylock); -EXPORT_SYMBOL(down_interruptible); -EXPORT_SYMBOL(up); - /* RW semaphores */ EXPORT_SYMBOL(__down_read); EXPORT_SYMBOL(__down_read_trylock); diff --git a/arch/um/Kconfig.i386 b/arch/um/Kconfig.i386 index 3cd8a04d66d..e09edfa560d 100644 --- a/arch/um/Kconfig.i386 +++ b/arch/um/Kconfig.i386 @@ -19,10 +19,6 @@ config 64BIT bool default n -config SEMAPHORE_SLEEPERS - bool - default y - config 3_LEVEL_PGTABLES bool "Three-level pagetables (EXPERIMENTAL)" default n diff --git a/arch/um/Kconfig.x86_64 b/arch/um/Kconfig.x86_64 index 6533b349f06..3fbe69e359e 100644 --- a/arch/um/Kconfig.x86_64 +++ b/arch/um/Kconfig.x86_64 @@ -11,10 +11,6 @@ config RWSEM_GENERIC_SPINLOCK bool default y -config SEMAPHORE_SLEEPERS - bool - default y - config 3_LEVEL_PGTABLES bool default y diff --git a/arch/um/sys-i386/ksyms.c b/arch/um/sys-i386/ksyms.c index 2a1eac1859c..bfbefd30db8 100644 --- a/arch/um/sys-i386/ksyms.c +++ b/arch/um/sys-i386/ksyms.c @@ -1,17 +1,5 @@ #include "linux/module.h" -#include "linux/in6.h" -#include "linux/rwsem.h" -#include "asm/byteorder.h" -#include "asm/delay.h" -#include "asm/semaphore.h" -#include "asm/uaccess.h" #include "asm/checksum.h" -#include "asm/errno.h" - -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); 
-EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial); diff --git a/arch/um/sys-ppc/Makefile b/arch/um/sys-ppc/Makefile index 08901526e89..b8bc844fd2c 100644 --- a/arch/um/sys-ppc/Makefile +++ b/arch/um/sys-ppc/Makefile @@ -3,7 +3,7 @@ OBJ = built-in.o .S.o: $(CC) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o -OBJS = ptrace.o sigcontext.o semaphore.o checksum.o miscthings.o misc.o \ +OBJS = ptrace.o sigcontext.o checksum.o miscthings.o misc.o \ ptrace_user.o sysrq.o EXTRA_AFLAGS := -DCONFIG_PPC32 -I. -I$(srctree)/arch/ppc/kernel @@ -20,10 +20,6 @@ ptrace_user.o: ptrace_user.c sigcontext.o: sigcontext.c $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $< -semaphore.c: - rm -f $@ - ln -s $(srctree)/arch/ppc/kernel/$@ $@ - checksum.S: rm -f $@ ln -s $(srctree)/arch/ppc/lib/$@ $@ @@ -66,4 +62,4 @@ misc.o: misc.S ppc_defs.h $(CC) $(EXTRA_AFLAGS) $(KBUILD_AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o rm -f asm -clean-files := $(OBJS) ppc_defs.h checksum.S semaphore.c mk_defs.c +clean-files := $(OBJS) ppc_defs.h checksum.S mk_defs.c diff --git a/arch/um/sys-x86_64/ksyms.c b/arch/um/sys-x86_64/ksyms.c index 12c593607c5..4d7d1a812d8 100644 --- a/arch/um/sys-x86_64/ksyms.c +++ b/arch/um/sys-x86_64/ksyms.c @@ -1,16 +1,5 @@ #include "linux/module.h" -#include "linux/in6.h" -#include "linux/rwsem.h" -#include "asm/byteorder.h" -#include "asm/semaphore.h" -#include "asm/uaccess.h" -#include "asm/checksum.h" -#include "asm/errno.h" - -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#include "asm/string.h" /*XXX: we need them because they would be exported by x86_64 */ EXPORT_SYMBOL(__memcpy); diff --git a/arch/v850/kernel/Makefile b/arch/v850/kernel/Makefile index 3930482bddc..da5889c5357 100644 --- a/arch/v850/kernel/Makefile +++ b/arch/v850/kernel/Makefile @@ -11,7 +11,7 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y += intv.o entry.o process.o syscalls.o time.o semaphore.o setup.o \ +obj-y += intv.o entry.o process.o syscalls.o time.o setup.o \ signal.o irq.o mach.o ptrace.o bug.o obj-$(CONFIG_MODULES) += module.o v850_ksyms.o # chip-specific code diff --git a/arch/v850/kernel/semaphore.c b/arch/v850/kernel/semaphore.c deleted file mode 100644 index fc89fd661c9..00000000000 --- a/arch/v850/kernel/semaphore.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * arch/v850/kernel/semaphore.c -- Semaphore support - * - * Copyright (C) 1998-2000 IBM Corporation - * Copyright (C) 1999 Linus Torvalds - * - * This file is subject to the terms and conditions of the GNU General - * Public License. See the file COPYING in the main directory of this - * archive for more details. - * - * This file is a copy of the s390 version, arch/s390/kernel/semaphore.c - * Author(s): Martin Schwidefsky - * which was derived from the i386 version, linux/arch/i386/kernel/semaphore.c - */ - -#include -#include -#include - -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is - * protected by the semaphore spinlock. 
- * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. - */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -static DEFINE_SPINLOCK(semaphore_lock); - -void __sched __down(struct semaphore * sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - tsk->state = TASK_UNINTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - spin_lock_irq(&semaphore_lock); - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irq(&semaphore_lock); - - schedule(); - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irq(&semaphore_lock); - } - spin_unlock_irq(&semaphore_lock); - remove_wait_queue(&sem->wait, &wait); - tsk->state = TASK_RUNNING; - wake_up(&sem->wait); -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue_exclusive(&sem->wait, &wait); - - spin_lock_irq(&semaphore_lock); - sem->sleepers ++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock. The - * "-1" is because we're still hoping to get - * the lock. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irq(&semaphore_lock); - - schedule(); - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irq(&semaphore_lock); - } - spin_unlock_irq(&semaphore_lock); - tsk->state = TASK_RUNNING; - remove_wait_queue(&sem->wait, &wait); - wake_up(&sem->wait); - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - */ -int __down_trylock(struct semaphore * sem) -{ - unsigned long flags; - int sleepers; - - spin_lock_irqsave(&semaphore_lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock. 
- */ - if (!atomic_add_negative(sleepers, &sem->count)) - wake_up(&sem->wait); - - spin_unlock_irqrestore(&semaphore_lock, flags); - return 1; -} diff --git a/arch/v850/kernel/v850_ksyms.c b/arch/v850/kernel/v850_ksyms.c index 93575fdc874..8d386a5dbc4 100644 --- a/arch/v850/kernel/v850_ksyms.c +++ b/arch/v850/kernel/v850_ksyms.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -34,12 +33,6 @@ EXPORT_SYMBOL (memset); EXPORT_SYMBOL (memcpy); EXPORT_SYMBOL (memmove); -/* semaphores */ -EXPORT_SYMBOL (__down); -EXPORT_SYMBOL (__down_interruptible); -EXPORT_SYMBOL (__down_trylock); -EXPORT_SYMBOL (__up); - /* * libgcc functions - functions that are used internally by the * compiler... (prototypes are not correct though, but that diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6c70fed0f9a..e4b38861ea5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -53,9 +53,6 @@ config STACKTRACE_SUPPORT config HAVE_LATENCYTOP_SUPPORT def_bool y -config SEMAPHORE_SLEEPERS - def_bool y - config FAST_CMPXCHG_LOCAL bool default y diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 061627806a2..deb43785e92 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,13 +1,8 @@ #include -#include #include #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a66e9c1a053..95a993e1816 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -12,11 +11,6 @@ EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); - EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 3899bd37fdf..648fe474178 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -30,89 +30,6 @@ * value or just clobbered.. 
*/ .section .sched.text, "ax" -ENTRY(__down_failed) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __down - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__down_failed) - -ENTRY(__down_failed_interruptible) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __down_interruptible - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__down_failed_interruptible) - -ENTRY(__down_failed_trylock) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __down_trylock - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__down_failed_trylock) - -ENTRY(__up_wakeup) - CFI_STARTPROC - FRAME - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx,0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx,0 - call __up - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE edx - ENDFRAME - ret - CFI_ENDPROC - ENDPROC(__up_wakeup) /* * rw spinlock fallbacks diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 8b92d428ab0..e009251d4e9 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -41,11 +41,6 @@ thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up - #ifdef CONFIG_TRACE_IRQFLAGS thunk trace_hardirqs_on_thunk,trace_hardirqs_on thunk trace_hardirqs_off_thunk,trace_hardirqs_off diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile index f582d6a24ec..7419dbccf02 100644 --- a/arch/xtensa/kernel/Makefile +++ b/arch/xtensa/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o vmlinux.lds -obj-y := align.o entry.o irq.o coprocessor.o process.o ptrace.o semaphore.o \ +obj-y := align.o entry.o irq.o coprocessor.o process.o ptrace.o \ setup.o signal.o syscall.o time.o traps.o vectors.o platform.o \ pci-dma.o init_task.o io.o diff --git a/arch/xtensa/kernel/semaphore.c b/arch/xtensa/kernel/semaphore.c deleted file mode 100644 index 995c6410ae1..00000000000 --- a/arch/xtensa/kernel/semaphore.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * arch/xtensa/kernel/semaphore.c - * - * Generic semaphore code. Buyer beware. Do your own specific changes - * in - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - * - * Joe Taylor - * Chris Zankel - * Marc Gauthier - * Kevin Chea - */ - -#include -#include -#include -#include -#include - -/* - * These two _must_ execute atomically wrt each other. 
- */ - -static __inline__ void wake_one_more(struct semaphore * sem) -{ - atomic_inc((atomic_t *)&sem->sleepers); -} - -static __inline__ int waking_non_zero(struct semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->sleepers > 0) { - sem->sleepers--; - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_interruptible: - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - * - * We must undo the sem->count down_interruptible() increment while we are - * protected by the spinlock in order to make atomic this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. -arca - */ - -static __inline__ int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->sleepers > 0) { - sem->sleepers--; - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_trylock: - * 1 failed to lock - * 0 got the lock - * - * We must undo the sem->count down_trylock() increment while we are - * protected by the spinlock in order to make atomic this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. -arca - */ - -static __inline__ int waking_non_zero_trylock(struct semaphore *sem) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->sleepers <= 0) - atomic_inc(&sem->count); - else { - sem->sleepers--; - ret = 0; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -DEFINE_SPINLOCK(semaphore_wake_lock); - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to sleep, while the "waking" variable is - * incremented when the "up()" code goes to wake up waiting - * processes. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * waking_non_zero() (from asm/semaphore.h) must execute - * atomically. - * - * When __up() is called, the count was negative before - * incrementing it, and we need to wake up somebody. - * - * This routine adds one to the count of processes that need to - * wake up and exit. ALL waiting processes actually wake up but - * only the one that gets to the "waking" field first will gate - * through and acquire the semaphore. The others will go back - * to sleep. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -void __up(struct semaphore *sem) -{ - wake_one_more(sem); - wake_up(&sem->wait); -} - -/* - * Perform the "down" function. Return zero for semaphore acquired, - * return negative for signalled out of the function. - * - * If called from __down, the return is ignored and the wait loop is - * not interruptible. This means that a task waiting on a semaphore - * using "down()" cannot be killed until someone does an "up()" on - * the semaphore. 
- * - * If called from __down_interruptible, the return value gets checked - * upon return. If the return value is negative then the task continues - * with the negative value in the return register (it can be tested by - * the caller). - * - * Either form may be used in conjunction with "up()". - * - */ - -#define DOWN_VAR \ - struct task_struct *tsk = current; \ - wait_queue_t wait; \ - init_waitqueue_entry(&wait, tsk); - -#define DOWN_HEAD(task_state) \ - \ - \ - tsk->state = (task_state); \ - add_wait_queue(&sem->wait, &wait); \ - \ - /* \ - * Ok, we're set up. sem->count is known to be less than zero \ - * so we must wait. \ - * \ - * We can let go the lock for purposes of waiting. \ - * We re-acquire it after awaking so as to protect \ - * all semaphore operations. \ - * \ - * If "up()" is called before we call waking_non_zero() then \ - * we will catch it right away. If it is called later then \ - * we will have to go through a wakeup cycle to catch it. \ - * \ - * Multiple waiters contend for the semaphore lock to see \ - * who gets to gate through and who has to wait some more. \ - */ \ - for (;;) { - -#define DOWN_TAIL(task_state) \ - tsk->state = (task_state); \ - } \ - tsk->state = TASK_RUNNING; \ - remove_wait_queue(&sem->wait, &wait); - -void __sched __down(struct semaphore * sem) -{ - DOWN_VAR - DOWN_HEAD(TASK_UNINTERRUPTIBLE) - if (waking_non_zero(sem)) - break; - schedule(); - DOWN_TAIL(TASK_UNINTERRUPTIBLE) -} - -int __sched __down_interruptible(struct semaphore * sem) -{ - int ret = 0; - DOWN_VAR - DOWN_HEAD(TASK_INTERRUPTIBLE) - - ret = waking_non_zero_interruptible(sem, tsk); - if (ret) - { - if (ret == 1) - /* ret != 0 only if we get interrupted -arca */ - ret = 0; - break; - } - schedule(); - DOWN_TAIL(TASK_INTERRUPTIBLE) - return ret; -} - -int __down_trylock(struct semaphore * sem) -{ - return waking_non_zero_trylock(sem); -} diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index 60dbdb43fb4..6e52cdd6166 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -26,7 +26,6 @@ #include #include #include -#include #ifdef CONFIG_BLK_DEV_FD #include #endif @@ -71,14 +70,6 @@ EXPORT_SYMBOL(__umodsi3); EXPORT_SYMBOL(__udivdi3); EXPORT_SYMBOL(__umoddi3); -/* - * Semaphore operations - */ -EXPORT_SYMBOL(__down); -EXPORT_SYMBOL(__down_interruptible); -EXPORT_SYMBOL(__down_trylock); -EXPORT_SYMBOL(__up); - #ifdef CONFIG_NET /* * Networking support diff --git a/include/asm-alpha/semaphore.h b/include/asm-alpha/semaphore.h index f1e9278a9fe..d9b2034ed1d 100644 --- a/include/asm-alpha/semaphore.h +++ b/include/asm-alpha/semaphore.h @@ -1,149 +1 @@ -#ifndef _ALPHA_SEMAPHORE_H -#define _ALPHA_SEMAPHORE_H - -/* - * SMP- and interrupt-safe semaphores.. - * - * (C) Copyright 1996 Linus Torvalds - * (C) Copyright 1996, 2000 Richard Henderson - */ - -#include -#include -#include -#include -#include -#include - -struct semaphore { - atomic_t count; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init(struct semaphore *sem, int val) -{ - /* - * Logically, - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); - * except that gcc produces better initializing by parts yet. 
- */ - - atomic_set(&sem->count, val); - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void down(struct semaphore *); -extern void __down_failed(struct semaphore *); -extern int down_interruptible(struct semaphore *); -extern int __down_failed_interruptible(struct semaphore *); -extern int down_trylock(struct semaphore *); -extern void up(struct semaphore *); -extern void __up_wakeup(struct semaphore *); - -/* - * Hidden out of line code is fun, but extremely messy. Rely on newer - * compilers to do a respectable job with this. The contention cases - * are handled out of line in arch/alpha/kernel/semaphore.c. - */ - -static inline void __down(struct semaphore *sem) -{ - long count; - might_sleep(); - count = atomic_dec_return(&sem->count); - if (unlikely(count < 0)) - __down_failed(sem); -} - -static inline int __down_interruptible(struct semaphore *sem) -{ - long count; - might_sleep(); - count = atomic_dec_return(&sem->count); - if (unlikely(count < 0)) - return __down_failed_interruptible(sem); - return 0; -} - -/* - * down_trylock returns 0 on success, 1 if we failed to get the lock. - */ - -static inline int __down_trylock(struct semaphore *sem) -{ - long ret; - - /* "Equivalent" C: - - do { - ret = ldl_l; - --ret; - if (ret < 0) - break; - ret = stl_c = ret; - } while (ret == 0); - */ - __asm__ __volatile__( - "1: ldl_l %0,%1\n" - " subl %0,1,%0\n" - " blt %0,2f\n" - " stl_c %0,%1\n" - " beq %0,3f\n" - " mb\n" - "2:\n" - ".subsection 2\n" - "3: br 1b\n" - ".previous" - : "=&r" (ret), "=m" (sem->count) - : "m" (sem->count)); - - return ret < 0; -} - -static inline void __up(struct semaphore *sem) -{ - if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up_wakeup(sem); -} - -#if !defined(CONFIG_DEBUG_SEMAPHORE) -extern inline void down(struct semaphore *sem) -{ - __down(sem); -} -extern inline int down_interruptible(struct semaphore *sem) -{ - return __down_interruptible(sem); -} -extern inline int down_trylock(struct semaphore *sem) -{ - return __down_trylock(sem); -} -extern inline void up(struct semaphore *sem) -{ - __up(sem); -} -#endif - -#endif +#include diff --git a/include/asm-arm/semaphore-helper.h b/include/asm-arm/semaphore-helper.h deleted file mode 100644 index 1d7f1987edb..00000000000 --- a/include/asm-arm/semaphore-helper.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef ASMARM_SEMAPHORE_HELPER_H -#define ASMARM_SEMAPHORE_HELPER_H - -/* - * These two _must_ execute atomically wrt each other. - */ -static inline void wake_one_more(struct semaphore * sem) -{ - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (atomic_read(&sem->count) <= 0) - sem->waking++; - spin_unlock_irqrestore(&semaphore_wake_lock, flags); -} - -static inline int waking_non_zero(struct semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->waking > 0) { - sem->waking--; - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking non zero interruptible - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - * - * We must undo the sem->count down_interruptible() increment while we are - * protected by the spinlock in order to make this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. 
-arca - */ -static inline int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->waking > 0) { - sem->waking--; - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_try_lock: - * 1 failed to lock - * 0 got the lock - * - * We must undo the sem->count down_interruptible() increment while we are - * protected by the spinlock in order to make this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. -arca - */ -static inline int waking_non_zero_trylock(struct semaphore *sem) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->waking <= 0) - atomic_inc(&sem->count); - else { - sem->waking--; - ret = 0; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -#endif diff --git a/include/asm-arm/semaphore.h b/include/asm-arm/semaphore.h index 1c8b441f89e..d9b2034ed1d 100644 --- a/include/asm-arm/semaphore.h +++ b/include/asm-arm/semaphore.h @@ -1,98 +1 @@ -/* - * linux/include/asm-arm/semaphore.h - */ -#ifndef __ASM_ARM_SEMAPHORE_H -#define __ASM_ARM_SEMAPHORE_H - -#include -#include -#include -#include - -#include -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INIT(name, cnt) \ -{ \ - .count = ATOMIC_INIT(cnt), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait), \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INIT(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init(struct semaphore *sem, int val) -{ - atomic_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX(struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED(struct semaphore *sem) -{ - sema_init(sem, 0); -} - -/* - * special register calling convention - */ -asmlinkage void __down_failed(void); -asmlinkage int __down_interruptible_failed(void); -asmlinkage int __down_trylock_failed(void); -asmlinkage void __up_wakeup(void); - -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern int __down_trylock(struct semaphore * sem); -extern void __up(struct semaphore * sem); - -/* - * This is ugly, but we want the default case to fall through. - * "__down" is the actual routine that waits... - */ -static inline void down(struct semaphore * sem) -{ - might_sleep(); - __down_op(sem, __down_failed); -} - -/* - * This is ugly, but we want the default case to fall through. - * "__down_interruptible" is the actual routine that waits... - */ -static inline int down_interruptible (struct semaphore * sem) -{ - might_sleep(); - return __down_op_ret(sem, __down_interruptible_failed); -} - -static inline int down_trylock(struct semaphore *sem) -{ - return __down_op_ret(sem, __down_trylock_failed); -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). 
- */ -static inline void up(struct semaphore * sem) -{ - __up_op(sem, __up_wakeup); -} - -#endif +#include diff --git a/include/asm-avr32/semaphore.h b/include/asm-avr32/semaphore.h index feaf1d45338..d9b2034ed1d 100644 --- a/include/asm-avr32/semaphore.h +++ b/include/asm-avr32/semaphore.h @@ -1,108 +1 @@ -/* - * SMP- and interrupt-safe semaphores. - * - * Copyright (C) 2006 Atmel Corporation - * - * Based on include/asm-i386/semaphore.h - * Copyright (C) 1996 Linus Torvalds - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef __ASM_AVR32_SEMAPHORE_H -#define __ASM_AVR32_SEMAPHORE_H - -#include - -#include -#include -#include -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - atomic_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -void __down(struct semaphore * sem); -int __down_interruptible(struct semaphore * sem); -void __up(struct semaphore * sem); - -/* - * This is ugly, but we want the default case to fall through. - * "__down_failed" is a special asm handler that calls the C - * routine that actually waits. See arch/i386/kernel/semaphore.c - */ -static inline void down(struct semaphore * sem) -{ - might_sleep(); - if (unlikely(atomic_dec_return (&sem->count) < 0)) - __down (sem); -} - -/* - * Interruptible try to acquire a semaphore. If we obtained - * it, return zero. If we were interrupted, returns -EINTR - */ -static inline int down_interruptible(struct semaphore * sem) -{ - int ret = 0; - - might_sleep(); - if (unlikely(atomic_dec_return (&sem->count) < 0)) - ret = __down_interruptible (sem); - return ret; -} - -/* - * Non-blockingly attempt to down() a semaphore. - * Returns zero if we acquired it - */ -static inline int down_trylock(struct semaphore * sem) -{ - return atomic_dec_if_positive(&sem->count) < 0; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). - */ -static inline void up(struct semaphore * sem) -{ - if (unlikely(atomic_inc_return (&sem->count) <= 0)) - __up (sem); -} - -#endif /*__ASM_AVR32_SEMAPHORE_H */ +#include diff --git a/include/asm-blackfin/semaphore-helper.h b/include/asm-blackfin/semaphore-helper.h deleted file mode 100644 index 9082b0dc3eb..00000000000 --- a/include/asm-blackfin/semaphore-helper.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Based on M68K version, Lineo Inc. May 2001 */ - -#ifndef _BFIN_SEMAPHORE_HELPER_H -#define _BFIN_SEMAPHORE_HELPER_H - -/* - * SMP- and interrupt-safe semaphores helper functions. - * - * (C) Copyright 1996 Linus Torvalds - * - */ - -#include - -/* - * These two _must_ execute atomically wrt each other. 
- */ -static inline void wake_one_more(struct semaphore *sem) -{ - atomic_inc(&sem->waking); -} - -static inline int waking_non_zero(struct semaphore *sem) -{ - int ret; - unsigned long flags = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 0; - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_interruptible: - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - */ -static inline int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - int ret = 0; - unsigned long flags = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_trylock: - * 1 failed to lock - * 0 got the lock - */ -static inline int waking_non_zero_trylock(struct semaphore *sem) -{ - int ret = 1; - unsigned long flags = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 0; - } else - atomic_inc(&sem->count); - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -#endif /* _BFIN_SEMAPHORE_HELPER_H */ diff --git a/include/asm-blackfin/semaphore.h b/include/asm-blackfin/semaphore.h index 533f90fb2e4..d9b2034ed1d 100644 --- a/include/asm-blackfin/semaphore.h +++ b/include/asm-blackfin/semaphore.h @@ -1,105 +1 @@ -#ifndef _BFIN_SEMAPHORE_H -#define _BFIN_SEMAPHORE_H - -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include -#include - -/* - * Interrupt-safe semaphores.. - * - * (C) Copyright 1996 Linus Torvalds - * - * BFIN version by akbar hussain Lineo Inc April 2001 - * - */ - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init(struct semaphore *sem, int val) -{ - *sem = (struct semaphore)__SEMAPHORE_INITIALIZER(*sem, val); -} - -static inline void init_MUTEX(struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED(struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down(struct semaphore *sem); -asmlinkage int __down_interruptible(struct semaphore *sem); -asmlinkage int __down_trylock(struct semaphore *sem); -asmlinkage void __up(struct semaphore *sem); - -extern spinlock_t semaphore_wake_lock; - -/* - * This is ugly, but we want the default case to fall through. - * "down_failed" is a special asm handler that calls the C - * routine that actually waits. 
- */ -static inline void down(struct semaphore *sem) -{ - might_sleep(); - if (atomic_dec_return(&sem->count) < 0) - __down(sem); -} - -static inline int down_interruptible(struct semaphore *sem) -{ - int ret = 0; - - might_sleep(); - if (atomic_dec_return(&sem->count) < 0) - ret = __down_interruptible(sem); - return (ret); -} - -static inline int down_trylock(struct semaphore *sem) -{ - int ret = 0; - - if (atomic_dec_return(&sem->count) < 0) - ret = __down_trylock(sem); - return ret; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). - */ -static inline void up(struct semaphore *sem) -{ - if (atomic_inc_return(&sem->count) <= 0) - __up(sem); -} - -#endif /* __ASSEMBLY__ */ -#endif /* _BFIN_SEMAPHORE_H */ +#include diff --git a/include/asm-cris/semaphore-helper.h b/include/asm-cris/semaphore-helper.h deleted file mode 100644 index 27bfeca1b98..00000000000 --- a/include/asm-cris/semaphore-helper.h +++ /dev/null @@ -1,78 +0,0 @@ -/* $Id: semaphore-helper.h,v 1.3 2001/03/26 15:00:33 orjanf Exp $ - * - * SMP- and interrupt-safe semaphores helper functions. Generic versions, no - * optimizations whatsoever... - * - */ - -#ifndef _ASM_SEMAPHORE_HELPER_H -#define _ASM_SEMAPHORE_HELPER_H - -#include -#include - -#define read(a) ((a)->counter) -#define inc(a) (((a)->counter)++) -#define dec(a) (((a)->counter)--) - -#define count_inc(a) ((*(a))++) - -/* - * These two _must_ execute atomically wrt each other. - */ -static inline void wake_one_more(struct semaphore * sem) -{ - atomic_inc(&sem->waking); -} - -static inline int waking_non_zero(struct semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - local_irq_save(flags); - if (read(&sem->waking) > 0) { - dec(&sem->waking); - ret = 1; - } - local_irq_restore(flags); - return ret; -} - -static inline int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - int ret = 0; - unsigned long flags; - - local_irq_save(flags); - if (read(&sem->waking) > 0) { - dec(&sem->waking); - ret = 1; - } else if (signal_pending(tsk)) { - inc(&sem->count); - ret = -EINTR; - } - local_irq_restore(flags); - return ret; -} - -static inline int waking_non_zero_trylock(struct semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - local_irq_save(flags); - if (read(&sem->waking) <= 0) - inc(&sem->count); - else { - dec(&sem->waking); - ret = 0; - } - local_irq_restore(flags); - return ret; -} - -#endif /* _ASM_SEMAPHORE_HELPER_H */ - - diff --git a/include/asm-cris/semaphore.h b/include/asm-cris/semaphore.h index 31a4ac44819..d9b2034ed1d 100644 --- a/include/asm-cris/semaphore.h +++ b/include/asm-cris/semaphore.h @@ -1,133 +1 @@ -/* $Id: semaphore.h,v 1.3 2001/05/08 13:54:09 bjornw Exp $ */ - -/* On the i386 these are coded in asm, perhaps we should as well. Later.. */ - -#ifndef _CRIS_SEMAPHORE_H -#define _CRIS_SEMAPHORE_H - -#define RW_LOCK_BIAS 0x01000000 - -#include -#include -#include - -#include -#include - -/* - * CRIS semaphores, implemented in C-only so far. 
- */ - -struct semaphore { - atomic_t count; - atomic_t waking; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .waking = ATOMIC_INIT(0), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init(struct semaphore *sem, int val) -{ - *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern int __down_trylock(struct semaphore * sem); -extern void __up(struct semaphore * sem); - -/* notice - we probably can do cli/sti here instead of saving */ - -static inline void down(struct semaphore * sem) -{ - unsigned long flags; - int failed; - - might_sleep(); - - /* atomically decrement the semaphores count, and if its negative, we wait */ - cris_atomic_save(sem, flags); - failed = --(sem->count.counter) < 0; - cris_atomic_restore(sem, flags); - if(failed) { - __down(sem); - } -} - -/* - * This version waits in interruptible state so that the waiting - * process can be killed. The down_interruptible routine - * returns negative for signalled and zero for semaphore acquired. - */ - -static inline int down_interruptible(struct semaphore * sem) -{ - unsigned long flags; - int failed; - - might_sleep(); - - /* atomically decrement the semaphores count, and if its negative, we wait */ - cris_atomic_save(sem, flags); - failed = --(sem->count.counter) < 0; - cris_atomic_restore(sem, flags); - if(failed) - failed = __down_interruptible(sem); - return(failed); -} - -static inline int down_trylock(struct semaphore * sem) -{ - unsigned long flags; - int failed; - - cris_atomic_save(sem, flags); - failed = --(sem->count.counter) < 0; - cris_atomic_restore(sem, flags); - if(failed) - failed = __down_trylock(sem); - return(failed); - -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). - */ -static inline void up(struct semaphore * sem) -{ - unsigned long flags; - int wakeup; - - /* atomically increment the semaphores count, and if it was negative, we wake people */ - cris_atomic_save(sem, flags); - wakeup = ++(sem->count.counter) <= 0; - cris_atomic_restore(sem, flags); - if(wakeup) { - __up(sem); - } -} - -#endif +#include diff --git a/include/asm-frv/semaphore.h b/include/asm-frv/semaphore.h index d7aaa1911a1..d9b2034ed1d 100644 --- a/include/asm-frv/semaphore.h +++ b/include/asm-frv/semaphore.h @@ -1,155 +1 @@ -/* semaphore.h: semaphores for the FR-V - * - * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ -#ifndef _ASM_SEMAPHORE_H -#define _ASM_SEMAPHORE_H - -#define RW_LOCK_BIAS 0x01000000 - -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include - -/* - * the semaphore definition - * - if counter is >0 then there are tokens available on the semaphore for down to collect - * - if counter is <=0 then there are no spare tokens, and anyone that wants one must wait - * - if wait_list is not empty, then there are processes waiting for the semaphore - */ -struct semaphore { - unsigned counter; - spinlock_t wait_lock; - struct list_head wait_list; -#ifdef CONFIG_DEBUG_SEMAPHORE - unsigned __magic; -#endif -}; - -#ifdef CONFIG_DEBUG_SEMAPHORE -# define __SEM_DEBUG_INIT(name) , (long)&(name).__magic -#else -# define __SEM_DEBUG_INIT(name) -#endif - - -#define __SEMAPHORE_INITIALIZER(name,count) \ -{ count, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __SEM_DEBUG_INIT(name) } - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void __down(struct semaphore *sem, unsigned long flags); -extern int __down_interruptible(struct semaphore *sem, unsigned long flags); -extern void __up(struct semaphore *sem); - -static inline void down(struct semaphore *sem) -{ - unsigned long flags; - -#ifdef CONFIG_DEBUG_SEMAPHORE - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - if (likely(sem->counter > 0)) { - sem->counter--; - spin_unlock_irqrestore(&sem->wait_lock, flags); - } - else { - __down(sem, flags); - } -} - -static inline int down_interruptible(struct semaphore *sem) -{ - unsigned long flags; - int ret = 0; - -#ifdef CONFIG_DEBUG_SEMAPHORE - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - if (likely(sem->counter > 0)) { - sem->counter--; - spin_unlock_irqrestore(&sem->wait_lock, flags); - } - else { - ret = __down_interruptible(sem, flags); - } - return ret; -} - -/* - * non-blockingly attempt to down() a semaphore. 
- * - returns zero if we acquired it - */ -static inline int down_trylock(struct semaphore *sem) -{ - unsigned long flags; - int success = 0; - -#ifdef CONFIG_DEBUG_SEMAPHORE - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->counter > 0) { - sem->counter--; - success = 1; - } - spin_unlock_irqrestore(&sem->wait_lock, flags); - return !success; -} - -static inline void up(struct semaphore *sem) -{ - unsigned long flags; - -#ifdef CONFIG_DEBUG_SEMAPHORE - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) - __up(sem); - else - sem->counter++; - spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -static inline int sem_getcount(struct semaphore *sem) -{ - return sem->counter; -} - -#endif /* __ASSEMBLY__ */ - -#endif +#include diff --git a/include/asm-h8300/semaphore-helper.h b/include/asm-h8300/semaphore-helper.h deleted file mode 100644 index 4fea36be5fd..00000000000 --- a/include/asm-h8300/semaphore-helper.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef _H8300_SEMAPHORE_HELPER_H -#define _H8300_SEMAPHORE_HELPER_H - -/* - * SMP- and interrupt-safe semaphores helper functions. - * - * (C) Copyright 1996 Linus Torvalds - * - * based on - * m68k version by Andreas Schwab - */ - -#include - -/* - * These two _must_ execute atomically wrt each other. - */ -static inline void wake_one_more(struct semaphore * sem) -{ - atomic_inc((atomic_t *)&sem->sleepers); -} - -static inline int waking_non_zero(struct semaphore *sem) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 0; - if (sem->sleepers > 0) { - sem->sleepers--; - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_interruptible: - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - */ -static inline int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 0; - if (sem->sleepers > 0) { - sem->sleepers--; - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_trylock: - * 1 failed to lock - * 0 got the lock - */ -static inline int waking_non_zero_trylock(struct semaphore *sem) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 1; - if (sem->sleepers <= 0) - atomic_inc(&sem->count); - else { - sem->sleepers--; - ret = 0; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -#endif diff --git a/include/asm-h8300/semaphore.h b/include/asm-h8300/semaphore.h index f3ffff83ff0..d9b2034ed1d 100644 --- a/include/asm-h8300/semaphore.h +++ b/include/asm-h8300/semaphore.h @@ -1,190 +1 @@ -#ifndef _H8300_SEMAPHORE_H -#define _H8300_SEMAPHORE_H - -#define RW_LOCK_BIAS 0x01000000 - -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include - -#include -#include - -/* - * Interrupt-safe semaphores.. 
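
/*
 * How the "waking" token published by wake_one_more() was consumed: a
 * hedged sketch of the arch __down() slow path, modelled on the 2.6-era
 * per-arch semaphore.c loops (each port differed in detail, so treat this
 * as an outline rather than the exact removed code):
 */
static void sketch__down(struct semaphore *sem)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&sem->wait, &wait);
	for (;;) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		if (waking_non_zero(sem))	/* atomically claim a wake-up token */
			break;
		schedule();	/* __up() does wake_one_more() then wakes the queue */
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&sem->wait, &wait);
}
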
- * - * (C) Copyright 1996 Linus Torvalds - * - * H8/300 version by Yoshinori Sato - */ - - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - *sem = (struct semaphore)__SEMAPHORE_INITIALIZER(*sem, val); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -extern spinlock_t semaphore_wake_lock; - -/* - * This is ugly, but we want the default case to fall through. - * "down_failed" is a special asm handler that calls the C - * routine that actually waits. See arch/m68k/lib/semaphore.S - */ -static inline void down(struct semaphore * sem) -{ - register atomic_t *count asm("er0"); - - might_sleep(); - - count = &(sem->count); - __asm__ __volatile__( - "stc ccr,r3l\n\t" - "orc #0x80,ccr\n\t" - "mov.l %2, er1\n\t" - "dec.l #1,er1\n\t" - "mov.l er1,%0\n\t" - "bpl 1f\n\t" - "ldc r3l,ccr\n\t" - "mov.l %1,er0\n\t" - "jsr @___down\n\t" - "bra 2f\n" - "1:\n\t" - "ldc r3l,ccr\n" - "2:" - : "=m"(*count) - : "g"(sem),"m"(*count) - : "cc", "er1", "er2", "er3"); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - register atomic_t *count asm("er0"); - - might_sleep(); - - count = &(sem->count); - __asm__ __volatile__( - "stc ccr,r1l\n\t" - "orc #0x80,ccr\n\t" - "mov.l %3, er2\n\t" - "dec.l #1,er2\n\t" - "mov.l er2,%1\n\t" - "bpl 1f\n\t" - "ldc r1l,ccr\n\t" - "mov.l %2,er0\n\t" - "jsr @___down_interruptible\n\t" - "bra 2f\n" - "1:\n\t" - "ldc r1l,ccr\n\t" - "sub.l %0,%0\n\t" - "2:\n\t" - : "=r" (count),"=m" (*count) - : "g"(sem),"m"(*count) - : "cc", "er1", "er2", "er3"); - return (int)count; -} - -static inline int down_trylock(struct semaphore * sem) -{ - register atomic_t *count asm("er0"); - - count = &(sem->count); - __asm__ __volatile__( - "stc ccr,r3l\n\t" - "orc #0x80,ccr\n\t" - "mov.l %3,er2\n\t" - "dec.l #1,er2\n\t" - "mov.l er2,%0\n\t" - "bpl 1f\n\t" - "ldc r3l,ccr\n\t" - "jmp @3f\n\t" - LOCK_SECTION_START(".align 2\n\t") - "3:\n\t" - "mov.l %2,er0\n\t" - "jsr @___down_trylock\n\t" - "jmp @2f\n\t" - LOCK_SECTION_END - "1:\n\t" - "ldc r3l,ccr\n\t" - "sub.l %1,%1\n" - "2:" - : "=m" (*count),"=r"(count) - : "g"(sem),"m"(*count) - : "cc", "er1","er2", "er3"); - return (int)count; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). 
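
/*
 * The "no jumps" remark refers to keeping the uncontended path straight-line:
 * the slow-path call sits in an out-of-line section, so the common case falls
 * through.  Ports written in C express the same expectation with a branch
 * hint; a sketch in C11, with contended_down() a hypothetical stand-in for
 * the slow-path handler:
 */
#include <stdatomic.h>

void contended_down(atomic_int *count);	/* out-of-line contended path */

static inline void hinted_down(atomic_int *count)
{
	/* the hint tells the compiler the contended branch is rarely taken */
	if (__builtin_expect(atomic_fetch_sub(count, 1) - 1 < 0, 0))
		contended_down(count);
}
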
- */ -static inline void up(struct semaphore * sem) -{ - register atomic_t *count asm("er0"); - - count = &(sem->count); - __asm__ __volatile__( - "stc ccr,r3l\n\t" - "orc #0x80,ccr\n\t" - "mov.l %2,er1\n\t" - "inc.l #1,er1\n\t" - "mov.l er1,%0\n\t" - "ldc r3l,ccr\n\t" - "sub.l er2,er2\n\t" - "cmp.l er2,er1\n\t" - "bgt 1f\n\t" - "mov.l %1,er0\n\t" - "jsr @___up\n" - "1:" - : "=m"(*count) - : "g"(sem),"m"(*count) - : "cc", "er1", "er2", "er3"); -} - -#endif /* __ASSEMBLY__ */ - -#endif +#include diff --git a/include/asm-ia64/semaphore.h b/include/asm-ia64/semaphore.h index d8393d11288..d9b2034ed1d 100644 --- a/include/asm-ia64/semaphore.h +++ b/include/asm-ia64/semaphore.h @@ -1,99 +1 @@ -#ifndef _ASM_IA64_SEMAPHORE_H -#define _ASM_IA64_SEMAPHORE_H - -/* - * Copyright (C) 1998-2000 Hewlett-Packard Co - * Copyright (C) 1998-2000 David Mosberger-Tang - */ - -#include -#include - -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) - -static inline void -sema_init (struct semaphore *sem, int val) -{ - *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); -} - -static inline void -init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void -init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void __down (struct semaphore * sem); -extern int __down_interruptible (struct semaphore * sem); -extern int __down_trylock (struct semaphore * sem); -extern void __up (struct semaphore * sem); - -/* - * Atomically decrement the semaphore's count. If it goes negative, - * block the calling thread in the TASK_UNINTERRUPTIBLE state. - */ -static inline void -down (struct semaphore *sem) -{ - might_sleep(); - if (ia64_fetchadd(-1, &sem->count.counter, acq) < 1) - __down(sem); -} - -/* - * Atomically decrement the semaphore's count. If it goes negative, - * block the calling thread in the TASK_INTERRUPTIBLE state. - */ -static inline int -down_interruptible (struct semaphore * sem) -{ - int ret = 0; - - might_sleep(); - if (ia64_fetchadd(-1, &sem->count.counter, acq) < 1) - ret = __down_interruptible(sem); - return ret; -} - -static inline int -down_trylock (struct semaphore *sem) -{ - int ret = 0; - - if (ia64_fetchadd(-1, &sem->count.counter, acq) < 1) - ret = __down_trylock(sem); - return ret; -} - -static inline void -up (struct semaphore * sem) -{ - if (ia64_fetchadd(1, &sem->count.counter, rel) <= -1) - __up(sem); -} - -#endif /* _ASM_IA64_SEMAPHORE_H */ +#include diff --git a/include/asm-m32r/semaphore.h b/include/asm-m32r/semaphore.h index b5bf95a6f2b..d9b2034ed1d 100644 --- a/include/asm-m32r/semaphore.h +++ b/include/asm-m32r/semaphore.h @@ -1,144 +1 @@ -#ifndef _ASM_M32R_SEMAPHORE_H -#define _ASM_M32R_SEMAPHORE_H - -#include - -#ifdef __KERNEL__ - -/* - * SMP- and interrupt-safe semaphores.. 
- * - * Copyright (C) 1996 Linus Torvalds - * Copyright (C) 2004, 2006 Hirokazu Takata - */ - -#include -#include -#include -#include -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ -/* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); - * - * i'd rather use the more flexible initialization above, but sadly - * GCC 2.7.2.3 emits a bogus warning. EGCS doesnt. Oh well. - */ - atomic_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -/* - * Atomically decrement the semaphore's count. If it goes negative, - * block the calling thread in the TASK_UNINTERRUPTIBLE state. - */ -static inline void down(struct semaphore * sem) -{ - might_sleep(); - if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); -} - -/* - * Interruptible try to acquire a semaphore. If we obtained - * it, return zero. If we were interrupted, returns -EINTR - */ -static inline int down_interruptible(struct semaphore * sem) -{ - int result = 0; - - might_sleep(); - if (unlikely(atomic_dec_return(&sem->count) < 0)) - result = __down_interruptible(sem); - - return result; -} - -/* - * Non-blockingly attempt to down() a semaphore. - * Returns zero if we acquired it - */ -static inline int down_trylock(struct semaphore * sem) -{ - unsigned long flags; - long count; - int result = 0; - - local_irq_save(flags); - __asm__ __volatile__ ( - "# down_trylock \n\t" - DCACHE_CLEAR("%0", "r4", "%1") - M32R_LOCK" %0, @%1; \n\t" - "addi %0, #-1; \n\t" - M32R_UNLOCK" %0, @%1; \n\t" - : "=&r" (count) - : "r" (&sem->count) - : "memory" -#ifdef CONFIG_CHIP_M32700_TS1 - , "r4" -#endif /* CONFIG_CHIP_M32700_TS1 */ - ); - local_irq_restore(flags); - - if (unlikely(count < 0)) - result = __down_trylock(sem); - - return result; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). 
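
/*
 * init_MUTEX_LOCKED() starts the count at 0, which lets a semaphore act as a
 * one-shot rendezvous: the waiter blocks in down() until another context
 * calls up().  Sketch only; start_worker() and the function names are
 * illustrative, the semaphore calls are the ones declared above:
 */
static struct semaphore work_done;

void waiter_side(void)
{
	init_MUTEX_LOCKED(&work_done);
	start_worker();		/* hypothetical: kicks off the other context */
	down(&work_done);	/* sleeps until worker_side() calls up() */
}

void worker_side(void)
{
	/* ... perform the work ... */
	up(&work_done);		/* releases the waiter */
}
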
- */ -static inline void up(struct semaphore * sem) -{ - if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); -} - -#endif /* __KERNEL__ */ - -#endif /* _ASM_M32R_SEMAPHORE_H */ +#include diff --git a/include/asm-m68k/semaphore-helper.h b/include/asm-m68k/semaphore-helper.h deleted file mode 100644 index eef30ba0b49..00000000000 --- a/include/asm-m68k/semaphore-helper.h +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef _M68K_SEMAPHORE_HELPER_H -#define _M68K_SEMAPHORE_HELPER_H - -/* - * SMP- and interrupt-safe semaphores helper functions. - * - * (C) Copyright 1996 Linus Torvalds - * - * m68k version by Andreas Schwab - */ - -#include - -/* - * These two _must_ execute atomically wrt each other. - */ -static inline void wake_one_more(struct semaphore * sem) -{ - atomic_inc(&sem->waking); -} - -#ifndef CONFIG_RMW_INSNS -extern spinlock_t semaphore_wake_lock; -#endif - -static inline int waking_non_zero(struct semaphore *sem) -{ - int ret; -#ifndef CONFIG_RMW_INSNS - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 0; - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); -#else - int tmp1, tmp2; - - __asm__ __volatile__ - ("1: movel %1,%2\n" - " jle 2f\n" - " subql #1,%2\n" - " casl %1,%2,%3\n" - " jne 1b\n" - " moveq #1,%0\n" - "2:" - : "=d" (ret), "=d" (tmp1), "=d" (tmp2) - : "m" (sem->waking), "0" (0), "1" (sem->waking)); -#endif - - return ret; -} - -/* - * waking_non_zero_interruptible: - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - */ -static inline int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - int ret; -#ifndef CONFIG_RMW_INSNS - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 0; - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); -#else - int tmp1, tmp2; - - __asm__ __volatile__ - ("1: movel %1,%2\n" - " jle 2f\n" - " subql #1,%2\n" - " casl %1,%2,%3\n" - " jne 1b\n" - " moveq #1,%0\n" - " jra %a4\n" - "2:" - : "=d" (ret), "=d" (tmp1), "=d" (tmp2) - : "m" (sem->waking), "i" (&&next), "0" (0), "1" (sem->waking)); - if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } -next: -#endif - - return ret; -} - -/* - * waking_non_zero_trylock: - * 1 failed to lock - * 0 got the lock - */ -static inline int waking_non_zero_trylock(struct semaphore *sem) -{ - int ret; -#ifndef CONFIG_RMW_INSNS - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 1; - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 0; - } else - atomic_inc(&sem->count); - spin_unlock_irqrestore(&semaphore_wake_lock, flags); -#else - int tmp1, tmp2; - - __asm__ __volatile__ - ("1: movel %1,%2\n" - " jle 2f\n" - " subql #1,%2\n" - " casl %1,%2,%3\n" - " jne 1b\n" - " moveq #0,%0\n" - "2:" - : "=d" (ret), "=d" (tmp1), "=d" (tmp2) - : "m" (sem->waking), "0" (1), "1" (sem->waking)); - if (ret) - atomic_inc(&sem->count); -#endif - return ret; -} - -#endif diff --git a/include/asm-m68k/semaphore.h b/include/asm-m68k/semaphore.h index 64d6b119bb0..d9b2034ed1d 100644 --- a/include/asm-m68k/semaphore.h +++ b/include/asm-m68k/semaphore.h @@ -1,163 +1 @@ -#ifndef _M68K_SEMAPHORE_H -#define _M68K_SEMAPHORE_H - -#define RW_LOCK_BIAS 0x01000000 - -#ifndef __ASSEMBLY__ - -#include -#include -#include 
-#include -#include - -#include -#include - -/* - * Interrupt-safe semaphores.. - * - * (C) Copyright 1996 Linus Torvalds - * - * m68k version by Andreas Schwab - */ - - -struct semaphore { - atomic_t count; - atomic_t waking; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .waking = ATOMIC_INIT(0), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init(struct semaphore *sem, int val) -{ - *sem = (struct semaphore)__SEMAPHORE_INITIALIZER(*sem, val); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -/* - * This is ugly, but we want the default case to fall through. - * "down_failed" is a special asm handler that calls the C - * routine that actually waits. See arch/m68k/lib/semaphore.S - */ -static inline void down(struct semaphore *sem) -{ - register struct semaphore *sem1 __asm__ ("%a1") = sem; - - might_sleep(); - __asm__ __volatile__( - "| atomic down operation\n\t" - "subql #1,%0@\n\t" - "jmi 2f\n\t" - "1:\n" - LOCK_SECTION_START(".even\n\t") - "2:\tpea 1b\n\t" - "jbra __down_failed\n" - LOCK_SECTION_END - : /* no outputs */ - : "a" (sem1) - : "memory"); -} - -static inline int down_interruptible(struct semaphore *sem) -{ - register struct semaphore *sem1 __asm__ ("%a1") = sem; - register int result __asm__ ("%d0"); - - might_sleep(); - __asm__ __volatile__( - "| atomic interruptible down operation\n\t" - "subql #1,%1@\n\t" - "jmi 2f\n\t" - "clrl %0\n" - "1:\n" - LOCK_SECTION_START(".even\n\t") - "2:\tpea 1b\n\t" - "jbra __down_failed_interruptible\n" - LOCK_SECTION_END - : "=d" (result) - : "a" (sem1) - : "memory"); - return result; -} - -static inline int down_trylock(struct semaphore *sem) -{ - register struct semaphore *sem1 __asm__ ("%a1") = sem; - register int result __asm__ ("%d0"); - - __asm__ __volatile__( - "| atomic down trylock operation\n\t" - "subql #1,%1@\n\t" - "jmi 2f\n\t" - "clrl %0\n" - "1:\n" - LOCK_SECTION_START(".even\n\t") - "2:\tpea 1b\n\t" - "jbra __down_failed_trylock\n" - LOCK_SECTION_END - : "=d" (result) - : "a" (sem1) - : "memory"); - return result; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). 
- */ -static inline void up(struct semaphore *sem) -{ - register struct semaphore *sem1 __asm__ ("%a1") = sem; - - __asm__ __volatile__( - "| atomic up operation\n\t" - "addql #1,%0@\n\t" - "jle 2f\n" - "1:\n" - LOCK_SECTION_START(".even\n\t") - "2:\t" - "pea 1b\n\t" - "jbra __up_wakeup\n" - LOCK_SECTION_END - : /* no outputs */ - : "a" (sem1) - : "memory"); -} - -#endif /* __ASSEMBLY__ */ - -#endif +#include diff --git a/include/asm-m68knommu/semaphore-helper.h b/include/asm-m68knommu/semaphore-helper.h deleted file mode 100644 index 43da7bc483c..00000000000 --- a/include/asm-m68knommu/semaphore-helper.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef _M68K_SEMAPHORE_HELPER_H -#define _M68K_SEMAPHORE_HELPER_H - -/* - * SMP- and interrupt-safe semaphores helper functions. - * - * (C) Copyright 1996 Linus Torvalds - * - * m68k version by Andreas Schwab - */ - - -/* - * These two _must_ execute atomically wrt each other. - */ -static inline void wake_one_more(struct semaphore * sem) -{ - atomic_inc(&sem->waking); -} - -static inline int waking_non_zero(struct semaphore *sem) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 0; - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_interruptible: - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - */ -static inline int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 0; - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_trylock: - * 1 failed to lock - * 0 got the lock - */ -static inline int waking_non_zero_trylock(struct semaphore *sem) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - ret = 1; - if (atomic_read(&sem->waking) > 0) { - atomic_dec(&sem->waking); - ret = 0; - } else - atomic_inc(&sem->count); - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -#endif diff --git a/include/asm-m68knommu/semaphore.h b/include/asm-m68knommu/semaphore.h index 5779eb6c068..d9b2034ed1d 100644 --- a/include/asm-m68knommu/semaphore.h +++ b/include/asm-m68knommu/semaphore.h @@ -1,153 +1 @@ -#ifndef _M68K_SEMAPHORE_H -#define _M68K_SEMAPHORE_H - -#define RW_LOCK_BIAS 0x01000000 - -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include - -#include -#include - -/* - * Interrupt-safe semaphores.. 
- * - * (C) Copyright 1996 Linus Torvalds - * - * m68k version by Andreas Schwab - */ - - -struct semaphore { - atomic_t count; - atomic_t waking; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .waking = ATOMIC_INIT(0), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - *sem = (struct semaphore)__SEMAPHORE_INITIALIZER(*sem, val); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -extern spinlock_t semaphore_wake_lock; - -/* - * This is ugly, but we want the default case to fall through. - * "down_failed" is a special asm handler that calls the C - * routine that actually waits. See arch/m68k/lib/semaphore.S - */ -static inline void down(struct semaphore * sem) -{ - might_sleep(); - __asm__ __volatile__( - "| atomic down operation\n\t" - "movel %0, %%a1\n\t" - "lea %%pc@(1f), %%a0\n\t" - "subql #1, %%a1@\n\t" - "jmi __down_failed\n" - "1:" - : /* no outputs */ - : "g" (sem) - : "cc", "%a0", "%a1", "memory"); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - int ret; - - might_sleep(); - __asm__ __volatile__( - "| atomic down operation\n\t" - "movel %1, %%a1\n\t" - "lea %%pc@(1f), %%a0\n\t" - "subql #1, %%a1@\n\t" - "jmi __down_failed_interruptible\n\t" - "clrl %%d0\n" - "1: movel %%d0, %0\n" - : "=d" (ret) - : "g" (sem) - : "cc", "%d0", "%a0", "%a1", "memory"); - return(ret); -} - -static inline int down_trylock(struct semaphore * sem) -{ - register struct semaphore *sem1 __asm__ ("%a1") = sem; - register int result __asm__ ("%d0"); - - __asm__ __volatile__( - "| atomic down trylock operation\n\t" - "subql #1,%1@\n\t" - "jmi 2f\n\t" - "clrl %0\n" - "1:\n" - ".section .text.lock,\"ax\"\n" - ".even\n" - "2:\tpea 1b\n\t" - "jbra __down_failed_trylock\n" - ".previous" - : "=d" (result) - : "a" (sem1) - : "memory"); - return result; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). 
- */ -static inline void up(struct semaphore * sem) -{ - __asm__ __volatile__( - "| atomic up operation\n\t" - "movel %0, %%a1\n\t" - "lea %%pc@(1f), %%a0\n\t" - "addql #1, %%a1@\n\t" - "jle __up_wakeup\n" - "1:" - : /* no outputs */ - : "g" (sem) - : "cc", "%a0", "%a1", "memory"); -} - -#endif /* __ASSEMBLY__ */ - -#endif +#include diff --git a/include/asm-mips/semaphore.h b/include/asm-mips/semaphore.h index fdf8042b784..d9b2034ed1d 100644 --- a/include/asm-mips/semaphore.h +++ b/include/asm-mips/semaphore.h @@ -1,108 +1 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1996 Linus Torvalds - * Copyright (C) 1998, 99, 2000, 01, 04 Ralf Baechle - * Copyright (C) 1999, 2000, 01 Silicon Graphics, Inc. - * Copyright (C) 2000, 01 MIPS Technologies, Inc. - * - * In all honesty, little of the old MIPS code left - the PPC64 variant was - * just looking nice and portable so I ripped it. Credits to whoever wrote - * it. - */ -#ifndef __ASM_SEMAPHORE_H -#define __ASM_SEMAPHORE_H - -/* - * Remove spinlock-based RW semaphores; RW semaphore definitions are - * now in rwsem.h and we use the generic lib/rwsem.c implementation. - * Rework semaphores to use atomic_dec_if_positive. - * -- Paul Mackerras (paulus@samba.org) - */ - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -struct semaphore { - /* - * Note that any negative value of count is equivalent to 0, - * but additionally indicates that some process(es) might be - * sleeping on `wait'. - */ - atomic_t count; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) - -static inline void sema_init(struct semaphore *sem, int val) -{ - atomic_set(&sem->count, val); - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX(struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED(struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern void __up(struct semaphore * sem); - -static inline void down(struct semaphore * sem) -{ - might_sleep(); - - /* - * Try to get the semaphore, take the slow path if we fail. - */ - if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - int ret = 0; - - might_sleep(); - - if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); - return ret; -} - -static inline int down_trylock(struct semaphore * sem) -{ - return atomic_dec_if_positive(&sem->count) < 0; -} - -static inline void up(struct semaphore * sem) -{ - if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); -} - -#endif /* __KERNEL__ */ - -#endif /* __ASM_SEMAPHORE_H */ +#include diff --git a/include/asm-mn10300/semaphore.h b/include/asm-mn10300/semaphore.h index 5a9e1ad0b25..d9b2034ed1d 100644 --- a/include/asm-mn10300/semaphore.h +++ b/include/asm-mn10300/semaphore.h @@ -1,169 +1 @@ -/* MN10300 Semaphores - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 
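
/*
 * The MIPS (and powerpc) down_trylock() above leans on
 * atomic_dec_if_positive(): decrement only if the result stays >= 0, and
 * report the would-be result either way.  A C11 compare-and-swap sketch of
 * that primitive; sketch_dec_if_positive is an illustrative name:
 */
#include <stdatomic.h>

static int sketch_dec_if_positive(atomic_int *v)
{
	int old = atomic_load(v);

	/* retry while there is a token to take and the CAS keeps losing races;
	 * a failed CAS reloads the current value into old */
	while (old > 0 && !atomic_compare_exchange_weak(v, &old, old - 1))
		;

	return old - 1;		/* negative means no token was taken */
}
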
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_SEMAPHORE_H -#define _ASM_SEMAPHORE_H - -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include - -#define SEMAPHORE_DEBUG 0 - -/* - * the semaphore definition - * - if count is >0 then there are tokens available on the semaphore for down - * to collect - * - if count is <=0 then there are no spare tokens, and anyone that wants one - * must wait - * - if wait_list is not empty, then there are processes waiting for the - * semaphore - */ -struct semaphore { - atomic_t count; /* it's not really atomic, it's - * just that certain modules - * expect to be able to access - * it directly */ - spinlock_t wait_lock; - struct list_head wait_list; -#if SEMAPHORE_DEBUG - unsigned __magic; -#endif -}; - -#if SEMAPHORE_DEBUG -# define __SEM_DEBUG_INIT(name) , (long)&(name).__magic -#else -# define __SEM_DEBUG_INIT(name) -#endif - - -#define __SEMAPHORE_INITIALIZER(name, init_count) \ -{ \ - .count = ATOMIC_INIT(init_count), \ - .wait_lock = __SPIN_LOCK_UNLOCKED((name).wait_lock), \ - .wait_list = LIST_HEAD_INIT((name).wait_list) \ - __SEM_DEBUG_INIT(name) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) -#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name, 0) - -static inline void sema_init(struct semaphore *sem, int val) -{ - *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); -} - -static inline void init_MUTEX(struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED(struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void __down(struct semaphore *sem, unsigned long flags); -extern int __down_interruptible(struct semaphore *sem, unsigned long flags); -extern void __up(struct semaphore *sem); - -static inline void down(struct semaphore *sem) -{ - unsigned long flags; - int count; - -#if SEMAPHORE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - count = atomic_read(&sem->count); - if (likely(count > 0)) { - atomic_set(&sem->count, count - 1); - spin_unlock_irqrestore(&sem->wait_lock, flags); - } else { - __down(sem, flags); - } -} - -static inline int down_interruptible(struct semaphore *sem) -{ - unsigned long flags; - int count, ret = 0; - -#if SEMAPHORE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - count = atomic_read(&sem->count); - if (likely(count > 0)) { - atomic_set(&sem->count, count - 1); - spin_unlock_irqrestore(&sem->wait_lock, flags); - } else { - ret = __down_interruptible(sem, flags); - } - return ret; -} - -/* - * non-blockingly attempt to down() a semaphore. 
- * - returns zero if we acquired it - */ -static inline int down_trylock(struct semaphore *sem) -{ - unsigned long flags; - int count, success = 0; - -#if SEMAPHORE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - count = atomic_read(&sem->count); - if (likely(count > 0)) { - atomic_set(&sem->count, count - 1); - success = 1; - } - spin_unlock_irqrestore(&sem->wait_lock, flags); - return !success; -} - -static inline void up(struct semaphore *sem) -{ - unsigned long flags; - -#if SEMAPHORE_DEBUG - CHECK_MAGIC(sem->__magic); -#endif - - spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) - __up(sem); - else - atomic_set(&sem->count, atomic_read(&sem->count) + 1); - spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -static inline int sem_getcount(struct semaphore *sem) -{ - return atomic_read(&sem->count); -} - -#endif /* __ASSEMBLY__ */ - -#endif +#include diff --git a/include/asm-parisc/semaphore-helper.h b/include/asm-parisc/semaphore-helper.h deleted file mode 100644 index 387f7c1277a..00000000000 --- a/include/asm-parisc/semaphore-helper.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef _ASM_PARISC_SEMAPHORE_HELPER_H -#define _ASM_PARISC_SEMAPHORE_HELPER_H - -/* - * SMP- and interrupt-safe semaphores helper functions. - * - * (C) Copyright 1996 Linus Torvalds - * (C) Copyright 1999 Andrea Arcangeli - */ - -/* - * These two _must_ execute atomically wrt each other. - * - * This is trivially done with load_locked/store_cond, - * which we have. Let the rest of the losers suck eggs. - */ -static __inline__ void wake_one_more(struct semaphore * sem) -{ - atomic_inc((atomic_t *)&sem->waking); -} - -static __inline__ int waking_non_zero(struct semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->waking > 0) { - sem->waking--; - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_interruptible: - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - * - * We must undo the sem->count down_interruptible() increment while we are - * protected by the spinlock in order to make atomic this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. -arca - */ -static __inline__ int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->waking > 0) { - sem->waking--; - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_trylock: - * 1 failed to lock - * 0 got the lock - * - * We must undo the sem->count down_trylock() increment while we are - * protected by the spinlock in order to make atomic this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. 
-arca - */ -static __inline__ int waking_non_zero_trylock(struct semaphore *sem) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->waking <= 0) - atomic_inc(&sem->count); - else { - sem->waking--; - ret = 0; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -#endif /* _ASM_PARISC_SEMAPHORE_HELPER_H */ diff --git a/include/asm-parisc/semaphore.h b/include/asm-parisc/semaphore.h index a16271cdc74..d9b2034ed1d 100644 --- a/include/asm-parisc/semaphore.h +++ b/include/asm-parisc/semaphore.h @@ -1,145 +1 @@ -/* SMP- and interrupt-safe semaphores. - * PA-RISC version by Matthew Wilcox - * - * Linux/PA-RISC Project (http://www.parisc-linux.org/) - * Copyright (C) 1996 Linus Torvalds - * Copyright (C) 1999-2001 Matthew Wilcox < willy at debian d0T org > - * Copyright (C) 2000 Grant Grundler < grundler a debian org > - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _ASM_PARISC_SEMAPHORE_H -#define _ASM_PARISC_SEMAPHORE_H - -#include -#include -#include - -#include - -/* - * The `count' is initialised to the number of people who are allowed to - * take the lock. (Normally we want a mutex, so this is `1'). if - * `count' is positive, the lock can be taken. if it's 0, no-one is - * waiting on it. if it's -1, at least one task is waiting. - */ -struct semaphore { - spinlock_t sentry; - int count; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .sentry = SPIN_LOCK_UNLOCKED, \ - .count = n, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -static inline int sem_getcount(struct semaphore *sem) -{ - return sem->count; -} - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -/* Semaphores can be `tried' from irq context. So we have to disable - * interrupts while we're messing with the semaphore. Sorry. 
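
/*
 * down() may sleep and is therefore off limits in interrupt context, but
 * down_trylock() either takes the token immediately or reports failure, so
 * it can be "tried" from an irq handler as the comment above says.  Usage
 * sketch; the device structure and helper names are made up, only
 * down_trylock() and up() come from the header:
 */
static irqreturn_t fictional_irq_handler(int irq, void *dev_id)
{
	struct fictional_dev *dev = dev_id;

	if (down_trylock(&dev->sem)) {
		/* somebody holds it; neither spin nor sleep here, defer instead */
		defer_to_workqueue(dev);
		return IRQ_HANDLED;
	}

	handle_device(dev);	/* got the semaphore without sleeping */
	up(&dev->sem);
	return IRQ_HANDLED;
}
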
- */ - -static inline void down(struct semaphore * sem) -{ - might_sleep(); - spin_lock_irq(&sem->sentry); - if (sem->count > 0) { - sem->count--; - } else { - __down(sem); - } - spin_unlock_irq(&sem->sentry); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - int ret = 0; - might_sleep(); - spin_lock_irq(&sem->sentry); - if (sem->count > 0) { - sem->count--; - } else { - ret = __down_interruptible(sem); - } - spin_unlock_irq(&sem->sentry); - return ret; -} - -/* - * down_trylock returns 0 on success, 1 if we failed to get the lock. - * May not sleep, but must preserve irq state - */ -static inline int down_trylock(struct semaphore * sem) -{ - unsigned long flags; - int count; - - spin_lock_irqsave(&sem->sentry, flags); - count = sem->count - 1; - if (count >= 0) - sem->count = count; - spin_unlock_irqrestore(&sem->sentry, flags); - return (count < 0); -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - */ -static inline void up(struct semaphore * sem) -{ - unsigned long flags; - - spin_lock_irqsave(&sem->sentry, flags); - if (sem->count < 0) { - __up(sem); - } else { - sem->count++; - } - spin_unlock_irqrestore(&sem->sentry, flags); -} - -#endif /* _ASM_PARISC_SEMAPHORE_H */ +#include diff --git a/include/asm-powerpc/semaphore.h b/include/asm-powerpc/semaphore.h index 48dd32e0774..d9b2034ed1d 100644 --- a/include/asm-powerpc/semaphore.h +++ b/include/asm-powerpc/semaphore.h @@ -1,94 +1 @@ -#ifndef _ASM_POWERPC_SEMAPHORE_H -#define _ASM_POWERPC_SEMAPHORE_H - -/* - * Remove spinlock-based RW semaphores; RW semaphore definitions are - * now in rwsem.h and we use the generic lib/rwsem.c implementation. - * Rework semaphores to use atomic_dec_if_positive. - * -- Paul Mackerras (paulus@samba.org) - */ - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -struct semaphore { - /* - * Note that any negative value of count is equivalent to 0, - * but additionally indicates that some process(es) might be - * sleeping on `wait'. - */ - atomic_t count; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - atomic_set(&sem->count, val); - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern void __up(struct semaphore * sem); - -static inline void down(struct semaphore * sem) -{ - might_sleep(); - - /* - * Try to get the semaphore, take the slow path if we fail. 
- */ - if (unlikely(atomic_dec_return(&sem->count) < 0)) - __down(sem); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - int ret = 0; - - might_sleep(); - - if (unlikely(atomic_dec_return(&sem->count) < 0)) - ret = __down_interruptible(sem); - return ret; -} - -static inline int down_trylock(struct semaphore * sem) -{ - return atomic_dec_if_positive(&sem->count) < 0; -} - -static inline void up(struct semaphore * sem) -{ - if (unlikely(atomic_inc_return(&sem->count) <= 0)) - __up(sem); -} - -#endif /* __KERNEL__ */ - -#endif /* _ASM_POWERPC_SEMAPHORE_H */ +#include diff --git a/include/asm-s390/semaphore.h b/include/asm-s390/semaphore.h index 0e7001ad839..d9b2034ed1d 100644 --- a/include/asm-s390/semaphore.h +++ b/include/asm-s390/semaphore.h @@ -1,107 +1 @@ -/* - * include/asm-s390/semaphore.h - * - * S390 version - * Copyright (C) 1999 IBM Deutschland Entwicklung GmbH, IBM Corporation - * - * Derived from "include/asm-i386/semaphore.h" - * (C) Copyright 1996 Linus Torvalds - */ - -#ifndef _S390_SEMAPHORE_H -#define _S390_SEMAPHORE_H - -#include -#include -#include -#include - -struct semaphore { - /* - * Note that any negative value of count is equivalent to 0, - * but additionally indicates that some process(es) might be - * sleeping on `wait'. - */ - atomic_t count; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name,count) \ - { ATOMIC_INIT(count), __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) } - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - atomic_set(&sem->count, val); - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -static inline void down(struct semaphore * sem) -{ - might_sleep(); - if (atomic_dec_return(&sem->count) < 0) - __down(sem); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - int ret = 0; - - might_sleep(); - if (atomic_dec_return(&sem->count) < 0) - ret = __down_interruptible(sem); - return ret; -} - -static inline int down_trylock(struct semaphore * sem) -{ - int old_val, new_val; - - /* - * This inline assembly atomically implements the equivalent - * to the following C code: - * old_val = sem->count.counter; - * if ((new_val = old_val) > 0) - * sem->count.counter = --new_val; - * In the ppc code this is called atomic_dec_if_positive. 
- */ - asm volatile( - " l %0,0(%3)\n" - "0: ltr %1,%0\n" - " jle 1f\n" - " ahi %1,-1\n" - " cs %0,%1,0(%3)\n" - " jl 0b\n" - "1:" - : "=&d" (old_val), "=&d" (new_val), "=m" (sem->count.counter) - : "a" (&sem->count.counter), "m" (sem->count.counter) - : "cc", "memory"); - return old_val <= 0; -} - -static inline void up(struct semaphore * sem) -{ - if (atomic_inc_return(&sem->count) <= 0) - __up(sem); -} - -#endif +#include diff --git a/include/asm-sh/semaphore-helper.h b/include/asm-sh/semaphore-helper.h deleted file mode 100644 index bd8230c369c..00000000000 --- a/include/asm-sh/semaphore-helper.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef __ASM_SH_SEMAPHORE_HELPER_H -#define __ASM_SH_SEMAPHORE_HELPER_H - -/* - * SMP- and interrupt-safe semaphores helper functions. - * - * (C) Copyright 1996 Linus Torvalds - * (C) Copyright 1999 Andrea Arcangeli - */ - -/* - * These two _must_ execute atomically wrt each other. - * - * This is trivially done with load_locked/store_cond, - * which we have. Let the rest of the losers suck eggs. - */ -static __inline__ void wake_one_more(struct semaphore * sem) -{ - atomic_inc((atomic_t *)&sem->sleepers); -} - -static __inline__ int waking_non_zero(struct semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->sleepers > 0) { - sem->sleepers--; - ret = 1; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_interruptible: - * 1 got the lock - * 0 go to sleep - * -EINTR interrupted - * - * We must undo the sem->count down_interruptible() increment while we are - * protected by the spinlock in order to make atomic this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. -arca - */ -static __inline__ int waking_non_zero_interruptible(struct semaphore *sem, - struct task_struct *tsk) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->sleepers > 0) { - sem->sleepers--; - ret = 1; - } else if (signal_pending(tsk)) { - atomic_inc(&sem->count); - ret = -EINTR; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -/* - * waking_non_zero_trylock: - * 1 failed to lock - * 0 got the lock - * - * We must undo the sem->count down_trylock() increment while we are - * protected by the spinlock in order to make atomic this atomic_inc() with the - * atomic_read() in wake_one_more(), otherwise we can race. -arca - */ -static __inline__ int waking_non_zero_trylock(struct semaphore *sem) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&semaphore_wake_lock, flags); - if (sem->sleepers <= 0) - atomic_inc(&sem->count); - else { - sem->sleepers--; - ret = 0; - } - spin_unlock_irqrestore(&semaphore_wake_lock, flags); - return ret; -} - -#endif /* __ASM_SH_SEMAPHORE_HELPER_H */ diff --git a/include/asm-sh/semaphore.h b/include/asm-sh/semaphore.h index 9e5a37c4dce..d9b2034ed1d 100644 --- a/include/asm-sh/semaphore.h +++ b/include/asm-sh/semaphore.h @@ -1,115 +1 @@ -#ifndef __ASM_SH_SEMAPHORE_H -#define __ASM_SH_SEMAPHORE_H - -#include - -#ifdef __KERNEL__ -/* - * SMP- and interrupt-safe semaphores. - * - * (C) Copyright 1996 Linus Torvalds - * - * SuperH verison by Niibe Yutaka - * (Currently no asm implementation but generic C code...) 
- */ - -#include -#include -#include - -#include -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ -/* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); - * - * i'd rather use the more flexible initialization above, but sadly - * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. - */ - atomic_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -#if 0 -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); -#endif - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -extern spinlock_t semaphore_wake_lock; - -static inline void down(struct semaphore * sem) -{ - might_sleep(); - if (atomic_dec_return(&sem->count) < 0) - __down(sem); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - int ret = 0; - - might_sleep(); - if (atomic_dec_return(&sem->count) < 0) - ret = __down_interruptible(sem); - return ret; -} - -static inline int down_trylock(struct semaphore * sem) -{ - int ret = 0; - - if (atomic_dec_return(&sem->count) < 0) - ret = __down_trylock(sem); - return ret; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - */ -static inline void up(struct semaphore * sem) -{ - if (atomic_inc_return(&sem->count) <= 0) - __up(sem); -} - -#endif -#endif /* __ASM_SH_SEMAPHORE_H */ +#include diff --git a/include/asm-sparc/semaphore.h b/include/asm-sparc/semaphore.h index 8018f9f4d49..d9b2034ed1d 100644 --- a/include/asm-sparc/semaphore.h +++ b/include/asm-sparc/semaphore.h @@ -1,192 +1 @@ -#ifndef _SPARC_SEMAPHORE_H -#define _SPARC_SEMAPHORE_H - -/* Dinky, good for nothing, just barely irq safe, Sparc semaphores. 
*/ - -#ifdef __KERNEL__ - -#include -#include -#include - -struct semaphore { - atomic24_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC24_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - atomic24_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void __down(struct semaphore * sem); -extern int __down_interruptible(struct semaphore * sem); -extern int __down_trylock(struct semaphore * sem); -extern void __up(struct semaphore * sem); - -static inline void down(struct semaphore * sem) -{ - register volatile int *ptr asm("g1"); - register int increment asm("g2"); - - might_sleep(); - - ptr = &(sem->count.counter); - increment = 1; - - __asm__ __volatile__( - "mov %%o7, %%g4\n\t" - "call ___atomic24_sub\n\t" - " add %%o7, 8, %%o7\n\t" - "tst %%g2\n\t" - "bl 2f\n\t" - " nop\n" - "1:\n\t" - ".subsection 2\n" - "2:\n\t" - "save %%sp, -64, %%sp\n\t" - "mov %%g1, %%l1\n\t" - "mov %%g5, %%l5\n\t" - "call %3\n\t" - " mov %%g1, %%o0\n\t" - "mov %%l1, %%g1\n\t" - "ba 1b\n\t" - " restore %%l5, %%g0, %%g5\n\t" - ".previous\n" - : "=&r" (increment) - : "0" (increment), "r" (ptr), "i" (__down) - : "g3", "g4", "g7", "memory", "cc"); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - register volatile int *ptr asm("g1"); - register int increment asm("g2"); - - might_sleep(); - - ptr = &(sem->count.counter); - increment = 1; - - __asm__ __volatile__( - "mov %%o7, %%g4\n\t" - "call ___atomic24_sub\n\t" - " add %%o7, 8, %%o7\n\t" - "tst %%g2\n\t" - "bl 2f\n\t" - " clr %%g2\n" - "1:\n\t" - ".subsection 2\n" - "2:\n\t" - "save %%sp, -64, %%sp\n\t" - "mov %%g1, %%l1\n\t" - "mov %%g5, %%l5\n\t" - "call %3\n\t" - " mov %%g1, %%o0\n\t" - "mov %%l1, %%g1\n\t" - "mov %%l5, %%g5\n\t" - "ba 1b\n\t" - " restore %%o0, %%g0, %%g2\n\t" - ".previous\n" - : "=&r" (increment) - : "0" (increment), "r" (ptr), "i" (__down_interruptible) - : "g3", "g4", "g7", "memory", "cc"); - - return increment; -} - -static inline int down_trylock(struct semaphore * sem) -{ - register volatile int *ptr asm("g1"); - register int increment asm("g2"); - - ptr = &(sem->count.counter); - increment = 1; - - __asm__ __volatile__( - "mov %%o7, %%g4\n\t" - "call ___atomic24_sub\n\t" - " add %%o7, 8, %%o7\n\t" - "tst %%g2\n\t" - "bl 2f\n\t" - " clr %%g2\n" - "1:\n\t" - ".subsection 2\n" - "2:\n\t" - "save %%sp, -64, %%sp\n\t" - "mov %%g1, %%l1\n\t" - "mov %%g5, %%l5\n\t" - "call %3\n\t" - " mov %%g1, %%o0\n\t" - "mov %%l1, %%g1\n\t" - "mov %%l5, %%g5\n\t" - "ba 1b\n\t" - " restore %%o0, %%g0, %%g2\n\t" - ".previous\n" - : "=&r" (increment) - : "0" (increment), "r" (ptr), "i" (__down_trylock) - : "g3", "g4", "g7", "memory", "cc"); - - return increment; -} - -static inline void up(struct semaphore * sem) -{ - register volatile int *ptr asm("g1"); - register int increment asm("g2"); - - ptr = &(sem->count.counter); - increment = 1; - - __asm__ __volatile__( - "mov %%o7, %%g4\n\t" - "call ___atomic24_add\n\t" - " add %%o7, 8, %%o7\n\t" - "tst %%g2\n\t" - "ble 
2f\n\t" - " nop\n" - "1:\n\t" - ".subsection 2\n" - "2:\n\t" - "save %%sp, -64, %%sp\n\t" - "mov %%g1, %%l1\n\t" - "mov %%g5, %%l5\n\t" - "call %3\n\t" - " mov %%g1, %%o0\n\t" - "mov %%l1, %%g1\n\t" - "ba 1b\n\t" - " restore %%l5, %%g0, %%g5\n\t" - ".previous\n" - : "=&r" (increment) - : "0" (increment), "r" (ptr), "i" (__up) - : "g3", "g4", "g7", "memory", "cc"); -} - -#endif /* __KERNEL__ */ - -#endif /* !(_SPARC_SEMAPHORE_H) */ +#include diff --git a/include/asm-sparc64/semaphore.h b/include/asm-sparc64/semaphore.h index 7f7c0c4e024..d9b2034ed1d 100644 --- a/include/asm-sparc64/semaphore.h +++ b/include/asm-sparc64/semaphore.h @@ -1,53 +1 @@ -#ifndef _SPARC64_SEMAPHORE_H -#define _SPARC64_SEMAPHORE_H - -/* These are actually reasonable on the V9. - * - * See asm-ppc/semaphore.h for implementation commentary, - * only sparc64 specific issues are commented here. - */ -#ifdef __KERNEL__ - -#include -#include -#include -#include - -struct semaphore { - atomic_t count; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, count) \ - { ATOMIC_INIT(count), \ - __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) } - -#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - atomic_set(&sem->count, val); - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern void up(struct semaphore *sem); -extern void down(struct semaphore *sem); -extern int down_trylock(struct semaphore *sem); -extern int down_interruptible(struct semaphore *sem); - -#endif /* __KERNEL__ */ - -#endif /* !(_SPARC64_SEMAPHORE_H) */ +#include diff --git a/include/asm-um/semaphore.h b/include/asm-um/semaphore.h index ff13c34de42..d9b2034ed1d 100644 --- a/include/asm-um/semaphore.h +++ b/include/asm-um/semaphore.h @@ -1,6 +1 @@ -#ifndef __UM_SEMAPHORE_H -#define __UM_SEMAPHORE_H - -#include "asm/arch/semaphore.h" - -#endif +#include diff --git a/include/asm-v850/semaphore.h b/include/asm-v850/semaphore.h index 10ed0ccf37d..d9b2034ed1d 100644 --- a/include/asm-v850/semaphore.h +++ b/include/asm-v850/semaphore.h @@ -1,84 +1 @@ -#ifndef __V850_SEMAPHORE_H__ -#define __V850_SEMAPHORE_H__ - -#include -#include -#include -#include - -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name,count) \ - { ATOMIC_INIT (count), 0, \ - __WAIT_QUEUE_HEAD_INITIALIZER ((name).wait) } - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER (name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC (name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init (sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init (sem, 0); -} - -/* - * special register calling convention - */ -asmlinkage void __down_failed (void); -asmlinkage int __down_interruptible_failed (void); -asmlinkage int __down_trylock_failed (void); -asmlinkage void __up_wakeup (void); - -extern void __down (struct semaphore * sem); -extern int __down_interruptible (struct semaphore * sem); -extern int 
__down_trylock (struct semaphore * sem); -extern void __up (struct semaphore * sem); - -static inline void down (struct semaphore * sem) -{ - might_sleep(); - if (atomic_dec_return (&sem->count) < 0) - __down (sem); -} - -static inline int down_interruptible (struct semaphore * sem) -{ - int ret = 0; - might_sleep(); - if (atomic_dec_return (&sem->count) < 0) - ret = __down_interruptible (sem); - return ret; -} - -static inline int down_trylock (struct semaphore *sem) -{ - int ret = 0; - if (atomic_dec_return (&sem->count) < 0) - ret = __down_trylock (sem); - return ret; -} - -static inline void up (struct semaphore * sem) -{ - if (atomic_inc_return (&sem->count) <= 0) - __up (sem); -} - -#endif /* __V850_SEMAPHORE_H__ */ +#include diff --git a/include/asm-x86/semaphore.h b/include/asm-x86/semaphore.h index 572c0b67a6b..d9b2034ed1d 100644 --- a/include/asm-x86/semaphore.h +++ b/include/asm-x86/semaphore.h @@ -1,5 +1 @@ -#ifdef CONFIG_X86_32 -# include "semaphore_32.h" -#else -# include "semaphore_64.h" -#endif +#include diff --git a/include/asm-x86/semaphore_32.h b/include/asm-x86/semaphore_32.h deleted file mode 100644 index ac96d3804d0..00000000000 --- a/include/asm-x86/semaphore_32.h +++ /dev/null @@ -1,175 +0,0 @@ -#ifndef _I386_SEMAPHORE_H -#define _I386_SEMAPHORE_H - -#include - -#ifdef __KERNEL__ - -/* - * SMP- and interrupt-safe semaphores.. - * - * (C) Copyright 1996 Linus Torvalds - * - * Modified 1996-12-23 by Dave Grothe to fix bugs in - * the original code and to make semaphore waits - * interruptible so that processes waiting on - * semaphores can be killed. - * Modified 1999-02-14 by Andrea Arcangeli, split the sched.c helper - * functions in asm/sempahore-helper.h while fixing a - * potential and subtle race discovered by Ulrich Schmid - * in down_interruptible(). Since I started to play here I - * also implemented the `trylock' semaphore operation. - * 1999-07-02 Artur Skawina - * Optimized "0(ecx)" -> "(ecx)" (the assembler does not - * do this). Changed calling sequences from push/jmp to - * traditional call/ret. - * Modified 2001-01-01 Andreas Franck - * Some hacks to ensure compatibility with recent - * GCC snapshots, to avoid stack corruption when compiling - * with -fomit-frame-pointer. It's not sure if this will - * be fixed in GCC, as our previous implementation was a - * bit dubious. - * - * If you would like to see an analysis of this implementation, please - * ftp to gcom.com and download the file - * /pub/linux/src/semaphore/semaphore-2.0.24.tar.gz. - * - */ - -#include -#include -#include -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ -/* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); - * - * i'd rather use the more flexible initialization above, but sadly - * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. 
- */ - atomic_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -extern asmregparm void __down_failed(atomic_t *count_ptr); -extern asmregparm int __down_failed_interruptible(atomic_t *count_ptr); -extern asmregparm int __down_failed_trylock(atomic_t *count_ptr); -extern asmregparm void __up_wakeup(atomic_t *count_ptr); - -/* - * This is ugly, but we want the default case to fall through. - * "__down_failed" is a special asm handler that calls the C - * routine that actually waits. See arch/i386/kernel/semaphore.c - */ -static inline void down(struct semaphore * sem) -{ - might_sleep(); - __asm__ __volatile__( - "# atomic down operation\n\t" - LOCK_PREFIX "decl %0\n\t" /* --sem->count */ - "jns 2f\n" - "\tlea %0,%%eax\n\t" - "call __down_failed\n" - "2:" - :"+m" (sem->count) - : - :"memory","ax"); -} - -/* - * Interruptible try to acquire a semaphore. If we obtained - * it, return zero. If we were interrupted, returns -EINTR - */ -static inline int down_interruptible(struct semaphore * sem) -{ - int result; - - might_sleep(); - __asm__ __volatile__( - "# atomic interruptible down operation\n\t" - "xorl %0,%0\n\t" - LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "jns 2f\n\t" - "lea %1,%%eax\n\t" - "call __down_failed_interruptible\n" - "2:" - :"=&a" (result), "+m" (sem->count) - : - :"memory"); - return result; -} - -/* - * Non-blockingly attempt to down() a semaphore. - * Returns zero if we acquired it - */ -static inline int down_trylock(struct semaphore * sem) -{ - int result; - - __asm__ __volatile__( - "# atomic interruptible down operation\n\t" - "xorl %0,%0\n\t" - LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "jns 2f\n\t" - "lea %1,%%eax\n\t" - "call __down_failed_trylock\n\t" - "2:\n" - :"=&a" (result), "+m" (sem->count) - : - :"memory"); - return result; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - */ -static inline void up(struct semaphore * sem) -{ - __asm__ __volatile__( - "# atomic up operation\n\t" - LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ - "jg 1f\n\t" - "lea %0,%%eax\n\t" - "call __up_wakeup\n" - "1:" - :"+m" (sem->count) - : - :"memory","ax"); -} - -#endif -#endif diff --git a/include/asm-x86/semaphore_64.h b/include/asm-x86/semaphore_64.h deleted file mode 100644 index 79694306bf7..00000000000 --- a/include/asm-x86/semaphore_64.h +++ /dev/null @@ -1,180 +0,0 @@ -#ifndef _X86_64_SEMAPHORE_H -#define _X86_64_SEMAPHORE_H - -#include - -#ifdef __KERNEL__ - -/* - * SMP- and interrupt-safe semaphores.. - * - * (C) Copyright 1996 Linus Torvalds - * - * Modified 1996-12-23 by Dave Grothe to fix bugs in - * the original code and to make semaphore waits - * interruptible so that processes waiting on - * semaphores can be killed. - * Modified 1999-02-14 by Andrea Arcangeli, split the sched.c helper - * functions in asm/sempahore-helper.h while fixing a - * potential and subtle race discovered by Ulrich Schmid - * in down_interruptible(). Since I started to play here I - * also implemented the `trylock' semaphore operation. - * 1999-07-02 Artur Skawina - * Optimized "0(ecx)" -> "(ecx)" (the assembler does not - * do this). Changed calling sequences from push/jmp to - * traditional call/ret. 
- * Modified 2001-01-01 Andreas Franck - * Some hacks to ensure compatibility with recent - * GCC snapshots, to avoid stack corruption when compiling - * with -fomit-frame-pointer. It's not sure if this will - * be fixed in GCC, as our previous implementation was a - * bit dubious. - * - * If you would like to see an analysis of this implementation, please - * ftp to gcom.com and download the file - * /pub/linux/src/semaphore/semaphore-2.0.24.tar.gz. - * - */ - -#include -#include -#include -#include -#include -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name, n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ -/* - * *sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val); - * - * i'd rather use the more flexible initialization above, but sadly - * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well. - */ - atomic_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down_failed(void /* special register calling convention */); -asmlinkage int __down_failed_interruptible(void /* params in registers */); -asmlinkage int __down_failed_trylock(void /* params in registers */); -asmlinkage void __up_wakeup(void /* special register calling convention */); - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -/* - * This is ugly, but we want the default case to fall through. - * "__down_failed" is a special asm handler that calls the C - * routine that actually waits. See arch/x86_64/kernel/semaphore.c - */ -static inline void down(struct semaphore * sem) -{ - might_sleep(); - - __asm__ __volatile__( - "# atomic down operation\n\t" - LOCK_PREFIX "decl %0\n\t" /* --sem->count */ - "jns 1f\n\t" - "call __down_failed\n" - "1:" - :"=m" (sem->count) - :"D" (sem) - :"memory"); -} - -/* - * Interruptible try to acquire a semaphore. If we obtained - * it, return zero. If we were interrupted, returns -EINTR - */ -static inline int down_interruptible(struct semaphore * sem) -{ - int result; - - might_sleep(); - - __asm__ __volatile__( - "# atomic interruptible down operation\n\t" - "xorl %0,%0\n\t" - LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "jns 2f\n\t" - "call __down_failed_interruptible\n" - "2:\n" - :"=&a" (result), "=m" (sem->count) - :"D" (sem) - :"memory"); - return result; -} - -/* - * Non-blockingly attempt to down() a semaphore. - * Returns zero if we acquired it - */ -static inline int down_trylock(struct semaphore * sem) -{ - int result; - - __asm__ __volatile__( - "# atomic interruptible down operation\n\t" - "xorl %0,%0\n\t" - LOCK_PREFIX "decl %1\n\t" /* --sem->count */ - "jns 2f\n\t" - "call __down_failed_trylock\n\t" - "2:\n" - :"=&a" (result), "=m" (sem->count) - :"D" (sem) - :"memory","cc"); - return result; -} - -/* - * Note! This is subtle. 
We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - * The default case (no contention) will result in NO - * jumps for both down() and up(). - */ -static inline void up(struct semaphore * sem) -{ - __asm__ __volatile__( - "# atomic up operation\n\t" - LOCK_PREFIX "incl %0\n\t" /* ++sem->count */ - "jg 1f\n\t" - "call __up_wakeup\n" - "1:" - :"=m" (sem->count) - :"D" (sem) - :"memory"); -} -#endif /* __KERNEL__ */ -#endif diff --git a/include/asm-xtensa/semaphore.h b/include/asm-xtensa/semaphore.h index 3e04167cd9d..d9b2034ed1d 100644 --- a/include/asm-xtensa/semaphore.h +++ b/include/asm-xtensa/semaphore.h @@ -1,99 +1 @@ -/* - * linux/include/asm-xtensa/semaphore.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_SEMAPHORE_H -#define _XTENSA_SEMAPHORE_H - -#include -#include -#include -#include - -struct semaphore { - atomic_t count; - int sleepers; - wait_queue_head_t wait; -}; - -#define __SEMAPHORE_INITIALIZER(name,n) \ -{ \ - .count = ATOMIC_INIT(n), \ - .sleepers = 0, \ - .wait = __WAIT_QUEUE_HEAD_INITIALIZER((name).wait) \ -} - -#define __DECLARE_SEMAPHORE_GENERIC(name,count) \ - struct semaphore name = __SEMAPHORE_INITIALIZER(name,count) - -#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1) - -static inline void sema_init (struct semaphore *sem, int val) -{ - atomic_set(&sem->count, val); - sem->sleepers = 0; - init_waitqueue_head(&sem->wait); -} - -static inline void init_MUTEX (struct semaphore *sem) -{ - sema_init(sem, 1); -} - -static inline void init_MUTEX_LOCKED (struct semaphore *sem) -{ - sema_init(sem, 0); -} - -asmlinkage void __down(struct semaphore * sem); -asmlinkage int __down_interruptible(struct semaphore * sem); -asmlinkage int __down_trylock(struct semaphore * sem); -asmlinkage void __up(struct semaphore * sem); - -extern spinlock_t semaphore_wake_lock; - -static inline void down(struct semaphore * sem) -{ - might_sleep(); - - if (atomic_sub_return(1, &sem->count) < 0) - __down(sem); -} - -static inline int down_interruptible(struct semaphore * sem) -{ - int ret = 0; - - might_sleep(); - - if (atomic_sub_return(1, &sem->count) < 0) - ret = __down_interruptible(sem); - return ret; -} - -static inline int down_trylock(struct semaphore * sem) -{ - int ret = 0; - - if (atomic_sub_return(1, &sem->count) < 0) - ret = __down_trylock(sem); - return ret; -} - -/* - * Note! This is subtle. We jump to wake people up only if - * the semaphore was negative (== somebody was waiting on it). - */ -static inline void up(struct semaphore * sem) -{ - if (atomic_add_return(1, &sem->count) <= 0) - __up(sem); -} - -#endif /* _XTENSA_SEMAPHORE_H */ +#include diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h new file mode 100644 index 00000000000..b3c691b089b --- /dev/null +++ b/include/linux/semaphore.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox + * + * Distributed under the terms of the GNU GPL, version 2 + * + * Counting semaphores allow up to tasks to acquire the semaphore + * simultaneously. + */ +#ifndef __LINUX_SEMAPHORE_H +#define __LINUX_SEMAPHORE_H + +#include +#include + +/* + * The spinlock controls access to the other members of the semaphore. + * 'count' is decremented by every task which calls down*() and incremented + * by every call to up(). 
Thus, if it is positive, it indicates how many + * more tasks may acquire the lock. If it is negative, it indicates how + * many tasks are waiting for the lock. Tasks waiting for the lock are + * kept on the wait_list. + */ +struct semaphore { + spinlock_t lock; + int count; + struct list_head wait_list; +}; + +#define __SEMAPHORE_INITIALIZER(name, n) \ +{ \ + .lock = __SPIN_LOCK_UNLOCKED((name).lock), \ + .count = n, \ + .wait_list = LIST_HEAD_INIT((name).wait_list), \ +} + +#define __DECLARE_SEMAPHORE_GENERIC(name, count) \ + struct semaphore name = __SEMAPHORE_INITIALIZER(name, count) + +#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name, 1) + +static inline void sema_init(struct semaphore *sem, int val) +{ + static struct lock_class_key __key; + *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val); + lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0); +} + +#define init_MUTEX(sem) sema_init(sem, 1) +#define init_MUTEX_LOCKED(sem) sema_init(sem, 0) + +/* + * Attempt to acquire the semaphore. If another task is already holding the + * semaphore, sleep until the semaphore is released. + */ +extern void down(struct semaphore *sem); + +/* + * As down(), except the sleep may be interrupted by a signal. If it is, + * this function will return -EINTR. + */ +extern int __must_check down_interruptible(struct semaphore *sem); + +/* + * As down(), except this function will not sleep. It will return 0 if it + * acquired the semaphore and 1 if the semaphore was contended. This + * function may be called from any context, including interrupt and softirq. + */ +extern int __must_check down_trylock(struct semaphore *sem); + +/* + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +extern void up(struct semaphore *sem); + +#endif /* __LINUX_SEMAPHORE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 6c584c55a6e..f45c69e6968 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o \ + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o obj-$(CONFIG_SYSCTL) += sysctl_check.o diff --git a/kernel/semaphore.c b/kernel/semaphore.c new file mode 100644 index 00000000000..d5a72702f26 --- /dev/null +++ b/kernel/semaphore.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox + * + * Distributed under the terms of the GNU GPL, version 2 + */ + +#include +#include +#include +#include +#include +#include + +/* + * Some notes on the implementation: + * + * down_trylock() and up() can be called from interrupt context. + * So we have to disable interrupts when taking the lock. + * + * The ->count variable, if positive, defines how many more tasks can + * acquire the semaphore. If negative, it represents how many tasks are + * waiting on the semaphore (*). If zero, no tasks are waiting, and no more + * tasks can acquire the semaphore. + * + * (*) Except for the window between one task calling up() and the task + * sleeping in a __down_common() waking up. In order to avoid a third task + * coming in and stealing the second task's wakeup, we leave the ->count + * negative. 
If we have a more complex situation, the ->count may become + * zero or negative (eg a semaphore with count = 2, three tasks attempt to + * acquire it, one sleeps, two finish and call up(), the second task to call + * up() notices that the list is empty and just increments count). + */ + +static noinline void __down(struct semaphore *sem); +static noinline int __down_interruptible(struct semaphore *sem); +static noinline void __up(struct semaphore *sem); + +void down(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (unlikely(sem->count-- <= 0)) + __down(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +int down_interruptible(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (unlikely(sem->count-- <= 0)) + result = __down_interruptible(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_interruptible); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically. Returns 0 if the mutex has + * been acquired successfully and 1 if it is contended. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock! Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. + */ +int down_trylock(struct semaphore *sem) +{ + unsigned long flags; + int count; + + spin_lock_irqsave(&sem->lock, flags); + count = sem->count - 1; + if (likely(count >= 0)) + sem->count = count; + spin_unlock_irqrestore(&sem->lock, flags); + + return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +void up(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count >= 0)) + sem->count++; + else + __up(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + int up; +}; + +/* + * Wake up a process waiting on a semaphore. We need to call this from both + * __up and __down_common as it's possible to race a task into the semaphore + * if it comes in at just the right time between two tasks calling up() and + * a third task waking up. This function assumes the wait_list is already + * checked for being non-empty. + */ +static noinline void __sched __up_down_common(struct semaphore *sem) +{ + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = 1; + wake_up_process(waiter->task); +} + +/* + * Because this function is inlined, the 'state' parameter will be constant, + * and thus optimised away by the compiler. + */ +static inline int __sched __down_common(struct semaphore *sem, long state) +{ + int result = 0; + struct task_struct *task = current; + struct semaphore_waiter waiter; + + list_add_tail(&waiter.list, &sem->wait_list); + waiter.task = task; + waiter.up = 0; + + for (;;) { + if (state == TASK_INTERRUPTIBLE && signal_pending(task)) + goto interrupted; + __set_task_state(task, state); + spin_unlock_irq(&sem->lock); + schedule(); + spin_lock_irq(&sem->lock); + if (waiter.up) + goto woken; + } + + interrupted: + list_del(&waiter.list); + result = -EINTR; + woken: + /* + * Account for the process which woke us up. 
For the case where + * we're interrupted, we need to increment the count on our own + * behalf. I don't believe we can hit the case where the + * sem->count hits zero, *and* there's a second task sleeping, + * but it doesn't hurt, that's not a commonly exercised path and + * it's not a performance path either. + */ + if (unlikely((++sem->count >= 0) && !list_empty(&sem->wait_list))) + __up_down_common(sem); + return result; +} + +static noinline void __sched __down(struct semaphore *sem) +{ + __down_common(sem, TASK_UNINTERRUPTIBLE); +} + +static noinline int __sched __down_interruptible(struct semaphore *sem) +{ + return __down_common(sem, TASK_INTERRUPTIBLE); +} + +static noinline void __sched __up(struct semaphore *sem) +{ + if (unlikely(list_empty(&sem->wait_list))) + sem->count++; + else + __up_down_common(sem); +} diff --git a/lib/Makefile b/lib/Makefile index 23de261a4c8..28dba90d502 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -29,7 +29,6 @@ obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o -lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c deleted file mode 100644 index 0198782cdac..00000000000 --- a/lib/semaphore-sleepers.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * i386 and x86-64 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise - */ -#include -#include -#include -#include - -/* - * Semaphores are implemented using a two-way counter: - * The "count" variable is decremented for each process - * that tries to acquire the semaphore, while the "sleeping" - * variable is a count of such acquires. - * - * Notably, the inline "up()" and "down()" functions can - * efficiently test if they need to do any extra work (up - * needs to do something only if count was negative before - * the increment operation. - * - * "sleeping" and the contention routine ordering is protected - * by the spinlock in the semaphore's waitqueue head. - * - * Note that these functions are only called when there is - * contention on the lock, and as such all this is the - * "non-critical" part of the whole semaphore business. The - * critical part is the inline stuff in - * where we want to avoid any extra jumps and calls. - */ - -/* - * Logic: - * - only on a boundary condition do we need to care. When we go - * from a negative count to a non-negative, we wake people up. - * - when we go from a non-negative count to a negative do we - * (a) synchronize with the "sleeper" count and (b) make sure - * that we're on the wakeup list before we synchronize so that - * we cannot lose wakeup events. 
- */ - -void __up(struct semaphore *sem) -{ - wake_up(&sem->wait); -} - -void __sched __down(struct semaphore *sem) -{ - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_UNINTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * the wait_queue_head. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_UNINTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - tsk->state = TASK_RUNNING; -} - -int __sched __down_interruptible(struct semaphore *sem) -{ - int retval = 0; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - unsigned long flags; - - tsk->state = TASK_INTERRUPTIBLE; - spin_lock_irqsave(&sem->wait.lock, flags); - add_wait_queue_exclusive_locked(&sem->wait, &wait); - - sem->sleepers++; - for (;;) { - int sleepers = sem->sleepers; - - /* - * With signals pending, this turns into - * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as - * it has contention. Just correct the count - * and exit. - */ - if (signal_pending(current)) { - retval = -EINTR; - sem->sleepers = 0; - atomic_add(sleepers, &sem->count); - break; - } - - /* - * Add "everybody else" into it. They aren't - * playing, because we own the spinlock in - * wait_queue_head. The "-1" is because we're - * still hoping to get the semaphore. - */ - if (!atomic_add_negative(sleepers - 1, &sem->count)) { - sem->sleepers = 0; - break; - } - sem->sleepers = 1; /* us - see -1 above */ - spin_unlock_irqrestore(&sem->wait.lock, flags); - - schedule(); - - spin_lock_irqsave(&sem->wait.lock, flags); - tsk->state = TASK_INTERRUPTIBLE; - } - remove_wait_queue_locked(&sem->wait, &wait); - wake_up_locked(&sem->wait); - spin_unlock_irqrestore(&sem->wait.lock, flags); - - tsk->state = TASK_RUNNING; - return retval; -} - -/* - * Trylock failed - make sure we correct for - * having decremented the count. - * - * We could have done the trylock with a - * single "cmpxchg" without failure cases, - * but then it wouldn't work on a 386. - */ -int __down_trylock(struct semaphore *sem) -{ - int sleepers; - unsigned long flags; - - spin_lock_irqsave(&sem->wait.lock, flags); - sleepers = sem->sleepers + 1; - sem->sleepers = 0; - - /* - * Add "everybody else" and us into it. They aren't - * playing, because we own the spinlock in the - * wait_queue_head. - */ - if (!atomic_add_negative(sleepers, &sem->count)) { - wake_up_locked(&sem->wait); - } - - spin_unlock_irqrestore(&sem->wait.lock, flags); - return 1; -} -- cgit v1.2.3-70-g09d2 From f06d96865861c3dd01520f47e2e61c899db1631f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 14 Mar 2008 13:19:33 -0400 Subject: Introduce down_killable() down_killable() is the functional counterpart of mutex_lock_killable. 
Signed-off-by: Matthew Wilcox --- include/linux/semaphore.h | 6 ++++++ kernel/semaphore.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'kernel') diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index b3c691b089b..88f2a28cc0f 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -61,6 +61,12 @@ extern void down(struct semaphore *sem); */ extern int __must_check down_interruptible(struct semaphore *sem); +/* + * As down_interruptible(), except the sleep may only be interrupted by + * signals which are fatal to this process. + */ +extern int __must_check down_killable(struct semaphore *sem); + /* * As down(), except this function will not sleep. It will return 0 if it * acquired the semaphore and 1 if the semaphore was contended. This diff --git a/kernel/semaphore.c b/kernel/semaphore.c index d5a72702f26..2da2aed950f 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -34,6 +34,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); +static noinline int __down_killable(struct semaphore *sem); static noinline void __up(struct semaphore *sem); void down(struct semaphore *sem) @@ -61,6 +62,20 @@ int down_interruptible(struct semaphore *sem) } EXPORT_SYMBOL(down_interruptible); +int down_killable(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (unlikely(sem->count-- <= 0)) + result = __down_killable(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_killable); + /** * down_trylock - try to acquire the semaphore, without waiting * @sem: the semaphore to be acquired @@ -143,6 +158,8 @@ static inline int __sched __down_common(struct semaphore *sem, long state) for (;;) { if (state == TASK_INTERRUPTIBLE && signal_pending(task)) goto interrupted; + if (state == TASK_KILLABLE && fatal_signal_pending(task)) + goto interrupted; __set_task_state(task, state); spin_unlock_irq(&sem->lock); schedule(); @@ -178,6 +195,11 @@ static noinline int __sched __down_interruptible(struct semaphore *sem) return __down_common(sem, TASK_INTERRUPTIBLE); } +static noinline int __sched __down_killable(struct semaphore *sem) +{ + return __down_common(sem, TASK_KILLABLE); +} + static noinline void __sched __up(struct semaphore *sem) { if (unlikely(list_empty(&sem->wait_list))) -- cgit v1.2.3-70-g09d2 From f1241c87a16c4fe9f4f51d6ed3589f031c505e8d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 14 Mar 2008 13:43:13 -0400 Subject: Add down_timeout and change ACPI to use it ACPI currently emulates a timeout for semaphores with calls to down_trylock and sleep. This produces horrible behaviour in terms of fairness and excessive wakeups. Now that we have a unified semaphore implementation, adding a real down_trylock is almost trivial. 
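A minimal caller sketch, using a hypothetical wrapper for illustration: down_timeout() takes its timeout in jiffies and returns -ETIME when the semaphore cannot be acquired in time, which is exactly what replaces the old poll-and-sleep loop built around down_trylock().

	/* Sketch only; assumes <linux/semaphore.h> and <linux/jiffies.h>,
	 * the helper name and calling convention are hypothetical */
	static int example_down_ms(struct semaphore *sem, unsigned int ms)
	{
		int ret = down_timeout(sem, msecs_to_jiffies(ms));

		if (ret)	/* -ETIME if the timeout expired first */
			return ret;
		/* ... work while holding the semaphore ... */
		up(sem);
		return 0;
	}
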
Signed-off-by: Matthew Wilcox --- drivers/acpi/osl.c | 89 +++++++++++------------------------------------ include/linux/semaphore.h | 6 ++++ kernel/semaphore.c | 42 ++++++++++++++++++---- 3 files changed, 62 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index a697fb6cf05..a498a6cc68f 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -4,6 +4,8 @@ * Copyright (C) 2000 Andrew Henroid * Copyright (C) 2001, 2002 Andy Grover * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * @@ -37,15 +39,18 @@ #include #include #include -#include -#include -#include -#include -#include - #include #include #include +#include +#include + +#include +#include + +#include +#include +#include #define _COMPONENT ACPI_OS_SERVICES ACPI_MODULE_NAME("osl"); @@ -764,7 +769,6 @@ acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { struct semaphore *sem = NULL; - sem = acpi_os_allocate(sizeof(struct semaphore)); if (!sem) return AE_NO_MEMORY; @@ -791,12 +795,12 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle) { struct semaphore *sem = (struct semaphore *)handle; - if (!sem) return AE_BAD_PARAMETER; ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Deleting semaphore[%p].\n", handle)); + BUG_ON(!list_empty(&sem->wait_list)); kfree(sem); sem = NULL; @@ -804,21 +808,15 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle) } /* - * TODO: The kernel doesn't have a 'down_timeout' function -- had to - * improvise. The process is to sleep for one scheduler quantum - * until the semaphore becomes available. Downside is that this - * may result in starvation for timeout-based waits when there's - * lots of semaphore activity. - * * TODO: Support for units > 1? */ acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; struct semaphore *sem = (struct semaphore *)handle; + long jiffies; int ret = 0; - if (!sem || (units < 1)) return AE_BAD_PARAMETER; @@ -828,58 +826,14 @@ acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Waiting for semaphore[%p|%d|%d]\n", handle, units, timeout)); - /* - * This can be called during resume with interrupts off. - * Like boot-time, we should be single threaded and will - * always get the lock if we try -- timeout or not. - * If this doesn't succeed, then we will oops courtesy of - * might_sleep() in down(). - */ - if (!down_trylock(sem)) - return AE_OK; - - switch (timeout) { - /* - * No Wait: - * -------- - * A zero timeout value indicates that we shouldn't wait - just - * acquire the semaphore if available otherwise return AE_TIME - * (a.k.a. 'would block'). - */ - case 0: - if (down_trylock(sem)) - status = AE_TIME; - break; - - /* - * Wait Indefinitely: - * ------------------ - */ - case ACPI_WAIT_FOREVER: - down(sem); - break; - - /* - * Wait w/ Timeout: - * ---------------- - */ - default: - // TODO: A better timeout algorithm? 
- { - int i = 0; - static const int quantum_ms = 1000 / HZ; - - ret = down_trylock(sem); - for (i = timeout; (i > 0 && ret != 0); i -= quantum_ms) { - schedule_timeout_interruptible(1); - ret = down_trylock(sem); - } - - if (ret != 0) - status = AE_TIME; - } - break; - } + if (timeout == ACPI_WAIT_FOREVER) + jiffies = MAX_SCHEDULE_TIMEOUT; + else + jiffies = msecs_to_jiffies(timeout); + + ret = down_timeout(sem, jiffies); + if (ret) + status = AE_TIME; if (ACPI_FAILURE(status)) { ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, @@ -902,7 +856,6 @@ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { struct semaphore *sem = (struct semaphore *)handle; - if (!sem || (units < 1)) return AE_BAD_PARAMETER; diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 88f2a28cc0f..a107aebd914 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -74,6 +74,12 @@ extern int __must_check down_killable(struct semaphore *sem); */ extern int __must_check down_trylock(struct semaphore *sem); +/* + * As down(), except this function will return -ETIME if it fails to + * acquire the semaphore within the specified number of jiffies. + */ +extern int __must_check down_timeout(struct semaphore *sem, long jiffies); + /* * Release the semaphore. Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 2da2aed950f..5a12a855898 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -35,6 +35,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long jiffies); static noinline void __up(struct semaphore *sem); void down(struct semaphore *sem) @@ -104,6 +105,20 @@ int down_trylock(struct semaphore *sem) } EXPORT_SYMBOL(down_trylock); +int down_timeout(struct semaphore *sem, long jiffies) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (unlikely(sem->count-- <= 0)) + result = __down_timeout(sem, jiffies); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_timeout); + void up(struct semaphore *sem) { unsigned long flags; @@ -142,10 +157,12 @@ static noinline void __sched __up_down_common(struct semaphore *sem) } /* - * Because this function is inlined, the 'state' parameter will be constant, - * and thus optimised away by the compiler. + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. 
*/ -static inline int __sched __down_common(struct semaphore *sem, long state) +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) { int result = 0; struct task_struct *task = current; @@ -160,14 +177,20 @@ static inline int __sched __down_common(struct semaphore *sem, long state) goto interrupted; if (state == TASK_KILLABLE && fatal_signal_pending(task)) goto interrupted; + if (timeout <= 0) + goto timed_out; __set_task_state(task, state); spin_unlock_irq(&sem->lock); - schedule(); + timeout = schedule_timeout(timeout); spin_lock_irq(&sem->lock); if (waiter.up) goto woken; } + timed_out: + list_del(&waiter.list); + result = -ETIME; + goto woken; interrupted: list_del(&waiter.list); result = -EINTR; @@ -187,17 +210,22 @@ static inline int __sched __down_common(struct semaphore *sem, long state) static noinline void __sched __down(struct semaphore *sem) { - __down_common(sem, TASK_UNINTERRUPTIBLE); + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } static noinline int __sched __down_interruptible(struct semaphore *sem) { - return __down_common(sem, TASK_INTERRUPTIBLE); + return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } static noinline int __sched __down_killable(struct semaphore *sem) { - return __down_common(sem, TASK_KILLABLE); + return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +{ + return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); } static noinline void __sched __up(struct semaphore *sem) -- cgit v1.2.3-70-g09d2 From b17170b2fac96705db3188f093f89e8e838418e4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 14 Mar 2008 14:35:22 -0400 Subject: Simplify semaphore implementation By removing the negative values of 'count' and relying on the wait_list to indicate whether we have any waiters, we can simplify the implementation by removing the protection against an unlikely race condition. Thanks to David Howells for his suggestions. Signed-off-by: Matthew Wilcox --- include/linux/semaphore.h | 9 ++---- kernel/semaphore.c | 78 +++++++++++++++-------------------------------- 2 files changed, 27 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index a107aebd914..a7125daaff9 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -15,15 +15,12 @@ /* * The spinlock controls access to the other members of the semaphore. - * 'count' is decremented by every task which calls down*() and incremented - * by every call to up(). Thus, if it is positive, it indicates how many - * more tasks may acquire the lock. If it is negative, it indicates how - * many tasks are waiting for the lock. Tasks waiting for the lock are - * kept on the wait_list. + * 'count' represents how many more tasks can acquire this semaphore. + * Tasks waiting for the lock are kept on the wait_list. */ struct semaphore { spinlock_t lock; - int count; + unsigned int count; struct list_head wait_list; }; diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 5a12a855898..bef977b1696 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -18,18 +18,8 @@ * down_trylock() and up() can be called from interrupt context. * So we have to disable interrupts when taking the lock. * - * The ->count variable, if positive, defines how many more tasks can - * acquire the semaphore. If negative, it represents how many tasks are - * waiting on the semaphore (*). 
If zero, no tasks are waiting, and no more - * tasks can acquire the semaphore. - * - * (*) Except for the window between one task calling up() and the task - * sleeping in a __down_common() waking up. In order to avoid a third task - * coming in and stealing the second task's wakeup, we leave the ->count - * negative. If we have a more complex situation, the ->count may become - * zero or negative (eg a semaphore with count = 2, three tasks attempt to - * acquire it, one sleeps, two finish and call up(), the second task to call - * up() notices that the list is empty and just increments count). + * The ->count variable defines how many more tasks can acquire the + * semaphore. If it's zero, there may be tasks waiting on the list. */ static noinline void __down(struct semaphore *sem); @@ -43,7 +33,9 @@ void down(struct semaphore *sem) unsigned long flags; spin_lock_irqsave(&sem->lock, flags); - if (unlikely(sem->count-- <= 0)) + if (likely(sem->count > 0)) + sem->count--; + else __down(sem); spin_unlock_irqrestore(&sem->lock, flags); } @@ -55,7 +47,9 @@ int down_interruptible(struct semaphore *sem) int result = 0; spin_lock_irqsave(&sem->lock, flags); - if (unlikely(sem->count-- <= 0)) + if (likely(sem->count > 0)) + sem->count--; + else result = __down_interruptible(sem); spin_unlock_irqrestore(&sem->lock, flags); @@ -69,7 +63,9 @@ int down_killable(struct semaphore *sem) int result = 0; spin_lock_irqsave(&sem->lock, flags); - if (unlikely(sem->count-- <= 0)) + if (likely(sem->count > 0)) + sem->count--; + else result = __down_killable(sem); spin_unlock_irqrestore(&sem->lock, flags); @@ -111,7 +107,9 @@ int down_timeout(struct semaphore *sem, long jiffies) int result = 0; spin_lock_irqsave(&sem->lock, flags); - if (unlikely(sem->count-- <= 0)) + if (likely(sem->count > 0)) + sem->count--; + else result = __down_timeout(sem, jiffies); spin_unlock_irqrestore(&sem->lock, flags); @@ -124,7 +122,7 @@ void up(struct semaphore *sem) unsigned long flags; spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count >= 0)) + if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); @@ -140,22 +138,6 @@ struct semaphore_waiter { int up; }; -/* - * Wake up a process waiting on a semaphore. We need to call this from both - * __up and __down_common as it's possible to race a task into the semaphore - * if it comes in at just the right time between two tasks calling up() and - * a third task waking up. This function assumes the wait_list is already - * checked for being non-empty. - */ -static noinline void __sched __up_down_common(struct semaphore *sem) -{ - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); - waiter->up = 1; - wake_up_process(waiter->task); -} - /* * Because this function is inlined, the 'state' parameter will be * constant, and thus optimised away by the compiler. 
Likewise the @@ -164,7 +146,6 @@ static noinline void __sched __up_down_common(struct semaphore *sem) static inline int __sched __down_common(struct semaphore *sem, long state, long timeout) { - int result = 0; struct task_struct *task = current; struct semaphore_waiter waiter; @@ -184,28 +165,16 @@ static inline int __sched __down_common(struct semaphore *sem, long state, timeout = schedule_timeout(timeout); spin_lock_irq(&sem->lock); if (waiter.up) - goto woken; + return 0; } timed_out: list_del(&waiter.list); - result = -ETIME; - goto woken; + return -ETIME; + interrupted: list_del(&waiter.list); - result = -EINTR; - woken: - /* - * Account for the process which woke us up. For the case where - * we're interrupted, we need to increment the count on our own - * behalf. I don't believe we can hit the case where the - * sem->count hits zero, *and* there's a second task sleeping, - * but it doesn't hurt, that's not a commonly exercised path and - * it's not a performance path either. - */ - if (unlikely((++sem->count >= 0) && !list_empty(&sem->wait_list))) - __up_down_common(sem); - return result; + return -EINTR; } static noinline void __sched __down(struct semaphore *sem) @@ -230,8 +199,9 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) static noinline void __sched __up(struct semaphore *sem) { - if (unlikely(list_empty(&sem->wait_list))) - sem->count++; - else - __up_down_common(sem); + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = 1; + wake_up_process(waiter->task); } -- cgit v1.2.3-70-g09d2 From 714493cd5468f42ca3c4f730a9c17c203abd5059 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 11 Apr 2008 15:23:52 -0400 Subject: Improve semaphore documentation Move documentation from semaphore.h to semaphore.c as requested by Andrew Morton. Also reformat to kernel-doc style and add some more notes about the implementation. Signed-off-by: Matthew Wilcox --- include/linux/semaphore.h | 39 ++--------------------- kernel/semaphore.c | 79 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index a7125daaff9..9cae64b00d6 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -4,8 +4,7 @@ * * Distributed under the terms of the GNU GPL, version 2 * - * Counting semaphores allow up to tasks to acquire the semaphore - * simultaneously. + * Please see kernel/semaphore.c for documentation of these functions */ #ifndef __LINUX_SEMAPHORE_H #define __LINUX_SEMAPHORE_H @@ -13,11 +12,7 @@ #include #include -/* - * The spinlock controls access to the other members of the semaphore. - * 'count' represents how many more tasks can acquire this semaphore. - * Tasks waiting for the lock are kept on the wait_list. - */ +/* Please don't access any members of this structure directly */ struct semaphore { spinlock_t lock; unsigned int count; @@ -46,41 +41,11 @@ static inline void sema_init(struct semaphore *sem, int val) #define init_MUTEX(sem) sema_init(sem, 1) #define init_MUTEX_LOCKED(sem) sema_init(sem, 0) -/* - * Attempt to acquire the semaphore. If another task is already holding the - * semaphore, sleep until the semaphore is released. - */ extern void down(struct semaphore *sem); - -/* - * As down(), except the sleep may be interrupted by a signal. If it is, - * this function will return -EINTR. 
- */ extern int __must_check down_interruptible(struct semaphore *sem); - -/* - * As down_interruptible(), except the sleep may only be interrupted by - * signals which are fatal to this process. - */ extern int __must_check down_killable(struct semaphore *sem); - -/* - * As down(), except this function will not sleep. It will return 0 if it - * acquired the semaphore and 1 if the semaphore was contended. This - * function may be called from any context, including interrupt and softirq. - */ extern int __must_check down_trylock(struct semaphore *sem); - -/* - * As down(), except this function will return -ETIME if it fails to - * acquire the semaphore within the specified number of jiffies. - */ extern int __must_check down_timeout(struct semaphore *sem, long jiffies); - -/* - * Release the semaphore. Unlike mutexes, up() may be called from any - * context and even by tasks which have never called down(). - */ extern void up(struct semaphore *sem); #endif /* __LINUX_SEMAPHORE_H */ diff --git a/kernel/semaphore.c b/kernel/semaphore.c index bef977b1696..5c2942e768c 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -3,6 +3,26 @@ * Author: Matthew Wilcox * * Distributed under the terms of the GNU GPL, version 2 + * + * This file implements counting semaphores. + * A counting semaphore may be acquired 'n' times before sleeping. + * See mutex.c for single-acquisition sleeping locks which enforce + * rules which allow code to be debugged more easily. + */ + +/* + * Some notes on the implementation: + * + * The spinlock controls access to the other members of the semaphore. + * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock. It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore. If it's zero, there may be tasks waiting on the wait_list. */ #include @@ -12,22 +32,23 @@ #include #include -/* - * Some notes on the implementation: - * - * down_trylock() and up() can be called from interrupt context. - * So we have to disable interrupts when taking the lock. - * - * The ->count variable defines how many more tasks can acquire the - * semaphore. If it's zero, there may be tasks waiting on the list. - */ - static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long jiffies); static noinline void __up(struct semaphore *sem); +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore. If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. + * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ void down(struct semaphore *sem) { unsigned long flags; @@ -41,6 +62,15 @@ void down(struct semaphore *sem) } EXPORT_SYMBOL(down); +/** + * down_interruptible - acquire the semaphore unless interrupted + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. 
If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. + */ int down_interruptible(struct semaphore *sem) { unsigned long flags; @@ -57,6 +87,16 @@ int down_interruptible(struct semaphore *sem) } EXPORT_SYMBOL(down_interruptible); +/** + * down_killable - acquire the semaphore unless killed + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a fatal signal, this function will return + * -EINTR. If the semaphore is successfully acquired, this function returns + * 0. + */ int down_killable(struct semaphore *sem) { unsigned long flags; @@ -78,7 +118,7 @@ EXPORT_SYMBOL(down_killable); * @sem: the semaphore to be acquired * * Try to acquire the semaphore atomically. Returns 0 if the mutex has - * been acquired successfully and 1 if it is contended. + * been acquired successfully or 1 if it it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and * mutex_trylock! Be careful about this when converting code. @@ -101,6 +141,16 @@ int down_trylock(struct semaphore *sem) } EXPORT_SYMBOL(down_trylock); +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @jiffies: how long to wait before failing + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. + */ int down_timeout(struct semaphore *sem, long jiffies) { unsigned long flags; @@ -117,6 +167,13 @@ int down_timeout(struct semaphore *sem, long jiffies) } EXPORT_SYMBOL(down_timeout); +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ void up(struct semaphore *sem) { unsigned long flags; -- cgit v1.2.3-70-g09d2 From dc7d552705215ac50a0617fcf51bb9c736255b8e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 17 Apr 2008 20:05:37 +0200 Subject: kgdb: core kgdb core code. Handles the protocol and the arch details. [ mingo@elte.hu: heavily modified, simplified and cleaned up. ] [ xemul@openvz.org: use find_task_by_pid_ns ] Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar Signed-off-by: Jan Kiszka Reviewed-by: Thomas Gleixner --- include/linux/kgdb.h | 271 ++++++++ kernel/Makefile | 1 + kernel/kgdb.c | 1693 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 2 + lib/Kconfig.kgdb | 27 + 5 files changed, 1994 insertions(+) create mode 100644 include/linux/kgdb.h create mode 100644 kernel/kgdb.c create mode 100644 lib/Kconfig.kgdb (limited to 'kernel') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h new file mode 100644 index 00000000000..b0985b79b63 --- /dev/null +++ b/include/linux/kgdb.h @@ -0,0 +1,271 @@ +/* + * This provides the callbacks and functions that KGDB needs to share between + * the core, I/O and arch-specific portions. + * + * Author: Amit Kale and + * Tom Rini + * + * 2001-2004 (c) Amit S. Kale and 2003-2005 (c) MontaVista Software, Inc. 
+ * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#ifndef _KGDB_H_ +#define _KGDB_H_ + +#include +#include +#include + +#include +#include + +struct pt_regs; + +/* + * kgdb_skipexception - Bail out of KGDB when we've been triggered. + * @exception: Exception vector number + * @regs: Current &struct pt_regs. + * + * On some architectures we need to skip a breakpoint exception when + * it occurs after a breakpoint has been removed. + */ +extern int kgdb_skipexception(int exception, struct pt_regs *regs); + +/* + * kgdb_post_primary_code - Save error vector/code numbers. + * @regs: Original pt_regs. + * @e_vector: Original error vector. + * @err_code: Original error code. + * + * This is needed on architectures which support SMP and KGDB. + * This function is called after all the secondary cpus have been put + * to a know spin state and the primary CPU has control over KGDB. + */ +extern void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, + int err_code); + +/* + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +extern void kgdb_disable_hw_debug(struct pt_regs *regs); + +struct tasklet_struct; +struct task_struct; +struct uart_port; + +/* To enter the debugger explicitly. */ +void kgdb_breakpoint(void); + +extern int kgdb_connected; + +extern atomic_t kgdb_setting_breakpoint; +extern atomic_t kgdb_cpu_doing_single_step; + +extern struct task_struct *kgdb_usethread; +extern struct task_struct *kgdb_contthread; + +enum kgdb_bptype { + BP_BREAKPOINT = 0, + BP_HARDWARE_BREAKPOINT, + BP_WRITE_WATCHPOINT, + BP_READ_WATCHPOINT, + BP_ACCESS_WATCHPOINT +}; + +enum kgdb_bpstate { + BP_UNDEFINED = 0, + BP_REMOVED, + BP_SET, + BP_ACTIVE +}; + +struct kgdb_bkpt { + unsigned long bpt_addr; + unsigned char saved_instr[BREAK_INSTR_SIZE]; + enum kgdb_bptype type; + enum kgdb_bpstate state; +}; + +#ifndef KGDB_MAX_BREAKPOINTS +# define KGDB_MAX_BREAKPOINTS 1000 +#endif + +#define KGDB_HW_BREAKPOINT 1 + +/* + * Functions each KGDB-supporting architecture must provide: + */ + +/* + * kgdb_arch_init - Perform any architecture specific initalization. + * + * This function will handle the initalization of any architecture + * specific callbacks. + */ +extern int kgdb_arch_init(void); + +/* + * kgdb_arch_exit - Perform any architecture specific uninitalization. + * + * This function will handle the uninitalization of any architecture + * specific callbacks, for dynamic registration and unregistration. + */ +extern void kgdb_arch_exit(void); + +/* + * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs + * @gdb_regs: A pointer to hold the registers in the order GDB wants. + * @regs: The &struct pt_regs of the current process. + * + * Convert the pt_regs in @regs into the format for registers that + * GDB expects, stored in @gdb_regs. + */ +extern void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs); + +/* + * sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs + * @gdb_regs: A pointer to hold the registers in the order GDB wants. + * @p: The &struct task_struct of the desired process. + * + * Convert the register values of the sleeping process in @p to + * the format that GDB expects. 
+ * This function is called when kgdb does not have access to the + * &struct pt_regs and therefore it should fill the gdb registers + * @gdb_regs with what has been saved in &struct thread_struct + * thread field during switch_to. + */ +extern void +sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p); + +/* + * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. + * @gdb_regs: A pointer to hold the registers we've received from GDB. + * @regs: A pointer to a &struct pt_regs to hold these values in. + * + * Convert the GDB regs in @gdb_regs into the pt_regs, and store them + * in @regs. + */ +extern void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs); + +/* + * kgdb_arch_handle_exception - Handle architecture specific GDB packets. + * @vector: The error vector of the exception that happened. + * @signo: The signal number of the exception that happened. + * @err_code: The error code of the exception that happened. + * @remcom_in_buffer: The buffer of the packet we have read. + * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. + * @regs: The &struct pt_regs of the current process. + * + * This function MUST handle the 'c' and 's' command packets, + * as well packets to set / remove a hardware breakpoint, if used. + * If there are additional packets which the hardware needs to handle, + * they are handled here. The code should return -1 if it wants to + * process more packets, and a %0 or %1 if it wants to exit from the + * kgdb callback. + */ +extern int +kgdb_arch_handle_exception(int vector, int signo, int err_code, + char *remcom_in_buffer, + char *remcom_out_buffer, + struct pt_regs *regs); + +/* + * kgdb_roundup_cpus - Get other CPUs into a holding pattern + * @flags: Current IRQ state + * + * On SMP systems, we need to get the attention of the other CPUs + * and get them be in a known state. This should do what is needed + * to get the other CPUs to call kgdb_wait(). Note that on some arches, + * the NMI approach is not used for rounding up all the CPUs. For example, + * in case of MIPS, smp_call_function() is used to roundup CPUs. In + * this case, we have to make sure that interrupts are enabled before + * calling smp_call_function(). The argument to this function is + * the flags that will be used when restoring the interrupts. There is + * local_irq_save() call before kgdb_roundup_cpus(). + * + * On non-SMP systems, this is not called. + */ +extern void kgdb_roundup_cpus(unsigned long flags); + +/* Optional functions. */ +extern int kgdb_validate_break_address(unsigned long addr); +extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr); +extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle); + +/* + * struct kgdb_arch - Describe architecture specific values. + * @gdb_bpt_instr: The instruction to trigger a breakpoint. + * @flags: Flags for the breakpoint, currently just %KGDB_HW_BREAKPOINT. + * @set_breakpoint: Allow an architecture to specify how to set a software + * breakpoint. + * @remove_breakpoint: Allow an architecture to specify how to remove a + * software breakpoint. + * @set_hw_breakpoint: Allow an architecture to specify how to set a hardware + * breakpoint. + * @remove_hw_breakpoint: Allow an architecture to specify how to remove a + * hardware breakpoint. + * @remove_all_hw_break: Allow an architecture to specify how to remove all + * hardware breakpoints. + * @correct_hw_break: Allow an architecture to specify how to correct the + * hardware debug registers. 
+ */ +struct kgdb_arch { + unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE]; + unsigned long flags; + + int (*set_breakpoint)(unsigned long, char *); + int (*remove_breakpoint)(unsigned long, char *); + int (*set_hw_breakpoint)(unsigned long, int, enum kgdb_bptype); + int (*remove_hw_breakpoint)(unsigned long, int, enum kgdb_bptype); + void (*remove_all_hw_break)(void); + void (*correct_hw_break)(void); +}; + +/* + * struct kgdb_io - Describe the interface for an I/O driver to talk with KGDB. + * @name: Name of the I/O driver. + * @read_char: Pointer to a function that will return one char. + * @write_char: Pointer to a function that will write one char. + * @flush: Pointer to a function that will flush any pending writes. + * @init: Pointer to a function that will initialize the device. + * @pre_exception: Pointer to a function that will do any prep work for + * the I/O driver. + * @post_exception: Pointer to a function that will do any cleanup work + * for the I/O driver. + */ +struct kgdb_io { + const char *name; + int (*read_char) (void); + void (*write_char) (u8); + void (*flush) (void); + int (*init) (void); + void (*pre_exception) (void); + void (*post_exception) (void); +}; + +extern struct kgdb_arch arch_kgdb_ops; + +extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); +extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); + +extern int kgdb_hex2long(char **ptr, long *long_val); +extern int kgdb_mem2hex(char *mem, char *buf, int count); +extern int kgdb_hex2mem(char *buf, char *mem, int count); + +extern int kgdb_isremovedbreak(unsigned long addr); + +extern int +kgdb_handle_exception(int ex_vector, int signo, int err_code, + struct pt_regs *regs); +extern int kgdb_nmicallback(int cpu, void *regs); + +extern int kgdb_single_step; +extern atomic_t kgdb_active; + +#endif /* _KGDB_H_ */ diff --git a/kernel/Makefile b/kernel/Makefile index 6c584c55a6e..05c8003718e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/kgdb.c b/kernel/kgdb.c new file mode 100644 index 00000000000..017ee782bc0 --- /dev/null +++ b/kernel/kgdb.c @@ -0,0 +1,1693 @@ +/* + * KGDB stub. + * + * Maintainer: Jason Wessel + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale + * Copyright (C) 2004 Pavel Machek + * Copyright (C) 2004-2006 Tom Rini + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2008 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe , + * Tigran Aivazian + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int kgdb_break_asap; + +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +static struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; +} kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +static int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +static struct kgdb_io *kgdb_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t passive_cpu_wait[NR_CPUS]; +static atomic_t cpu_in_kgdb[NR_CPUS]; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. */ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. 
In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overriden by architectures when needed: + */ +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + + return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +{ + int err; + + err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + if (err) + return err; + + return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +{ + return probe_kernel_write((char *)addr, + (char *)bundle, BREAK_INSTR_SIZE); +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +void __weak kgdb_disable_hw_debug(struct pt_regs *regs) +{ +} + +/* + * GDB remote protocol parser: + */ + +static const char hexchars[] = "0123456789abcdef"; + +static int hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + if ((ch >= 'A') && (ch <= 'F')) + return ch - 'A' + 10; + return -1; +} + +/* scan for the sequence $# */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (kgdb_io_ops->read_char())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = kgdb_io_ops->read_char(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(kgdb_io_ops->read_char()) << 4; + xmitcsum += hex(kgdb_io_ops->read_char()); + + if (checksum != xmitcsum) + /* failed checksum */ + kgdb_io_ops->write_char('-'); + else + /* successful transfer */ + kgdb_io_ops->write_char('+'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $#. + */ + while (1) { + kgdb_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + kgdb_io_ops->write_char(ch); + checksum += ch; + count++; + } + + kgdb_io_ops->write_char('#'); + kgdb_io_ops->write_char(hexchars[checksum >> 4]); + kgdb_io_ops->write_char(hexchars[checksum & 0xf]); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + + /* Now see what we get in reply. 
*/ + ch = kgdb_io_ops->read_char(); + + if (ch == 3) + ch = kgdb_io_ops->read_char(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + kgdb_io_ops->write_char('-'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + return; + } + } +} + +static char *pack_hex_byte(char *pkt, u8 byte) +{ + *pkt++ = hexchars[byte >> 4]; + *pkt++ = hexchars[byte & 0xf]; + + return pkt; +} + +/* + * Convert the memory pointed to by mem into hex, placing result in buf. + * Return a pointer to the last char put in buf (null). May return an error. + */ +int kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. + */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (!err) { + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + + *buf = 0; + } + + return err; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return a pointer to the character after + * the last byte written. + */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int err = 0; + char c; + + while (count-- > 0) { + c = *buf++; + if (c == 0x7d) + c = *buf++ ^ 0x20; + + err = probe_kernel_write(mem, &c, 1); + if (err) + break; + + mem++; + } + + return err; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in mem. + * Return a pointer to the character AFTER the last byte written. + * May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex(*tmp_hex--); + *tmp_raw |= hex(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, long *long_val) +{ + int hex_val; + int num = 0; + + *long_val = 0; + + while (**ptr) { + hex_val = hex(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + return num; +} + +/* Write memory due to an 'M' or 'X' packet. */ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length + 1); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hexchars[(error / 10)]; + pkt[2] = hexchars[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. 
+ */ + +#define BUF_THREAD_ID_SIZE 16 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + char *limit; + + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *id++); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + unsigned char *scan; + int i = 4; + + scan = (unsigned char *)id; + while (i--) + *scan++ = 0; + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped idle tasks: + */ + if (tid <= 0) + return idle_task(-tid); + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + +/* + * CPU debug state control: + */ + +#ifdef CONFIG_SMP +static void kgdb_wait(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + smp_wmb(); + atomic_set(&cpu_in_kgdb[cpu], 1); + + /* + * The primary CPU must be active to enter here, but this is + * guard in case the primary CPU had not been selected if + * this was an entry via nmi. + */ + while (atomic_read(&kgdb_active) == -1) + cpu_relax(); + + /* Wait till primary CPU goes completely into the debugger. */ + while (!atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) + cpu_relax(); + + /* Wait till primary CPU is done with debugging */ + while (atomic_read(&passive_cpu_wait[cpu])) + cpu_relax(); + + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; + + /* fix up hardware debug registers on local cpu */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + + /* Signal the primary CPU that we are done: */ + atomic_set(&cpu_in_kgdb[cpu], 0); + local_irq_restore(flags); +} +#endif + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } else { + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); + } +} + +/* + * SW breakpoint management: + */ +static int kgdb_activate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_set_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_ACTIVE; + } + return 0; +} + +static int kgdb_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_UNDEFINED) { + 
breakno = i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +static int kgdb_deactivate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_SET; + } + return 0; +} + +static int kgdb_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +int remove_all_break(void) +{ + unsigned long addr; + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + kgdb_break[i].state = BP_REMOVED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -1-raw_smp_processor_id(); +} + +static char gdbmsgbuf[BUFMAX + 1]; + +static void kgdb_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Return true if there is a valid kgdb I/O module. Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. + * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!kgdb_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + return 1; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. 
+ */ + remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for (i = 0; i < NR_CPUS; i++) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in __schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. + */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (err) + error_packet(remcom_out_buffer, err); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + 
kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *thread; + unsigned char thref[8]; + char *ptr; + int i; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + + if (remcom_in_buffer[1] == 'f') + ks->threadid = 1; + + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + + for (i = 0; i < 17; ks->threadid++) { + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) { + int_to_threadref(thref, ks->threadid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + i++; + } + } + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if (ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "Shadow task %d for pid 0", + (int)(-ks->threadid-1)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. 
*/ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = kgdb_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = kgdb_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + error_packet(remcom_out_buffer, -EINVAL); + return 0; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +static int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + if (kgdb_connected) { + unsigned char thref[8]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. 
+ */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. + */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kgdb_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. + */ + if (kgdb_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + kgdb_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed\n"); + WARN_ON_ONCE(1); + + return 1; + } + remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + unsigned long flags; + int error = 0; + int i, cpu; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->ex_vector = evector; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! */ + +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. 
+ */ + local_irq_save(flags); + + cpu = raw_smp_processor_id(); + + /* + * Acquire the kgdb_active lock: + */ + while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) + cpu_relax(); + + /* + * Do not start the debugger connection on this CPU if the last + * instance of the exception handler wanted to come into the + * debugger on a different CPU via a single step + */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + atomic_read(&kgdb_cpu_doing_single_step) != cpu) { + + atomic_set(&kgdb_active, -1); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + error = 1; + goto kgdb_restore; /* No I/O connection, so resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. + */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (kgdb_io_ops->pre_exception) + kgdb_io_ops->pre_exception(); + + kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; + kgdb_info[ks->cpu].task = current; + + kgdb_disable_hw_debug(ks->linux_regs); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step || !kgdb_contthread) { + for (i = 0; i < NR_CPUS; i++) + atomic_set(&passive_cpu_wait[i], 1); + } + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * spin_lock code is good enough as a barrier so we don't + * need one here: + */ + atomic_set(&cpu_in_kgdb[ks->cpu], 1); + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + for_each_online_cpu(i) { + while (!atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); + kgdb_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = NULL; + exception_level = 0; + + /* Talk to debugger with gdbserial protocol */ + error = gdb_serial_stub(ks); + + /* Call the I/O driver's post_exception routine */ + if (kgdb_io_ops->post_exception) + kgdb_io_ops->post_exception(); + + kgdb_info[ks->cpu].debuggerinfo = NULL; + kgdb_info[ks->cpu].task = NULL; + atomic_set(&cpu_in_kgdb[ks->cpu], 0); + + if (!kgdb_single_step || !kgdb_contthread) { + for (i = NR_CPUS-1; i >= 0; i--) + atomic_set(&passive_cpu_wait[i], 0); + /* + * Wait till all the CPUs have quit + * from the debugger. + */ + for_each_online_cpu(i) { + while (atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + } + +kgdb_restore: + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + local_irq_restore(flags); + + return error; +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + if (!atomic_read(&cpu_in_kgdb[cpu]) && + atomic_read(&kgdb_active) != cpu) { + kgdb_wait((struct pt_regs *)regs); + return 0; + } +#endif + return 1; +} + +void kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. 
*/ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1) + return; + + local_irq_save(flags); + kgdb_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_gdb(int key, struct tty_struct *tty) +{ + if (!kgdb_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) + printk(KERN_CRIT "Entering KGDB\n"); + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_gdb_op = { + .handler = sysrq_handle_gdb, + .help_msg = "Gdb", + .action_msg = "GDB", +}; +#endif + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kkgdb_register_io_module - register KGDB IO module + * @new_kgdb_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (kgdb_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_kgdb_io_ops->init) { + err = new_kgdb_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + kgdb_io_ops = new_kgdb_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_kgdb_io_ops->name); + + /* Arm KGDB now. */ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kkgdb_unregister_io_module - unregister KGDB IO module + * @old_kgdb_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. + */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); + kgdb_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_kgdb_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. 
It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_set(&kgdb_setting_breakpoint, 1); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_set(&kgdb_setting_breakpoint, 0); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0796c1a090c..e601d0e7ac5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -622,3 +622,5 @@ config PROVIDE_OHCI1394_DMA_INIT See Documentation/debugging-via-ohci1394.txt for more information. source "samples/Kconfig" + +source "lib/Kconfig.kgdb" diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb new file mode 100644 index 00000000000..9631ba3baaf --- /dev/null +++ b/lib/Kconfig.kgdb @@ -0,0 +1,27 @@ + +menuconfig KGDB + bool "KGDB: kernel debugging with remote gdb" + select FRAME_POINTER + depends on HAVE_ARCH_KGDB + depends on DEBUG_KERNEL && EXPERIMENTAL + help + If you say Y here, it will be possible to remotely debug the + kernel using gdb. Documentation of kernel debugger is available + at http://kgdb.sourceforge.net as well as in DocBook form + in Documentation/DocBook/. If unsure, say N. + +config HAVE_ARCH_KGDB_SHADOW_INFO + bool + +config HAVE_ARCH_KGDB + bool + +config KGDB_SERIAL_CONSOLE + tristate "KGDB: use kgdb over the serial console" + depends on KGDB + select CONSOLE_POLL + select MAGIC_SYSRQ + default y + help + Share a serial console with kgdb. Sysrq-g must be used + to break in initially. -- cgit v1.2.3-70-g09d2 From 7c3078b637882303b1dcf6a16229d0e35f6b60a5 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 15 Feb 2008 14:55:54 -0600 Subject: kgdb: clocksource watchdog In order to not trip the clocksource watchdog, kgdb must touch the clocksource watchdog on the return to normal system run state. Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 1 + kernel/kgdb.c | 4 ++++ kernel/time/clocksource.c | 12 ++++++++++++ 3 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 85778a4b120..35094479ca5 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -216,6 +216,7 @@ static inline void clocksource_calculate_interval(struct clocksource *c, /* used to install a new clocksource */ extern int clocksource_register(struct clocksource*); extern void clocksource_unregister(struct clocksource*); +extern void clocksource_touch_watchdog(void); extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 017ee782bc0..e3f60374042 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -28,6 +28,7 @@ * kind, whether express or implied. 
*/ #include +#include #include #include #include @@ -574,6 +575,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); + clocksource_touch_watchdog(); local_irq_restore(flags); } #endif @@ -1396,6 +1398,7 @@ acquirelock: atomic_read(&kgdb_cpu_doing_single_step) != cpu) { atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); local_irq_restore(flags); goto acquirelock; @@ -1487,6 +1490,7 @@ acquirelock: kgdb_restore: /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); local_irq_restore(flags); return error; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7f60097d443..f61402b1f2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -221,6 +221,18 @@ void clocksource_resume(void) spin_unlock_irqrestore(&clocksource_lock, flags); } +/** + * clocksource_touch_watchdog - Update watchdog + * + * Update the watchdog after exception contexts such as kgdb so as not + * to incorrectly trip the watchdog. + * + */ +void clocksource_touch_watchdog(void) +{ + clocksource_resume_watchdog(); +} + /** * clocksource_get_next - Returns the selected clocksource * -- cgit v1.2.3-70-g09d2 From 67baf94cd260dc37504dbd15ba3faa2d8cf8a444 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 15 Feb 2008 14:55:55 -0600 Subject: kgdb: print breakpoint removed on exception If kgdb does remove a breakpoint that had a problem on the recursion check, it should also print the address of the breakpoint. Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- kernel/kgdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index e3f60374042..319c08c92ee 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1327,7 +1327,8 @@ static int kgdb_reenter_check(struct kgdb_state *ks) exception_level = 0; kgdb_skipexception(ks->ex_vector, ks->linux_regs); kgdb_activate_sw_breakpoints(); - printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed\n"); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); WARN_ON_ONCE(1); return 1; -- cgit v1.2.3-70-g09d2 From 64e9ee3095b61d0300ea548216a57d2536611309 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 15 Feb 2008 14:55:56 -0600 Subject: kgdb: add x86 HW breakpoints Add HW breakpoints into the arch specific portion of x86 kgdb. In the current x86 kernel.org kernels HW breakpoints are changed out in a lazy fashion because there is no infrastructure around changing them when changing to a kernel task or entering the kernel mode via a system call. This lazy approach means that if a user process uses HW breakpoints, kgdb will lose out. This is an acceptable trade-off because the developer debugging the kernel is assumed to know what is going on system-wide and would be aware of this trade-off. There is a minor bug fix to the kgdb core so as to correctly call the hw breakpoint functions with a valid value from the enum. There is also a minor change to the x86_64 startup code when using early HW breakpoints. When the debugger is connected, the cpu startup code must not zero out the HW breakpoint registers or you cannot hit the breakpoints you are interested in, in the first place.
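To make the DR7 bit arithmetic in the x86 patch below easier to follow, here is a small illustrative sketch (not part of the patch, and the helper name is made up) of how one of the four x86 debug register slots gets programmed. It mirrors what kgdb_correct_hw_break() below does: slot n is enabled by the global-enable bit at position 2n+1, its type/len nibble lives at bit 16 + 4n, and type/len here are the already-encoded DR7 values that breakinfo[] stores (type 0 = execute, 1 = write, 3 = read/write; len 0 = 1 byte, 1 = 2 bytes, 3 = 4 bytes).

	/* Illustrative sketch only: arm debug register slot 'breakno'. */
	static void sketch_arm_hw_break_slot(unsigned long *dr7, int breakno,
					     unsigned long addr,
					     unsigned type, unsigned len)
	{
		unsigned long breakbit = 2UL << (breakno << 1);	/* global enable Gn */

		*dr7 |= breakbit;
		*dr7 &= ~(0xfUL << ((breakno << 2) + 16));	/* clear old type/len */
		*dr7 |= (unsigned long)((len << 2) | type) << ((breakno << 2) + 16);
		set_debugreg(addr, breakno);	/* DR0..DR3 hold the address */
	}
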
Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup64.c | 16 ++++++ kernel/kgdb.c | 4 +- 3 files changed, 156 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5d7a21119bf..7d651adcb22 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -182,6 +182,122 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) #endif } +static struct hw_breakpoint { + unsigned enabled; + unsigned type; + unsigned len; + unsigned long addr; +} breakinfo[4]; + +static void kgdb_correct_hw_break(void) +{ + unsigned long dr7; + int correctit = 0; + int breakbit; + int breakno; + + get_debugreg(dr7, 7); + for (breakno = 0; breakno < 4; breakno++) { + breakbit = 2 << (breakno << 1); + if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { + correctit = 1; + dr7 |= breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + dr7 |= ((breakinfo[breakno].len << 2) | + breakinfo[breakno].type) << + ((breakno << 2) + 16); + if (breakno >= 0 && breakno <= 3) + set_debugreg(breakinfo[breakno].addr, breakno); + + } else { + if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { + correctit = 1; + dr7 &= ~breakbit; + dr7 &= ~(0xf0000 << (breakno << 2)); + } + } + } + if (correctit) + set_debugreg(dr7, 7); +} + +static int +kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < 4; i++) + if (breakinfo[i].addr == addr && breakinfo[i].enabled) + break; + if (i == 4) + return -1; + + breakinfo[i].enabled = 0; + + return 0; +} + +static void kgdb_remove_all_hw_break(void) +{ + int i; + + for (i = 0; i < 4; i++) + memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); +} + +static int +kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + unsigned type; + int i; + + for (i = 0; i < 4; i++) + if (!breakinfo[i].enabled) + break; + if (i == 4) + return -1; + + switch (bptype) { + case BP_HARDWARE_BREAKPOINT: + type = 0; + len = 1; + break; + case BP_WRITE_WATCHPOINT: + type = 1; + break; + case BP_ACCESS_WATCHPOINT: + type = 3; + break; + default: + return -1; + } + + if (len == 1 || len == 2 || len == 4) + breakinfo[i].len = len - 1; + else + return -1; + + breakinfo[i].enabled = 1; + breakinfo[i].addr = addr; + breakinfo[i].type = type; + + return 0; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +void kgdb_disable_hw_debug(struct pt_regs *regs) +{ + /* Disable hardware debugging while we are in kgdb: */ + set_debugreg(0UL, 7); +} + /** * kgdb_post_primary_code - Save error vector/code numbers. * @regs: Original pt_regs. 
@@ -243,6 +359,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, struct pt_regs *linux_regs) { unsigned long addr; + unsigned long dr6; char *ptr; int newPC; @@ -269,6 +386,22 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, } } + get_debugreg(dr6, 6); + if (!(dr6 & 0x4000)) { + int breakno; + + for (breakno = 0; breakno < 4; breakno++) { + if (dr6 & (1 << breakno) && + breakinfo[breakno].type == 0) { + /* Set restore flag: */ + linux_regs->flags |= X86_EFLAGS_RF; + break; + } + } + } + set_debugreg(0UL, 6); + kgdb_correct_hw_break(); + return 0; } @@ -426,4 +559,9 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) struct kgdb_arch arch_kgdb_ops = { /* Breakpoint instruction: */ .gdb_bpt_instr = { 0xcc }, + .flags = KGDB_HW_BREAKPOINT, + .set_hw_breakpoint = kgdb_set_hw_break, + .remove_hw_breakpoint = kgdb_remove_hw_break, + .remove_all_hw_break = kgdb_remove_all_hw_break, + .correct_hw_break = kgdb_correct_hw_break, }; diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index e24c4567709..143aa78c566 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -327,6 +328,17 @@ void __cpuinit cpu_init (void) load_TR_desc(); load_LDT(&init_mm.context); +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif /* * Clear all 6 debug registers: */ @@ -337,6 +349,10 @@ void __cpuinit cpu_init (void) set_debugreg(0UL, 3); set_debugreg(0UL, 6); set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } +#endif fpu_init(); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 319c08c92ee..68aea78407e 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1139,10 +1139,10 @@ static void gdb_cmd_break(struct kgdb_state *ks) error = kgdb_remove_sw_break(addr); else if (remcom_in_buffer[0] == 'Z') error = arch_kgdb_ops.set_hw_breakpoint(addr, - (int)length, *bpt_type); + (int)length, *bpt_type - '0'); else if (remcom_in_buffer[0] == 'z') error = arch_kgdb_ops.remove_hw_breakpoint(addr, - (int) length, *bpt_type); + (int) length, *bpt_type - '0'); if (error == 0) strcpy(remcom_out_buffer, "OK"); -- cgit v1.2.3-70-g09d2 From b4b8ac524d9b6ed7229017145afa1d7afbea4a48 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 20 Feb 2008 13:33:38 -0600 Subject: kgdb: fix optional arch functions and probe_kernel_* Fix two regressions dealing with the kgdb core. 1) kgdb_skipexception and kgdb_post_primary_code are optional functions that are only required on archs that need special exception fixups. 2) The kernel address space scope must be set on any probe_kernel_* function or archs such as ARCH=arm will not allow access to the kernel memory space. As an example, the full kernel address space must be allowed when you use the kernel debugger to inspect a system call.
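With the second fix applied, probe_kernel_read() ends up looking roughly like the sketch below (reconstructed from the mm/maccess.c hunks that follow, shown here only for readability). The set_fs(KERNEL_DS)/set_fs(old_fs) pair is what temporarily widens the address limit so the copy may touch any kernel address; the pagefault_disable()/pagefault_enable() pair makes a bad address fail fast instead of sleeping. probe_kernel_write() gets the identical treatment.

	long probe_kernel_read(void *dst, void *src, size_t size)
	{
		long ret;
		mm_segment_t old_fs = get_fs();

		set_fs(KERNEL_DS);	/* allow the whole kernel address space */
		pagefault_disable();	/* a faulting address must fail, not sleep */
		ret = __copy_from_user_inatomic(dst,
				(__force const void __user *)src, size);
		pagefault_enable();
		set_fs(old_fs);		/* always restore the caller's limit */

		return ret ? -EFAULT : 0;
	}
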
Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- kernel/kgdb.c | 11 +++++++++++ mm/maccess.c | 6 ++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 68aea78407e..31425e0fbf2 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -200,6 +200,17 @@ int __weak kgdb_arch_init(void) return 0; } +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +void __weak +kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) +{ + return; +} + /** * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. * @regs: Current &struct pt_regs. diff --git a/mm/maccess.c b/mm/maccess.c index 24f81b97140..ac40796cfb1 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -17,11 +17,14 @@ long probe_kernel_read(void *dst, void *src, size_t size) { long ret; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); pagefault_disable(); ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, size); pagefault_enable(); + set_fs(old_fs); return ret ? -EFAULT : 0; } @@ -39,10 +42,13 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); long probe_kernel_write(void *dst, void *src, size_t size) { long ret; + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); pagefault_disable(); ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); pagefault_enable(); + set_fs(old_fs); return ret ? -EFAULT : 0; } -- cgit v1.2.3-70-g09d2 From 737a460f21febe551ff1d2299b63bae9b154078f Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 7 Mar 2008 16:34:16 -0600 Subject: kgdb: fix several kgdb regressions kgdb core fixes: - Check to see that mm->mmap_cache is not null before calling flush_cache_range(), else on arch=ARM it will cause a fatal fault. - Breakpoints should only be restored if they are in the BP_ACTIVE state. - Fix a typo in comments to "kgdb_register_io_module" x86 kgdb fixes: - Fix the x86 arch handler such that on a kill or detach that the appropriate cleanup on the single stepping flags gets run. - Add in the DIE_NMIWATCHDOG call for x86_64 - Touch the nmi watchdog before returning the system to normal operation after performing any kind of kgdb operation, else the possibility exists to trigger the watchdog. 
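As background for the BP_ACTIVE fix below, the kgdb core tracks each breakpoint slot through a small state machine; the sketch below (reconstructed from the core code above, not taken from this patch) summarizes the invariant the fix relies on.

	/*
	 * Sketch, not from the patch: which slots actually have the break
	 * instruction written into kernel text.
	 *
	 *   BP_UNDEFINED: slot unused
	 *   BP_SET:       requested by gdb, original instruction still in place
	 *   BP_ACTIVE:    break instruction currently patched into the text
	 *   BP_REMOVED:   removed by gdb, original instruction restored
	 *
	 * Only BP_ACTIVE entries need their saved instruction written back
	 * when tearing everything down, which is why remove_all_break()
	 * should skip the other states.
	 */
	static int sketch_bp_is_patched_in(const struct kgdb_bkpt *bp)
	{
		return bp->state == BP_ACTIVE;
	}
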
Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 4 ++++ arch/x86/kernel/traps_64.c | 7 ++++++- kernel/kgdb.c | 14 ++++++++------ 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 7d651adcb22..8c7e555f6d3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -370,6 +370,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ptr = &remcomInBuffer[1]; if (kgdb_hex2long(&ptr, &addr)) linux_regs->ip = addr; + case 'D': + case 'k': newPC = linux_regs->ip; /* clear the trace bit */ @@ -480,6 +482,8 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs)) return NOTIFY_DONE; + /* Must touch watchdog before return to normal operation */ + touch_nmi_watchdog(); return NOTIFY_STOP; } diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 055b1650c69..4e073320e70 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -600,8 +600,13 @@ void die(const char * str, struct pt_regs * regs, long err) void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { - unsigned long flags = oops_begin(); + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == + NOTIFY_STOP) + return; + flags = oops_begin(); /* * We are in trouble anyway, lets at least try * to get a message out. diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 31425e0fbf2..85b7e5b934a 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -600,7 +600,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm) { + if (current->mm && current->mm->mmap_cache) { flush_cache_range(current->mm->mmap_cache, addr, addr + BREAK_INSTR_SIZE); } else { @@ -729,14 +729,16 @@ int remove_all_break(void) /* Clear memory breakpoints. */ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { - if (kgdb_break[i].state != BP_SET) - continue; + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; addr = kgdb_break[i].bpt_addr; error = kgdb_arch_remove_breakpoint(addr, kgdb_break[i].saved_instr); if (error) - return error; - kgdb_break[i].state = BP_REMOVED; + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; } /* Clear hardware breakpoints. */ @@ -1605,7 +1607,7 @@ static void kgdb_initial_breakpoint(void) } /** - * kkgdb_register_io_module - register KGDB IO module + * kgdb_register_io_module - register KGDB IO module * @new_kgdb_io_ops: the io ops vector * * Register it with the KGDB core. -- cgit v1.2.3-70-g09d2 From 56fb70932964927597ce30bbd820471633c72adc Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 1 Apr 2008 16:55:27 -0500 Subject: kgdb: fix SMP NMI kgdb_handle_exception exit race Fix the problem of protecting the kgdb handle_exception exit which had an NMI race condition, while trying to restore normal system operation. There was a small window after the master processor sets cpu_in_debug to zero but before it has set kgdb_active to zero where a non-master processor in an SMP system could receive an NMI and re-enter the kgdb_wait() loop. As long as the master processor sets the cpu_in_debug before sending the cpu roundup the cpu_in_debug variable can also be used to guard against the race condition. 
The kgdb_wait() function no longer needs to check kgdb_active because it is done in the arch specific code and handled along with the nmi traps at the low level. This also allows kgdb_wait() to exit correctly if it was entered for some unknown reason due to a spurious NMI that could not be handled by the arch specific code. Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- kernel/kgdb.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 85b7e5b934a..4d1b3c23237 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -561,18 +561,6 @@ static void kgdb_wait(struct pt_regs *regs) smp_wmb(); atomic_set(&cpu_in_kgdb[cpu], 1); - /* - * The primary CPU must be active to enter here, but this is - * guard in case the primary CPU had not been selected if - * this was an entry via nmi. - */ - while (atomic_read(&kgdb_active) == -1) - cpu_relax(); - - /* Wait till primary CPU goes completely into the debugger. */ - while (!atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) - cpu_relax(); - /* Wait till primary CPU is done with debugging */ while (atomic_read(&passive_cpu_wait[cpu])) cpu_relax(); @@ -1447,18 +1435,18 @@ acquirelock: atomic_set(&passive_cpu_wait[i], 1); } -#ifdef CONFIG_SMP - /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) - kgdb_roundup_cpus(flags); -#endif - /* * spin_lock code is good enough as a barrier so we don't * need one here: */ atomic_set(&cpu_in_kgdb[ks->cpu], 1); +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + /* * Wait for the other CPUs to be notified and be waiting for us: */ @@ -1514,7 +1502,8 @@ int kgdb_nmicallback(int cpu, void *regs) { #ifdef CONFIG_SMP if (!atomic_read(&cpu_in_kgdb[cpu]) && - atomic_read(&kgdb_active) != cpu) { + atomic_read(&kgdb_active) != cpu && + atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { kgdb_wait((struct pt_regs *)regs); return 0; } -- cgit v1.2.3-70-g09d2 From 1a9a3e76dde191f82f7a8a66059dcbb4a9f63ff3 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Tue, 1 Apr 2008 16:55:28 -0500 Subject: kgdb: always use icache flush for sw breakpoints On the ppc 4xx architecture the instruction cache must be flushed as well as the data cache. This patch just makes it generic for all architectures where CACHE_FLUSH_IS_SAFE is set to 1. 
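To make the motivation concrete, planting a software breakpoint roughly follows the shape below; this is a simplified, illustrative sketch (set_sw_breakpoint() is hypothetical), not the actual kgdb or arch code:

    #include <linux/string.h>
    #include <asm/cacheflush.h>

    /*
     * Save the original instruction, write the trap opcode through the
     * data cache, then invalidate the instruction cache for that range so
     * the CPU fetches the new opcode.  On Harvard-style caches such as
     * ppc 4xx, skipping the icache flush can leave the stale instruction
     * visible to the instruction fetch unit.
     */
    static int set_sw_breakpoint(unsigned long addr, unsigned char *saved,
				 const unsigned char *break_insn, int size)
    {
	    memcpy(saved, (void *)addr, size);
	    memcpy((void *)addr, break_insn, size);
	    flush_icache_range(addr, addr + size);
	    return 0;
    }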
Signed-off-by: Jason Wessel Signed-off-by: Ingo Molnar --- kernel/kgdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 4d1b3c23237..1bd0ec1c80b 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -591,9 +591,9 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (current->mm && current->mm->mmap_cache) { flush_cache_range(current->mm->mmap_cache, addr, addr + BREAK_INSTR_SIZE); - } else { - flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } /* -- cgit v1.2.3-70-g09d2 From 0e04388f0189fa1f6812a8e1cb6172136eada87e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 17 Apr 2008 11:37:15 +0800 Subject: cgroup: fix a race condition in manipulating tsk->cg_list When I ran a test program to fork mass processes and at the same time 'cat /cgroup/tasks', I got the following oops: ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:72! invalid opcode: 0000 [#1] SMP Pid: 4178, comm: a.out Not tainted (2.6.25-rc9 #72) ... Call Trace: [] ? cgroup_exit+0x55/0x94 [] ? do_exit+0x217/0x5ba [] ? do_group_exit+0.65/0x7c [] ? sys_exit_group+0xf/0x11 [] ? syscall_call+0x7/0xb [] ? init_cyrix+0x2fa/0x479 ... EIP: [] list_del+0x35/0x53 SS:ESP 0068:ebc7df4 ---[ end trace caffb7332252612b ]--- Fixing recursive fault but reboot is needed! After digging into the code and debugging, I finlly found out a race situation: do_exit() ->cgroup_exit() ->if (!list_empty(&tsk->cg_list)) list_del(&tsk->cg_list); cgroup_iter_start() ->cgroup_enable_task_cg_list() ->list_add(&tsk->cg_list, ..); In this case the list won't be deleted though the process has exited. We got two bug reports in the past, which seem to be the same bug as this one: http://lkml.org/lkml/2008/3/5/332 http://lkml.org/lkml/2007/10/17/224 Actually sometimes I got oops on list_del, sometimes oops on list_add. And I can change my test program a bit to trigger other oops. The patch has been tested both on x86_32 and x86_64. Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Andrew Morton Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2727f923835..6d8de051382 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1722,7 +1722,12 @@ void cgroup_enable_task_cg_lists(void) use_task_css_set_links = 1; do_each_thread(g, p) { task_lock(p); - if (list_empty(&p->cg_list)) + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + */ + if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); -- cgit v1.2.3-70-g09d2 From 18c98b65279c00c3c983a4525161207f1aa6a04b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 17 Apr 2008 18:44:38 -0700 Subject: ptrace_signal subroutine This breaks out the ptrace handling from get_signal_to_deliver into a new subroutine. The actual code there doesn't change, and it gets inlined into nearly identical compiled code. This makes the function substantially shorter and thus easier to read, and it nicely isolates the ptrace magic. 
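From the tracer's side, the behaviour this code preserves can be illustrated with a small userspace sketch (a hypothetical debugger snippet, not part of the patch): after a signal-stop, the debugger may resume the child with a different signal, and unless it also issues PTRACE_SETSIGINFO the kernel path below rebuilds the siginfo as SI_USER originating from the tracer.

    #include <sys/ptrace.h>
    #include <sys/wait.h>
    #include <sys/types.h>

    /* Resume a stopped tracee, replacing the pending signal. */
    static void resume_with(pid_t child, int replacement_sig)
    {
	    int status;

	    if (waitpid(child, &status, 0) > 0 && WIFSTOPPED(status))
		    ptrace(PTRACE_CONT, child, 0, replacement_sig);
    }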
Signed-off-by: Roland McGrath Acked-by: Kyle McMartin Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 71 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6af1210092c..cc8303cd093 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1757,6 +1757,45 @@ static int do_signal_stop(int signr) return 1; } +static int ptrace_signal(int signr, siginfo_t *info, + struct pt_regs *regs, void *cookie) +{ + if (!(current->ptrace & PT_PTRACED)) + return signr; + + ptrace_signal_deliver(regs, cookie); + + /* Let the debugger run. */ + ptrace_stop(signr, 0, info); + + /* We're back. Did the debugger cancel the sig? */ + signr = current->exit_code; + if (signr == 0) + return signr; + + current->exit_code = 0; + + /* Update the siginfo structure if the signal has + changed. If the debugger wanted something + specific in the siginfo structure then it should + have updated *info via PTRACE_SETSIGINFO. */ + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_pid_vnr(current->parent); + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(¤t->blocked, signr)) { + specific_send_sig_info(signr, info, current); + signr = 0; + } + + return signr; +} + int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie) { @@ -1785,36 +1824,10 @@ relock: if (!signr) break; /* will return 0 */ - if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ - ptrace_stop(signr, 0, info); - - /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; - if (signr == 0) - continue; - - current->exit_code = 0; - - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ - if (signr != info->si_signo) { - info->si_signo = signr; - info->si_errno = 0; - info->si_code = SI_USER; - info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; - } - - /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(¤t->blocked, signr)) { - specific_send_sig_info(signr, info, current); + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, regs, cookie); + if (!signr) continue; - } } ka = ¤t->sighand->action[signr-1]; -- cgit v1.2.3-70-g09d2 From 2a862b32f3da5a2120043921ad301322ad526084 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 1 Mar 2008 21:54:38 +0200 Subject: Audit: use new LSM hooks instead of SELinux exports Stop using the following exported SELinux interfaces: selinux_get_inode_sid(inode, sid) selinux_get_ipc_sid(ipcp, sid) selinux_get_task_sid(tsk, sid) selinux_sid_to_string(sid, ctx, len) kfree(ctx) and use following generic LSM equivalents respectively: security_inode_getsecid(inode, secid) security_ipc_getsecid*(ipcp, secid) security_task_getsecid(tsk, secid) security_sid_to_secctx(sid, ctx, len) security_release_secctx(ctx, len) Call security_release_secctx only if security_secid_to_secctx succeeded. Signed-off-by: Casey Schaufler Signed-off-by: Ahmed S. 
Darwish Acked-by: James Morris Reviewed-by: Paul Moore --- kernel/audit.c | 17 ++++++++-------- kernel/auditfilter.c | 8 +++++--- kernel/auditsc.c | 55 +++++++++++++++++++++++++++------------------------- 3 files changed, 43 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b782b046543..784a48e9f38 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -265,13 +265,13 @@ static int audit_log_config_change(char *function_name, int new, int old, char *ctx = NULL; u32 len; - rc = selinux_sid_to_string(sid, &ctx, &len); + rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) { audit_log_format(ab, " sid=%u", sid); allow_changes = 0; /* Something weird, deny request */ } else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); } } audit_log_format(ab, " res=%d", allow_changes); @@ -550,12 +550,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, audit_log_format(*ab, "user pid=%d uid=%u auid=%u", pid, uid, auid); if (sid) { - rc = selinux_sid_to_string(sid, &ctx, &len); + rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) audit_log_format(*ab, " ssid=%u", sid); - else + else { audit_log_format(*ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } return rc; @@ -758,18 +759,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); if (err) return err; sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - kfree(ctx); + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; memcpy(sig_data->ctx, ctx, len); - kfree(ctx); + security_release_secctx(ctx, len); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2f2914b7cc3..35e58a146ef 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "audit.h" @@ -1515,11 +1516,12 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, if (sid) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string(sid, &ctx, &len)) + if (security_secid_to_secctx(sid, &ctx, &len)) audit_log_format(ab, " ssid=%u", sid); - else + else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " op=%s rule key=", action); if (rule->filterkey) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 782262e4107..6a83c706b50 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -530,7 +530,7 @@ static int audit_filter_rules(struct task_struct *tsk, logged upon error */ if (f->se_rule) { if (need_sid) { - selinux_get_task_sid(tsk, &sid); + security_task_getsecid(tsk, &sid); need_sid = 0; } result = selinux_audit_rule_match(sid, f->type, @@ -885,11 +885,11 @@ void audit_log_task_context(struct audit_buffer *ab) int error; u32 sid; - selinux_get_task_sid(current, &sid); + security_task_getsecid(current, &sid); if (!sid) return; - error = selinux_sid_to_string(sid, &ctx, &len); + error = security_secid_to_secctx(sid, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; @@ -897,7 +897,7 @@ void audit_log_task_context(struct audit_buffer *ab) } audit_log_format(ab, " subj=%s", ctx); - 
kfree(ctx); + security_release_secctx(ctx, len); return; error_path: @@ -941,7 +941,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, u32 sid, char *comm) { struct audit_buffer *ab; - char *s = NULL; + char *ctx = NULL; u32 len; int rc = 0; @@ -951,15 +951,16 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, uid, sessionid); - if (selinux_sid_to_string(sid, &s, &len)) { + if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; - } else - audit_log_format(ab, " obj=%s", s); + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); - kfree(s); return rc; } @@ -1271,14 +1272,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (axi->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string( + if (security_secid_to_secctx( axi->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", axi->osid); call_panic = 1; - } else + } else { audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } break; } @@ -1392,13 +1394,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (n->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string( + if (security_secid_to_secctx( n->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", n->osid); call_panic = 2; - } else + } else { audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_end(ab); @@ -1775,7 +1778,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; - selinux_get_inode_sid(inode, &name->osid); + security_inode_getsecid(inode, &name->osid); } /** @@ -2190,8 +2193,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) ax->uid = ipcp->uid; ax->gid = ipcp->gid; ax->mode = ipcp->mode; - selinux_get_ipc_sid(ipcp, &ax->osid); - + security_ipc_getsecid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC; ax->d.next = context->aux; context->aux = (void *)ax; @@ -2343,7 +2345,7 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = t->uid; context->target_sessionid = audit_get_sessionid(t); - selinux_get_task_sid(t, &context->target_sid); + security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); } @@ -2371,7 +2373,7 @@ int __audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = tsk->loginuid; else audit_sig_uid = tsk->uid; - selinux_get_task_sid(tsk, &audit_sig_sid); + security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) return 0; @@ -2384,7 +2386,7 @@ int __audit_signal_info(int sig, struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t->uid; ctx->target_sessionid = audit_get_sessionid(t); - selinux_get_task_sid(t, &ctx->target_sid); + security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); return 0; } @@ -2405,7 +2407,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t->uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - selinux_get_task_sid(t, 
&axp->target_sid[axp->pid_count]); + security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); axp->pid_count++; @@ -2435,16 +2437,17 @@ void audit_core_dumps(long signr) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", auid, current->uid, current->gid, sessionid); - selinux_get_task_sid(current, &sid); + security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string(sid, &ctx, &len)) + if (security_secid_to_secctx(sid, &ctx, &len)) audit_log_format(ab, " ssid=%u", sid); - else + else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); -- cgit v1.2.3-70-g09d2 From d7a96f3a1ae279a2129653d6cb18d722f2f00f91 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 1 Mar 2008 22:01:11 +0200 Subject: Audit: internally use the new LSM audit hooks Convert Audit to use the new LSM Audit hooks instead of the exported SELinux interface. Basically, use: security_audit_rule_init secuirty_audit_rule_free security_audit_rule_known security_audit_rule_match instad of (respectively) : selinux_audit_rule_init selinux_audit_rule_free audit_rule_has_selinux selinux_audit_rule_match Signed-off-by: Casey Schaufler Signed-off-by: Ahmed S. Darwish Acked-by: James Morris --- kernel/audit.c | 7 +----- kernel/auditfilter.c | 61 +++++++++++++++------------------------------------- kernel/auditsc.c | 9 ++++---- 3 files changed, 22 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 784a48e9f38..a7b16086d36 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -21,7 +21,7 @@ * * Written by Rickard E. (Rik) Faith * - * Goals: 1) Integrate fully with SELinux. + * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record @@ -55,7 +55,6 @@ #include #include #include -#include #include #include #include @@ -882,10 +881,6 @@ static int __init audit_init(void) audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; - /* Register the callback with selinux. This callback will be invoked - * when a new policy is loaded. */ - selinux_audit_set_callback(&selinux_audit_rule_update); - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); #ifdef CONFIG_AUDITSYSCALL diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 35e58a146ef..7c69cb5e44f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "audit.h" /* @@ -39,7 +38,7 @@ * Synchronizes writes and blocking reads of audit's filterlist * data. Rcu is used to traverse the filterlist and access * contents of structs audit_entry, audit_watch and opaque - * selinux rules during filtering. If modified, these structures + * LSM rules during filtering. If modified, these structures * must be copied and replace their counterparts in the filterlist. * An audit_parent struct is not accessed during filtering, so may * be written directly provided audit_filter_mutex is held. 
@@ -141,7 +140,7 @@ static inline void audit_free_rule(struct audit_entry *e) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; kfree(f->se_str); - selinux_audit_rule_free(f->se_rule); + security_audit_rule_free(f->se_rule); } kfree(e->rule.fields); kfree(e->rule.filterkey); @@ -598,12 +597,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; entry->rule.buflen += f->val; - err = selinux_audit_rule_init(f->type, f->op, str, - &f->se_rule); + err = security_audit_rule_init(f->type, f->op, str, + (void **)&f->se_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for selinux " + printk(KERN_WARNING "audit rule for LSM " "\'%s\' is invalid\n", str); err = 0; } @@ -863,9 +862,9 @@ out: return new; } -/* Duplicate selinux field information. The se_rule is opaque, so must be +/* Duplicate LSM field information. The se_rule is opaque, so must be * re-initialized. */ -static inline int audit_dupe_selinux_field(struct audit_field *df, +static inline int audit_dupe_lsm_field(struct audit_field *df, struct audit_field *sf) { int ret = 0; @@ -878,12 +877,12 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, df->se_str = se_str; /* our own (refreshed) copy of se_rule */ - ret = selinux_audit_rule_init(df->type, df->op, df->se_str, - &df->se_rule); + ret = security_audit_rule_init(df->type, df->op, df->se_str, + (void **)&df->se_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for selinux \'%s\' is " + printk(KERN_WARNING "audit rule for LSM \'%s\' is " "invalid\n", df->se_str); ret = 0; } @@ -892,7 +891,7 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, } /* Duplicate an audit rule. This will be a deep copy with the exception - * of the watch - that pointer is carried over. The selinux specific fields + * of the watch - that pointer is carried over. The LSM specific fields * will be updated in the copy. The point is to be able to replace the old * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from @@ -945,7 +944,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - err = audit_dupe_selinux_field(&new->fields[i], + err = audit_dupe_lsm_field(&new->fields[i], &old->fields[i]); break; case AUDIT_FILTERKEY: @@ -1763,38 +1762,12 @@ unlock_and_return: return result; } -/* Check to see if the rule contains any selinux fields. Returns 1 if there - are selinux fields specified in the rule, 0 otherwise. */ -static inline int audit_rule_has_selinux(struct audit_krule *rule) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - switch (f->type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - return 1; - } - } - - return 0; -} - /* This function will re-initialize the se_rule field of all applicable rules. 
- * It will traverse the filter lists serarching for rules that contain selinux + * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the - * selinux field is re-initialized, and the old rule is replaced with the + * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ -int selinux_audit_rule_update(void) +int audit_update_lsm_rules(void) { struct audit_entry *entry, *n, *nentry; struct audit_watch *watch; @@ -1806,7 +1779,7 @@ int selinux_audit_rule_update(void) for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!audit_rule_has_selinux(&entry->rule)) + if (!security_audit_rule_known(&entry->rule)) continue; watch = entry->rule.watch; @@ -1817,7 +1790,7 @@ int selinux_audit_rule_update(void) * return value */ if (!err) err = PTR_ERR(nentry); - audit_panic("error updating selinux filters"); + audit_panic("error updating LSM filters"); if (watch) list_del(&entry->rule.rlist); list_del_rcu(&entry->list); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6a83c706b50..c0700535e5c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include @@ -533,7 +532,7 @@ static int audit_filter_rules(struct task_struct *tsk, security_task_getsecid(tsk, &sid); need_sid = 0; } - result = selinux_audit_rule_match(sid, f->type, + result = security_audit_rule_match(sid, f->type, f->op, f->se_rule, ctx); @@ -549,12 +548,12 @@ static int audit_filter_rules(struct task_struct *tsk, if (f->se_rule) { /* Find files that match */ if (name) { - result = selinux_audit_rule_match( + result = security_audit_rule_match( name->osid, f->type, f->op, f->se_rule, ctx); } else if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (selinux_audit_rule_match( + if (security_audit_rule_match( ctx->names[j].osid, f->type, f->op, f->se_rule, ctx)) { @@ -570,7 +569,7 @@ static int audit_filter_rules(struct task_struct *tsk, aux = aux->next) { if (aux->type == AUDIT_IPC) { struct audit_aux_data_ipcctl *axi = (void *)aux; - if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { + if (security_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { ++result; break; } -- cgit v1.2.3-70-g09d2 From 9d57a7f9e23dc30783d245280fc9907cf2c87837 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 1 Mar 2008 22:03:14 +0200 Subject: SELinux: use new audit hooks, remove redundant exports Setup the new Audit LSM hooks for SELinux. Remove the now redundant exported SELinux Audit interface. Audit: Export 'audit_krule' and 'audit_field' to the public since their internals are needed by the implementation of the new LSM hook 'audit_rule_known'. Signed-off-by: Casey Schaufler Signed-off-by: Ahmed S. 
Darwish Acked-by: James Morris --- include/linux/audit.h | 29 +++++++++++++++++ include/linux/selinux.h | 72 ------------------------------------------ kernel/audit.h | 25 --------------- security/selinux/hooks.c | 8 +++++ security/selinux/ss/services.c | 45 +++++++++++++++++++------- 5 files changed, 71 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 2af9ec02501..04869c96016 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -353,6 +353,33 @@ struct netlink_skb_parms; struct linux_binprm; struct mq_attr; struct mqstat; +struct audit_watch; +struct audit_tree; + +struct audit_krule { + int vers_ops; + u32 flags; + u32 listnr; + u32 action; + u32 mask[AUDIT_BITMASK_SIZE]; + u32 buflen; /* for data alloc on list rules */ + u32 field_count; + char *filterkey; /* ties events to rules */ + struct audit_field *fields; + struct audit_field *arch_f; /* quick access to arch field */ + struct audit_field *inode_f; /* quick access to an inode field */ + struct audit_watch *watch; /* associated watch */ + struct audit_tree *tree; /* associated watched tree */ + struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ +}; + +struct audit_field { + u32 type; + u32 val; + u32 op; + char *se_str; + void *se_rule; +}; #define AUDITSC_INVALID 0 #define AUDITSC_SUCCESS 1 @@ -536,6 +563,8 @@ extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct path *path); extern void audit_log_lost(const char *message); +extern int audit_update_lsm_rules(void); + /* Private API (for audit.c only) */ extern int audit_filter_user(struct netlink_skb_parms *cb, int type); extern int audit_filter_type(int type); diff --git a/include/linux/selinux.h b/include/linux/selinux.h index 24b0af1c4ca..20f965d4b04 100644 --- a/include/linux/selinux.h +++ b/include/linux/selinux.h @@ -20,54 +20,6 @@ struct kern_ipc_perm; #ifdef CONFIG_SECURITY_SELINUX -/** - * selinux_audit_rule_init - alloc/init an selinux audit rule structure. - * @field: the field this rule refers to - * @op: the operater the rule uses - * @rulestr: the text "target" of the rule - * @rule: pointer to the new rule structure returned via this - * - * Returns 0 if successful, -errno if not. On success, the rule structure - * will be allocated internally. The caller must free this structure with - * selinux_audit_rule_free() after use. - */ -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, - struct selinux_audit_rule **rule); - -/** - * selinux_audit_rule_free - free an selinux audit rule structure. - * @rule: pointer to the audit rule to be freed - * - * This will free all memory associated with the given rule. - * If @rule is NULL, no operation is performed. - */ -void selinux_audit_rule_free(struct selinux_audit_rule *rule); - -/** - * selinux_audit_rule_match - determine if a context ID matches a rule. - * @sid: the context ID to check - * @field: the field this rule refers to - * @op: the operater the rule uses - * @rule: pointer to the audit rule to check against - * @actx: the audit context (can be NULL) associated with the check - * - * Returns 1 if the context id matches the rule, 0 if it does not, and - * -errno on failure. - */ -int selinux_audit_rule_match(u32 sid, u32 field, u32 op, - struct selinux_audit_rule *rule, - struct audit_context *actx); - -/** - * selinux_audit_set_callback - set the callback for policy reloads. 
- * @callback: the function to call when the policy is reloaded - * - * This sets the function callback function that will update the rules - * upon policy reloads. This callback should rebuild all existing rules - * using selinux_audit_rule_init(). - */ -void selinux_audit_set_callback(int (*callback)(void)); - /** * selinux_string_to_sid - map a security context string to a security ID * @str: the security context string to be mapped @@ -111,30 +63,6 @@ void selinux_secmark_refcount_inc(void); void selinux_secmark_refcount_dec(void); #else -static inline int selinux_audit_rule_init(u32 field, u32 op, - char *rulestr, - struct selinux_audit_rule **rule) -{ - return -EOPNOTSUPP; -} - -static inline void selinux_audit_rule_free(struct selinux_audit_rule *rule) -{ - return; -} - -static inline int selinux_audit_rule_match(u32 sid, u32 field, u32 op, - struct selinux_audit_rule *rule, - struct audit_context *actx) -{ - return 0; -} - -static inline void selinux_audit_set_callback(int (*callback)(void)) -{ - return; -} - static inline int selinux_string_to_sid(const char *str, u32 *sid) { *sid = 0; diff --git a/kernel/audit.h b/kernel/audit.h index 2554bd524fd..3cfc54ee3e1 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -65,34 +65,9 @@ struct audit_watch { struct list_head rules; /* associated rules */ }; -struct audit_field { - u32 type; - u32 val; - u32 op; - char *se_str; - struct selinux_audit_rule *se_rule; -}; - struct audit_tree; struct audit_chunk; -struct audit_krule { - int vers_ops; - u32 flags; - u32 listnr; - u32 action; - u32 mask[AUDIT_BITMASK_SIZE]; - u32 buflen; /* for data alloc on list rules */ - u32 field_count; - char *filterkey; /* ties events to rules */ - struct audit_field *fields; - struct audit_field *arch_f; /* quick access to arch field */ - struct audit_field *inode_f; /* quick access to an inode field */ - struct audit_watch *watch; /* associated watch */ - struct audit_tree *tree; /* associated watched tree */ - struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ -}; - struct audit_entry { struct list_head list; struct rcu_head rcu; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index bfffaa52e0c..a2f7e9cf78c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -83,6 +83,7 @@ #include "netport.h" #include "xfrm.h" #include "netlabel.h" +#include "audit.h" #define XATTR_SELINUX_SUFFIX "selinux" #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX @@ -5478,6 +5479,13 @@ static struct security_operations selinux_ops = { .key_free = selinux_key_free, .key_permission = selinux_key_permission, #endif + +#ifdef CONFIG_AUDIT + .audit_rule_init = selinux_audit_rule_init, + .audit_rule_known = selinux_audit_rule_known, + .audit_rule_match = selinux_audit_rule_match, + .audit_rule_free = selinux_audit_rule_free, +#endif }; static __init int selinux_init(void) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index d75050819b0..1e0df5ec1bc 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -57,6 +57,7 @@ #include "netlabel.h" #include "xfrm.h" #include "ebitmap.h" +#include "audit.h" extern void selnl_notify_policyload(u32 seqno); unsigned int policydb_loaded_version; @@ -2296,21 +2297,23 @@ struct selinux_audit_rule { struct context au_ctxt; }; -void selinux_audit_rule_free(struct selinux_audit_rule *rule) +void selinux_audit_rule_free(void *vrule) { + struct selinux_audit_rule *rule = vrule; + if (rule) { context_destroy(&rule->au_ctxt); 
kfree(rule); } } -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, - struct selinux_audit_rule **rule) +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) { struct selinux_audit_rule *tmprule; struct role_datum *roledatum; struct type_datum *typedatum; struct user_datum *userdatum; + struct selinux_audit_rule **rule = (struct selinux_audit_rule **)vrule; int rc = 0; *rule = NULL; @@ -2397,12 +2400,37 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, return rc; } -int selinux_audit_rule_match(u32 sid, u32 field, u32 op, - struct selinux_audit_rule *rule, +/* Check to see if the rule contains any selinux fields */ +int selinux_audit_rule_known(struct audit_krule *rule) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &rule->fields[i]; + switch (f->type) { + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + return 1; + } + } + + return 0; +} + +int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule, struct audit_context *actx) { struct context *ctxt; struct mls_level *level; + struct selinux_audit_rule *rule = vrule; int match = 0; if (!rule) { @@ -2509,7 +2537,7 @@ out: return match; } -static int (*aurule_callback)(void) = NULL; +static int (*aurule_callback)(void) = audit_update_lsm_rules; static int aurule_avc_callback(u32 event, u32 ssid, u32 tsid, u16 class, u32 perms, u32 *retained) @@ -2534,11 +2562,6 @@ static int __init aurule_init(void) } __initcall(aurule_init); -void selinux_audit_set_callback(int (*callback)(void)) -{ - aurule_callback = callback; -} - #ifdef CONFIG_NETLABEL /** * security_netlbl_cache_add - Add an entry to the NetLabel cache -- cgit v1.2.3-70-g09d2 From 04305e4aff8b0533dc05f9f6f1a34d0796bd985f Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sat, 19 Apr 2008 09:59:43 +1000 Subject: Audit: Final renamings and cleanup Rename the se_str and se_rule audit fields elements to lsm_str and lsm_rule to avoid confusion. Signed-off-by: Casey Schaufler Signed-off-by: Ahmed S. 
Darwish Acked-by: James Morris --- include/linux/audit.h | 4 +-- kernel/auditfilter.c | 40 ++++++++++++------------- kernel/auditsc.c | 12 ++++---- security/selinux/include/audit.h | 65 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+), 28 deletions(-) create mode 100644 security/selinux/include/audit.h (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 04869c96016..4ccb048cae1 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -377,8 +377,8 @@ struct audit_field { u32 type; u32 val; u32 op; - char *se_str; - void *se_rule; + char *lsm_str; + void *lsm_rule; }; #define AUDITSC_INVALID 0 diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7c69cb5e44f..28fef6bf853 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -139,8 +139,8 @@ static inline void audit_free_rule(struct audit_entry *e) if (e->rule.fields) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - kfree(f->se_str); - security_audit_rule_free(f->se_rule); + kfree(f->lsm_str); + security_audit_rule_free(f->lsm_rule); } kfree(e->rule.fields); kfree(e->rule.filterkey); @@ -554,8 +554,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; f->val = data->values[i]; - f->se_str = NULL; - f->se_rule = NULL; + f->lsm_str = NULL; + f->lsm_rule = NULL; switch(f->type) { case AUDIT_PID: case AUDIT_UID: @@ -598,7 +598,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; err = security_audit_rule_init(f->type, f->op, str, - (void **)&f->se_rule); + (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { @@ -610,7 +610,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, kfree(str); goto exit_free; } else - f->se_str = str; + f->lsm_str = str; break; case AUDIT_WATCH: str = audit_unpack_string(&bufp, &remain, f->val); @@ -754,7 +754,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: data->buflen += data->values[i] = - audit_pack_string(&bufp, f->se_str); + audit_pack_string(&bufp, f->lsm_str); break; case AUDIT_WATCH: data->buflen += data->values[i] = @@ -806,7 +806,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) + if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) return 1; break; case AUDIT_WATCH: @@ -862,28 +862,28 @@ out: return new; } -/* Duplicate LSM field information. The se_rule is opaque, so must be +/* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. 
*/ static inline int audit_dupe_lsm_field(struct audit_field *df, struct audit_field *sf) { int ret = 0; - char *se_str; + char *lsm_str; - /* our own copy of se_str */ - se_str = kstrdup(sf->se_str, GFP_KERNEL); - if (unlikely(!se_str)) + /* our own copy of lsm_str */ + lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); + if (unlikely(!lsm_str)) return -ENOMEM; - df->se_str = se_str; + df->lsm_str = lsm_str; - /* our own (refreshed) copy of se_rule */ - ret = security_audit_rule_init(df->type, df->op, df->se_str, - (void **)&df->se_rule); + /* our own (refreshed) copy of lsm_rule */ + ret = security_audit_rule_init(df->type, df->op, df->lsm_str, + (void **)&df->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { printk(KERN_WARNING "audit rule for LSM \'%s\' is " - "invalid\n", df->se_str); + "invalid\n", df->lsm_str); ret = 0; } @@ -930,7 +930,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->tree = old->tree; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); - /* deep copy this information, updating the se_rule fields, because + /* deep copy this information, updating the lsm_rule fields, because * the originals will all be freed when the old rule is freed. */ for (i = 0; i < fcount; i++) { switch (new->fields[i].type) { @@ -1762,7 +1762,7 @@ unlock_and_return: return result; } -/* This function will re-initialize the se_rule field of all applicable rules. +/* This function will re-initialize the lsm_rule field of all applicable rules. * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the * LSM field is re-initialized, and the old rule is replaced with the diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c0700535e5c..56e56ed594a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -527,14 +527,14 @@ static int audit_filter_rules(struct task_struct *tsk, match for now to avoid losing information that may be wanted. 
An error message will also be logged upon error */ - if (f->se_rule) { + if (f->lsm_rule) { if (need_sid) { security_task_getsecid(tsk, &sid); need_sid = 0; } result = security_audit_rule_match(sid, f->type, f->op, - f->se_rule, + f->lsm_rule, ctx); } break; @@ -545,18 +545,18 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_OBJ_LEV_HIGH: /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR also applies here */ - if (f->se_rule) { + if (f->lsm_rule) { /* Find files that match */ if (name) { result = security_audit_rule_match( name->osid, f->type, f->op, - f->se_rule, ctx); + f->lsm_rule, ctx); } else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (security_audit_rule_match( ctx->names[j].osid, f->type, f->op, - f->se_rule, ctx)) { + f->lsm_rule, ctx)) { ++result; break; } @@ -569,7 +569,7 @@ static int audit_filter_rules(struct task_struct *tsk, aux = aux->next) { if (aux->type == AUDIT_IPC) { struct audit_aux_data_ipcctl *axi = (void *)aux; - if (security_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { + if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { ++result; break; } diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h new file mode 100644 index 00000000000..6c8b9ef1557 --- /dev/null +++ b/security/selinux/include/audit.h @@ -0,0 +1,65 @@ +/* + * SELinux support for the Audit LSM hooks + * + * Most of below header was moved from include/linux/selinux.h which + * is released under below copyrights: + * + * Author: James Morris + * + * Copyright (C) 2005 Red Hat, Inc., James Morris + * Copyright (C) 2006 Trusted Computer Solutions, Inc. + * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ + +#ifndef _SELINUX_AUDIT_H +#define _SELINUX_AUDIT_H + +/** + * selinux_audit_rule_init - alloc/init an selinux audit rule structure. + * @field: the field this rule refers to + * @op: the operater the rule uses + * @rulestr: the text "target" of the rule + * @rule: pointer to the new rule structure returned via this + * + * Returns 0 if successful, -errno if not. On success, the rule structure + * will be allocated internally. The caller must free this structure with + * selinux_audit_rule_free() after use. + */ +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **rule); + +/** + * selinux_audit_rule_free - free an selinux audit rule structure. + * @rule: pointer to the audit rule to be freed + * + * This will free all memory associated with the given rule. + * If @rule is NULL, no operation is performed. + */ +void selinux_audit_rule_free(void *rule); + +/** + * selinux_audit_rule_match - determine if a context ID matches a rule. + * @sid: the context ID to check + * @field: the field this rule refers to + * @op: the operater the rule uses + * @rule: pointer to the audit rule to check against + * @actx: the audit context (can be NULL) associated with the check + * + * Returns 1 if the context id matches the rule, 0 if it does not, and + * -errno on failure. + */ +int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *rule, + struct audit_context *actx); + +/** + * selinux_audit_rule_known - check to see if rule contains selinux fields. + * @rule: rule to be checked + * Returns 1 if there are selinux fields specified in the rule, 0 otherwise. 
+ */ +int selinux_audit_rule_known(struct audit_krule *krule); + +#endif /* _SELINUX_AUDIT_H */ + -- cgit v1.2.3-70-g09d2 From a6550207538619bc9b90bac2e1d5e54902a432ad Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 26 Feb 2008 10:47:18 -0500 Subject: kernel: Remove unnecessary inclusions of asm/semaphore.h None of these files use any of the functionality promised by asm/semaphore.h. Signed-off-by: Matthew Wilcox --- kernel/kexec.c | 1 - kernel/kthread.c | 1 - kernel/module.c | 3 +-- kernel/posix-timers.c | 1 - kernel/profile.c | 1 - kernel/stop_machine.c | 1 - 6 files changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 06a0e277565..6782dce93d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -29,7 +29,6 @@ #include #include #include -#include #include /* Per cpu memory for storing cpu states in case of system crash. */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f9..92dd1bcaa0d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,6 @@ #include #include #include -#include #define KTHREAD_NICE_LEVEL (-5) diff --git a/kernel/module.c b/kernel/module.c index 5d437bffd8d..8d6cccc6c3c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -664,7 +663,7 @@ static void free_module(struct module *mod); static void wait_for_zero_refcount(struct module *mod) { - /* Since we might sleep for some time, drop the semaphore first */ + /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { DEBUGP("Looking at refcount...\n"); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a9b04203a66..8476956ffd9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -37,7 +37,6 @@ #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 3b7a1b05512..606d7387265 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70..3da3c2c6702 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -11,7 +11,6 @@ #include #include -#include #include /* Since we effect priority and affinity (both of which are visible -- cgit v1.2.3-70-g09d2 From 8fb402bccf203ecca8f9e0202b8fd3c937dece6f Mon Sep 17 00:00:00 2001 From: Erik Bosman Date: Fri, 11 Apr 2008 18:54:17 +0200 Subject: generic, x86: add prctl commands PR_GET_TSC and PR_SET_TSC This patch adds prctl commands that make it possible to deny the execution of timestamp counters in userspace. If this is not implemented on a specific architecture, prctl will return -EINVAL. 
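From userspace the new interface would be exercised roughly as follows (an illustrative sketch, not part of the patch; the fallback defines are only needed when building against headers that predate this change):

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_TSC
    # define PR_GET_TSC	25
    # define PR_SET_TSC	26
    # define PR_TSC_ENABLE	1
    # define PR_TSC_SIGSEGV	2
    #endif

    int main(void)
    {
	    int tsc_mode = 0;

	    /* any later RDTSC in this process now raises SIGSEGV */
	    if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0))
		    perror("PR_SET_TSC");

	    /* PR_GET_TSC stores the current mode through arg2 */
	    if (prctl(PR_GET_TSC, (unsigned long)&tsc_mode, 0, 0, 0) == 0)
		    printf("tsc mode: %d\n", tsc_mode);
	    return 0;
    }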
ned-off-by: Erik Bosman Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/prctl.h | 6 ++++++ kernel/sys.c | 13 ++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 3800639775a..5c80b193963 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -67,4 +67,10 @@ #define PR_CAPBSET_READ 23 #define PR_CAPBSET_DROP 24 +/* Get/set the process' ability to use the timestamp counter instruction */ +#define PR_GET_TSC 25 +#define PR_SET_TSC 26 +# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ +# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index a626116af5d..6a0cc71ee88 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -67,6 +67,12 @@ #ifndef SET_ENDIAN # define SET_ENDIAN(a,b) (-EINVAL) #endif +#ifndef GET_TSC_CTL +# define GET_TSC_CTL(a) (-EINVAL) +#endif +#ifndef SET_TSC_CTL +# define SET_TSC_CTL(a) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -1737,7 +1743,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, #else return -EINVAL; #endif - + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; default: error = -EINVAL; break; -- cgit v1.2.3-70-g09d2 From d8bb6f4c1670c8324e4135c61ef07486f7f17379 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Apr 2008 19:45:18 +0200 Subject: x86: tsc prevent time going backwards We already catch most of the TSC problems by sanity checks, but there is a subtle bug which has been in the code forever. This can cause time jumps in the range of hours. This was reported in: http://lkml.org/lkml/2007/8/23/96 and http://lkml.org/lkml/2008/3/31/23 I was able to reproduce the problem with a gettimeofday loop test on a dual core and a quad core machine which both have sychronized TSCs. The TSCs seems not to be perfectly in sync though, but the kernel is not able to detect the slight delta in the sync check. Still there exists an extremly small window where this delta can be observed with a real big time jump. So far I was only able to reproduce this with the vsyscall gettimeofday implementation, but in theory this might be observable with the syscall based version as well. CPU 0 updates the clock source variables under xtime/vyscall lock and CPU1, where the TSC is slighty behind CPU0, is reading the time right after the seqlock was unlocked. The clocksource reference data was updated with the TSC from CPU0 and the value which is read from TSC on CPU1 is less than the reference data. This results in a huge delta value due to the unsigned subtraction of the TSC value and the reference value. This algorithm can not be changed due to the support of wrapping clock sources like pm timer. The huge delta is converted to nanoseconds and added to xtime, which is then observable by the caller. The next gettimeofday call on CPU1 will show the correct time again as now the TSC has advanced above the reference value. To prevent this TSC specific wreckage we need to compare the TSC value against the reference value and return the latter when it is larger than the actual TSC value. I pondered to mark the TSC unstable when the readout is smaller than the reference value, but this would render an otherwise good and fast clocksource unusable without a real good reason. 
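The unsigned-underflow effect described above is easy to reproduce in isolation; the following standalone snippet (not kernel code) mimics the delta calculation and the clamp the patch introduces:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
	    uint64_t cycle_last = 1000000;	/* reference stored by CPU0 */
	    uint64_t tsc        =  999990;	/* CPU1 reads a TSC slightly behind */
	    uint64_t clamped    = tsc >= cycle_last ? tsc : cycle_last;

	    /* raw delta wraps to roughly 2^64, which timekeeping then
	     * converts into a large forward time jump */
	    printf("raw delta:     %llu\n",
		   (unsigned long long)(tsc - cycle_last));

	    /* with the clamp the delta is simply zero */
	    printf("clamped delta: %llu\n",
		   (unsigned long long)(clamped - cycle_last));
	    return 0;
    }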
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 15 ++++++++++++++- arch/x86/kernel/tsc_64.c | 23 ++++++++++++++++++++--- kernel/time/timekeeping.c | 2 ++ 3 files changed, 36 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 06af8cf8251..e4790728b22 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -284,14 +284,27 @@ core_initcall(cpufreq_tsc); /* clock source code */ static unsigned long current_tsc_khz; +static struct clocksource clocksource_tsc; +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp issue. This can be observed in + * a very small window right after one CPU updated cycle_last under + * xtime lock and the other CPU reads a TSC value which is smaller + * than the cycle_last reference value due to a TSC which is slighty + * behind. This delta is nowhere else observable, but in that case it + * results in a forward time jump in the range of hours due to the + * unsigned delta calculation of the time keeping core code, which is + * necessary to support wrapping clocksources like pm timer. + */ static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret); - return ret; + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index ceeba01e7f4..fcc16e58609 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,6 +11,7 @@ #include #include #include +#include static int notsc __initdata = 0; @@ -287,18 +288,34 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup); +static struct clocksource clocksource_tsc; -/* clock source code: */ +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles(); - return ret; + + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); - return ret; + + return ret >= __vsyscall_gtod_data.clock.cycle_last ? 
+ ret : __vsyscall_gtod_data.clock.cycle_last; } static struct clocksource clocksource_tsc = { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a3fa587c350..2d6087c7cf9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -178,6 +178,7 @@ static void change_clocksource(void) if (clock == new) return; + new->cycle_last = 0; now = clocksource_read(new); nsec = __get_nsec_offset(); timespec_add_ns(&xtime, nsec); @@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev) timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ + clock->cycle_last = 0; clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; -- cgit v1.2.3-70-g09d2 From 61c4628b538608c1a85211ed8438136adfeb9a95 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 10 Mar 2008 15:28:04 -0700 Subject: x86, fpu: split FPU state from task struct - v5 Split the FPU save area from the task struct. This allows easy migration of FPU context, and it's generally cleaner. It also allows the following two optimizations: 1) only allocate when the application actually uses FPU, so in the first lazy FPU trap. This could save memory for non-fpu using apps. Next patch does this lazy allocation. 2) allocate the right size for the actual cpu rather than 512 bytes always. Patches enabling xsave/xrstor support (coming shortly) will take advantage of this. Signed-off-by: Suresh Siddha Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/i387.c | 80 +++++++++++++++++++++++----------------- arch/x86/kernel/process.c | 35 ++++++++++++++++++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/traps_32.c | 6 +-- arch/x86/kernel/traps_64.c | 6 ++- arch/x86/math-emu/fpu_entry.c | 4 +- arch/x86/math-emu/fpu_system.h | 26 ++++++------- arch/x86/math-emu/reg_ld_str.c | 4 +- include/asm-x86/i387.h | 35 +++++++++--------- include/asm-x86/processor.h | 7 ++-- include/asm-x86/thread_info.h | 8 ++++ include/asm-x86/thread_info_32.h | 2 - include/asm-x86/thread_info_64.h | 2 - kernel/fork.c | 31 +++++++++++++--- 16 files changed, 161 insertions(+), 90 deletions(-) create mode 100644 arch/x86/kernel/process.c (limited to 'kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c3920ea8ac5..7a2a2e93e84 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o obj-y += tsc_$(BITS).o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-y += process.o obj-y += i387.o obj-y += ptrace.o obj-y += ds.o diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 8f8102d967b..baf632b221d 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -35,17 +36,18 @@ #endif static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int xstate_size; +static struct i387_fxsave_struct fx_scratch __cpuinitdata; -void mxcsr_feature_mask_init(void) +void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; clts(); if (cpu_has_fxsr) { - memset(¤t->thread.i387.fxsave, 0, - sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; + memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); + asm 
volatile("fxsave %0" : : "m" (fx_scratch)); + mask = fx_scratch.mxcsr_mask; if (mask == 0) mask = 0x0000ffbf; } @@ -53,6 +55,17 @@ void mxcsr_feature_mask_init(void) stts(); } +void __init init_thread_xstate(void) +{ + if (cpu_has_fxsr) + xstate_size = sizeof(struct i387_fxsave_struct); +#ifdef CONFIG_X86_32 + else + xstate_size = sizeof(struct i387_fsave_struct); +#endif + init_task.thread.xstate = alloc_bootmem(xstate_size); +} + #ifdef CONFIG_X86_64 /* * Called at bootup to set up the initial FPU state that is later cloned @@ -61,10 +74,6 @@ void mxcsr_feature_mask_init(void) void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); - extern void __bad_fxsave_alignment(void); - - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - __bad_fxsave_alignment(); set_in_cr4(X86_CR4_OSFXSR); set_in_cr4(X86_CR4_OSXMMEXCPT); @@ -93,18 +102,19 @@ void init_fpu(struct task_struct *tsk) } if (cpu_has_fxsr) { - memset(&tsk->thread.i387.fxsave, 0, - sizeof(struct i387_fxsave_struct)); - tsk->thread.i387.fxsave.cwd = 0x37f; + struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + + memset(fx, 0, xstate_size); + fx->cwd = 0x37f; if (cpu_has_xmm) - tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT; + fx->mxcsr = MXCSR_DEFAULT; } else { - memset(&tsk->thread.i387.fsave, 0, - sizeof(struct i387_fsave_struct)); - tsk->thread.i387.fsave.cwd = 0xffff037fu; - tsk->thread.i387.fsave.swd = 0xffff0000u; - tsk->thread.i387.fsave.twd = 0xffffffffu; - tsk->thread.i387.fsave.fos = 0xffff0000u; + struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; + memset(fp, 0, xstate_size); + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; } /* * Only the device not available exception or ptrace can call init_fpu. @@ -132,7 +142,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, init_fpu(target); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fxsave, 0, -1); + &target->thread.xstate->fxsave, 0, -1); } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -148,12 +158,12 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, set_stopped_child_used_math(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fxsave, 0, -1); + &target->thread.xstate->fxsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. 
*/ - target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; return ret; } @@ -233,7 +243,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) static void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -273,7 +283,7 @@ static void convert_to_fxsr(struct task_struct *tsk, const struct user_i387_ia32_struct *env) { - struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -310,7 +320,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fsave, 0, -1); + &target->thread.xstate->fsave, 0, + -1); } if (kbuf && pos == 0 && count == sizeof(env)) { @@ -338,7 +349,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fsave, 0, -1); + &target->thread.xstate->fsave, 0, -1); } if (pos > 0 || count < sizeof(env)) @@ -358,11 +369,11 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) { struct task_struct *tsk = current; + struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; unlazy_fpu(tsk); - tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; - if (__copy_to_user(buf, &tsk->thread.i387.fsave, - sizeof(struct i387_fsave_struct))) + fp->status = fp->swd; + if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) return -1; return 1; } @@ -370,6 +381,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) { struct task_struct *tsk = current; + struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; struct user_i387_ia32_struct env; int err = 0; @@ -379,12 +391,12 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) if (__copy_to_user(buf, &env, sizeof(env))) return -1; - err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); + err |= __put_user(fx->swd, &buf->status); err |= __put_user(X86_FXSR_MAGIC, &buf->magic); if (err) return -1; - if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + if (__copy_to_user(&buf->_fxsr_env[0], fx, sizeof(struct i387_fxsave_struct))) return -1; return 1; @@ -417,7 +429,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) struct task_struct *tsk = current; clear_fpu(tsk); - return __copy_from_user(&tsk->thread.i387.fsave, buf, + return __copy_from_user(&tsk->thread.xstate->fsave, buf, sizeof(struct i387_fsave_struct)); } @@ -428,10 +440,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) int err; clear_fpu(tsk); - err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0], + err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], sizeof(struct i387_fxsave_struct)); /* mxcsr reserved bits must be masked to zero for security reasons */ - 
tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; if (err || __copy_from_user(&env, buf, sizeof(env))) return 1; convert_to_fxsr(tsk, &env); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c new file mode 100644 index 00000000000..ead24efbcba --- /dev/null +++ b/arch/x86/kernel/process.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *task_xstate_cachep; + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + *dst = *src; + dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); + if (!dst->thread.xstate) + return -ENOMEM; + WARN_ON((unsigned long)dst->thread.xstate & 15); + memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); + return 0; +} + +void free_thread_info(struct thread_info *ti) +{ + kmem_cache_free(task_xstate_cachep, ti->task->thread.xstate); + ti->task->thread.xstate = NULL; + + free_pages((unsigned long)(ti), get_order(THREAD_SIZE)); +} + +void arch_task_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC, NULL); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3790a3f8a8..3890a5dd25f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -703,7 +703,7 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter > 5) - prefetch(&next->i387.fxsave); + prefetch(next->xstate); /* * Reload esp0. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4c13b1406c7..b795e831afd 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -682,7 +682,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter>5) - prefetch(&next->i387.fxsave); + prefetch(next->xstate); /* * Reload esp0, LDT and the page table pointer: diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index dc4273010f2..8d136a73ce8 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1208,11 +1208,6 @@ void __init trap_init(void) #endif set_trap_gate(19, &simd_coprocessor_error); - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generate a build-time error if the alignment is wrong. - */ - BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... 
"); set_in_cr4(X86_CR4_OSFXSR); @@ -1233,6 +1228,7 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); + init_thread_xstate(); /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 6d883b13ef4..dc0cb497eec 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -1128,7 +1128,7 @@ asmlinkage void math_state_restore(void) if (!used_math()) init_fpu(me); - restore_fpu_checking(&me->thread.i387.fxsave); + restore_fpu_checking(&me->thread.xstate->fxsave); task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } @@ -1163,6 +1163,10 @@ void __init trap_init(void) set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif + /* + * initialize the per thread extended state: + */ + init_thread_xstate(); /* * Should be a barrier for any external CPU state. */ diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 4bab3b14539..6e38d877ea7 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -678,7 +678,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.i387.soft; + struct i387_soft_struct *s387 = &target->thread.xstate->soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -730,7 +730,7 @@ int fpregs_soft_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.i387.soft; + struct i387_soft_struct *s387 = &target->thread.xstate->soft; const void *space = s387->st_space; int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index a3ae28c49dd..13488fa153e 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -35,8 +35,8 @@ #define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ == (1 << 10)) -#define I387 (current->thread.i387) -#define FPU_info (I387.soft.info) +#define I387 (current->thread.xstate) +#define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->___cs)) #define FPU_SS (*(unsigned short *) &(FPU_info->___ss)) @@ -46,25 +46,25 @@ #define FPU_EIP (FPU_info->___eip) #define FPU_ORIG_EIP (FPU_info->___orig_eip) -#define FPU_lookahead (I387.soft.lookahead) +#define FPU_lookahead (I387->soft.lookahead) /* nz if ip_offset and cs_selector are not to be set for the current instruction. */ -#define no_ip_update (*(u_char *)&(I387.soft.no_update)) -#define FPU_rm (*(u_char *)&(I387.soft.rm)) +#define no_ip_update (*(u_char *)&(I387->soft.no_update)) +#define FPU_rm (*(u_char *)&(I387->soft.rm)) /* Number of bytes of data which can be legally accessed by the current instruction. This only needs to hold a number <= 108, so a byte will do. 
*/ -#define access_limit (*(u_char *)&(I387.soft.alimit)) +#define access_limit (*(u_char *)&(I387->soft.alimit)) -#define partial_status (I387.soft.swd) -#define control_word (I387.soft.cwd) -#define fpu_tag_word (I387.soft.twd) -#define registers (I387.soft.st_space) -#define top (I387.soft.ftop) +#define partial_status (I387->soft.swd) +#define control_word (I387->soft.cwd) +#define fpu_tag_word (I387->soft.twd) +#define registers (I387->soft.st_space) +#define top (I387->soft.ftop) -#define instruction_address (*(struct address *)&I387.soft.fip) -#define operand_address (*(struct address *)&I387.soft.foo) +#define instruction_address (*(struct address *)&I387->soft.fip) +#define operand_address (*(struct address *)&I387->soft.foo) #define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \ math_abort(FPU_info,SIGSEGV) diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index 02af772a24d..d597fe7423c 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -1180,8 +1180,8 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) control_word |= 0xffff0040; partial_status = status_word() | 0xffff0000; fpu_tag_word |= 0xffff0000; - I387.soft.fcs &= ~0xf8000000; - I387.soft.fos |= 0xffff0000; + I387->soft.fcs &= ~0xf8000000; + I387->soft.fos |= 0xffff0000; #endif /* PECULIAR_486 */ if (__copy_to_user(d, &control_word, 7 * 4)) FPU_abort; diff --git a/include/asm-x86/i387.h b/include/asm-x86/i387.h index 54522b814f1..382a5fa9d49 100644 --- a/include/asm-x86/i387.h +++ b/include/asm-x86/i387.h @@ -23,6 +23,7 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern void init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); +extern void init_thread_xstate(void); extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; @@ -117,24 +118,22 @@ static inline void __save_init_fpu(struct task_struct *tsk) /* Using "fxsaveq %0" would be the ideal choice, but is only supported starting with gas 2.16. */ __asm__ __volatile__("fxsaveq %0" - : "=m" (tsk->thread.i387.fxsave)); + : "=m" (tsk->thread.xstate->fxsave)); #elif 0 /* Using, as a workaround, the properly prefixed form below isn't accepted by any binutils version so far released, complaining that the same type of prefix is used twice if an extended register is needed for addressing (fix submitted to mainline 2005-11-21). */ __asm__ __volatile__("rex64/fxsave %0" - : "=m" (tsk->thread.i387.fxsave)); + : "=m" (tsk->thread.xstate->fxsave)); #else /* This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. 
*/ - __asm__ __volatile__("rex64/fxsave %P2(%1)" - : "=m" (tsk->thread.i387.fxsave) - : "cdaSDb" (tsk), - "i" (offsetof(__typeof__(*tsk), - thread.i387.fxsave))); + __asm__ __volatile__("rex64/fxsave (%1)" + : "=m" (tsk->thread.xstate->fxsave) + : "cdaSDb" (&tsk->thread.xstate->fxsave)); #endif - clear_fpu_state(&tsk->thread.i387.fxsave); + clear_fpu_state(&tsk->thread.xstate->fxsave); task_thread_info(tsk)->status &= ~TS_USEDFPU; } @@ -148,7 +147,7 @@ static inline int save_i387(struct _fpstate __user *buf) int err = 0; BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.i387.fxsave)); + sizeof(tsk->thread.xstate->fxsave)); if ((unsigned long)buf % 16) printk("save_i387: bad fpstate %p\n", buf); @@ -164,7 +163,7 @@ static inline int save_i387(struct _fpstate __user *buf) task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else { - if (__copy_to_user(buf, &tsk->thread.i387.fxsave, + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, sizeof(struct i387_fxsave_struct))) return -1; } @@ -201,7 +200,7 @@ static inline void restore_fpu(struct task_struct *tsk) "nop ; frstor %1", "fxrstor %1", X86_FEATURE_FXSR, - "m" ((tsk)->thread.i387.fxsave)); + "m" (tsk->thread.xstate->fxsave)); } /* We need a safe address that is cheap to find and that is already @@ -225,8 +224,8 @@ static inline void __save_init_fpu(struct task_struct *tsk) "fxsave %[fx]\n" "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", X86_FEATURE_FXSR, - [fx] "m" (tsk->thread.i387.fxsave), - [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory"); + [fx] "m" (tsk->thread.xstate->fxsave), + [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory"); /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. safe_address is a random variable that should be in L1 */ @@ -327,25 +326,25 @@ static inline void clear_fpu(struct task_struct *tsk) static inline unsigned short get_fpu_cwd(struct task_struct *tsk) { if (cpu_has_fxsr) { - return tsk->thread.i387.fxsave.cwd; + return tsk->thread.xstate->fxsave.cwd; } else { - return (unsigned short)tsk->thread.i387.fsave.cwd; + return (unsigned short) tsk->thread.xstate->fsave.cwd; } } static inline unsigned short get_fpu_swd(struct task_struct *tsk) { if (cpu_has_fxsr) { - return tsk->thread.i387.fxsave.swd; + return tsk->thread.xstate->fxsave.swd; } else { - return (unsigned short)tsk->thread.i387.fsave.swd; + return (unsigned short) tsk->thread.xstate->fsave.swd; } } static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) { if (cpu_has_xmm) { - return tsk->thread.i387.fxsave.mxcsr; + return tsk->thread.xstate->fxsave.mxcsr; } else { return MXCSR_DEFAULT; } diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index eaf4548a23d..99d29788578 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -354,7 +354,7 @@ struct i387_soft_struct { u32 entry_eip; }; -union i387_union { +union thread_xstate { struct i387_fsave_struct fsave; struct i387_fxsave_struct fxsave; struct i387_soft_struct soft; @@ -365,6 +365,7 @@ DECLARE_PER_CPU(struct orig_ist, orig_ist); #endif extern void print_cpu_info(struct cpuinfo_x86 *); +extern unsigned int xstate_size; extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; @@ -397,8 +398,8 @@ struct thread_struct { unsigned long cr2; unsigned long trap_no; unsigned long error_code; - /* Floating point info: */ - union i387_union 
i387 __attribute__((aligned(16)));; + /* floating point and extended processor state */ + union thread_xstate *xstate; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ struct vm86_struct __user *vm86_info; diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index d5fd12f2abd..407b88c170d 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -1,5 +1,13 @@ +#ifndef _ASM_X86_THREAD_INFO_H #ifdef CONFIG_X86_32 # include "thread_info_32.h" #else # include "thread_info_64.h" #endif + +#ifndef __ASSEMBLY__ +extern void arch_task_cache_init(void); +extern void free_thread_info(struct thread_info *ti); +extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); +#endif +#endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h index 4e053fa561a..53185996209 100644 --- a/include/asm-x86/thread_info_32.h +++ b/include/asm-x86/thread_info_32.h @@ -102,8 +102,6 @@ static inline struct thread_info *current_thread_info(void) __get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE))) #endif -#define free_thread_info(info) free_pages((unsigned long)(info), get_order(THREAD_SIZE)) - #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h index b17f5f6c2c5..ed664e874de 100644 --- a/include/asm-x86/thread_info_64.h +++ b/include/asm-x86/thread_info_64.h @@ -85,8 +85,6 @@ static inline struct thread_info *stack_thread_info(void) #define alloc_thread_info(tsk) \ ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) - #else /* !__ASSEMBLY__ */ /* how to get the thread information struct from ASM */ diff --git a/kernel/fork.c b/kernel/fork.c index 9c042f90157..44a18192c42 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -132,6 +132,10 @@ void __put_task_struct(struct task_struct *tsk) free_task(tsk); } +void __attribute__((weak)) arch_task_cache_init(void) +{ +} + void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR @@ -144,6 +148,9 @@ void __init fork_init(unsigned long mempages) ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); #endif + /* do the arch specific task caches init */ + arch_task_cache_init(); + /* * The default maximum number of threads is set to a safe * value: the thread structures can take up at most half @@ -163,6 +170,13 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } +int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + *dst = *src; + return 0; +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; @@ -181,15 +195,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - *tsk = *orig; + err = arch_dup_task_struct(tsk, orig); + if (err) + goto out; + tsk->stack = ti; err = prop_local_init_single(&tsk->dirties); - if (err) { - free_thread_info(ti); - free_task_struct(tsk); - return NULL; - } + if (err) + goto out; setup_thread_stack(tsk, orig); @@ -205,6 +219,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) #endif tsk->splice_pipe = NULL; return tsk; + +out: + free_thread_info(ti); + free_task_struct(tsk); + return NULL; } #ifdef CONFIG_MMU -- cgit v1.2.3-70-g09d2 From 2adee9b30d1382fba97825b9c50e4f50a0117c36 Mon Sep 17 00:00:00 2001 From: Suresh Siddha 
Date: Wed, 16 Apr 2008 10:25:35 +0200 Subject: x86: fpu xstate split fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/thread_info.h | 1 + kernel/fork.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index 407b88c170d..77244f17993 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -9,5 +9,6 @@ extern void arch_task_cache_init(void); extern void free_thread_info(struct thread_info *ti); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); +#define arch_task_cache_init arch_task_cache_init #endif #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 44a18192c42..89fe414645e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -132,9 +132,13 @@ void __put_task_struct(struct task_struct *tsk) free_task(tsk); } -void __attribute__((weak)) arch_task_cache_init(void) -{ -} +/* + * macro override instead of weak attribute alias, to workaround + * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. + */ +#ifndef arch_task_cache_init +#define arch_task_cache_init() +#endif void __init fork_init(unsigned long mempages) { -- cgit v1.2.3-70-g09d2 From 018d6db4cb5bbdcd65424a16f2dcca692ed32ae4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Apr 2008 08:53:32 +0200 Subject: sched: re-do "sched: fix fair sleepers" re-apply: | commit e22ecef1d2658ba54ed7d3fdb5d60829fb434c23 | Author: Ingo Molnar | Date: Fri Mar 14 22:16:08 2008 +0100 | | sched: fix fair sleepers | | Fair sleepers need to scale their latency target down by runqueue | weight. Otherwise busy systems will gain ever larger sleep bonus. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0080968d3e4..86a93376282 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,8 +510,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + vruntime -= calc_delta_fair(sysctl_sched_latency, + &cfs_rq->load); + } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); -- cgit v1.2.3-70-g09d2 From 27ec4407790d075c325e1f4da0a19c56953cce23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Feb 2008 21:00:21 +0100 Subject: sched: make cpu_clock() globally synchronous Alexey Zaytsev reported (and bisected) that the introduction of cpu_clock() in printk made the timestamps jump back and forth. Make cpu_clock() more reliable while still keeping it fast when it's called frequently. 
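The mechanism can be sketched in ordinary user-space C (an illustration only, not the kernel code; every name below, such as raw_cpu_clock() and demo_cpu_clock(), is invented for the example): each "CPU" keeps a private forward offset, and a lock-protected global high-water mark is consulted only after a drift threshold, so the common case stays lock-free.

/* build: cc -pthread cpu_clock_sketch.c  (illustrative sketch only) */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NCPUS        4
#define SYNC_THRESH  100000ULL  /* resync after this much local progress */

static pthread_mutex_t time_sync_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t prev_global_time;
static uint64_t time_offset[NCPUS];   /* per-cpu forward correction */
static uint64_t prev_cpu_time[NCPUS];

/* stand-in for sched_clock(): a raw, per-cpu, possibly drifting counter */
static uint64_t raw_clock[NCPUS];
static uint64_t raw_cpu_clock(int cpu) { return raw_clock[cpu]; }

static uint64_t sync_cpu_clock(uint64_t time, int cpu)
{
	pthread_mutex_lock(&time_sync_lock);
	if (time < prev_global_time) {
		/* never report a value behind the last globally seen one */
		time_offset[cpu] += prev_global_time - time;
		time = prev_global_time;
	} else {
		prev_global_time = time;
	}
	pthread_mutex_unlock(&time_sync_lock);
	return time;
}

static uint64_t demo_cpu_clock(int cpu)
{
	uint64_t time = raw_cpu_clock(cpu) + time_offset[cpu];

	/* fast path: take the global lock only after SYNC_THRESH of progress */
	if (time - prev_cpu_time[cpu] > SYNC_THRESH)
		time = sync_cpu_clock(time, cpu);
	prev_cpu_time[cpu] = time;
	return time;
}

int main(void)
{
	raw_clock[0] = 500000;  /* cpu 0 runs well ahead of cpu 1 */
	raw_clock[1] = 200000;
	printf("cpu0: %llu\n", (unsigned long long)demo_cpu_clock(0));
	printf("cpu1: %llu\n", (unsigned long long)demo_cpu_clock(1));
	return 0;
}

Both calls report 500000 here: the lagging CPU is pulled forward through its offset instead of letting timestamps run backwards, which is the property the printk timestamps were missing.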
Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8dcdec6fe0f..7377222ab42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -632,11 +632,39 @@ int sysctl_sched_rt_runtime = 950000; */ #define RUNTIME_INF ((u64)~0ULL) +static const unsigned long long time_sync_thresh = 100000; + +static DEFINE_PER_CPU(unsigned long long, time_offset); +static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); + /* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): + * Global lock which we take every now and then to synchronize + * the CPUs time. This method is not warp-safe, but it's good + * enough to synchronize slowly diverging time sources and thus + * it's good enough for tracing: */ -unsigned long long cpu_clock(int cpu) +static DEFINE_SPINLOCK(time_sync_lock); +static unsigned long long prev_global_time; + +static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&time_sync_lock, flags); + + if (time < prev_global_time) { + per_cpu(time_offset, cpu) += prev_global_time - time; + time = prev_global_time; + } else { + prev_global_time = time; + } + + spin_unlock_irqrestore(&time_sync_lock, flags); + + return time; +} + +static unsigned long long __cpu_clock(int cpu) { unsigned long long now; unsigned long flags; @@ -657,6 +685,24 @@ unsigned long long cpu_clock(int cpu) return now; } + +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + unsigned long long prev_cpu_time, time, delta_time; + + prev_cpu_time = per_cpu(prev_cpu_time, cpu); + time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); + delta_time = time-prev_cpu_time; + + if (unlikely(delta_time > time_sync_thresh)) + time = __sync_cpu_clock(time, cpu); + + return time; +} EXPORT_SYMBOL_GPL(cpu_clock); #ifndef prepare_arch_switch -- cgit v1.2.3-70-g09d2 From 30914a58af9d21c5f1831adabb5d7a800a378675 Mon Sep 17 00:00:00 2001 From: Reynes Philippe Date: Mon, 17 Mar 2008 16:19:05 -0700 Subject: sched: sched.c needs tick.h kernel/sched.c:506: erreur: implicit declaration of function tick_get_tick_sched kernel/sched.c:506: erreur: invalid type argument of -> kernel/sched.c:506: erreur: NOHZ_MODE_INACTIVE undeclared (first use in this function) kernel/sched.c:506: erreur: (Each undeclared identifier is reported only once kernel/sched.c:506: erreur: for each function it appears in.) Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7377222ab42..7fe334ead4f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3-70-g09d2 From 15934a37324f32e0fda633dc7984a671ea81cd75 Mon Sep 17 00:00:00 2001 From: Guillaume Chazarain Date: Sat, 19 Apr 2008 19:44:57 +0200 Subject: sched: fix rq->clock overflows detection with CONFIG_NO_HZ When using CONFIG_NO_HZ, rq->tick_timestamp is not updated every TICK_NSEC. We check that the number of skipped ticks matches the clock jump seen in __update_rq_clock(). 
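A compressed user-space illustration of the clamp (a sketch under stated assumptions, not the patch itself; struct demo_rq, the TICK_NSEC value and update_clock() are invented for the example): the allowed forward jump grows with the number of jiffies that nohz legitimately skipped since the last seen tick.

/* illustrative sketch only: cc clock_clamp_sketch.c */
#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL            /* pretend HZ = 1000 */

struct demo_rq {
	uint64_t clock;                 /* last computed rq clock */
	uint64_t tick_timestamp;        /* clock value at the last tick */
	unsigned long last_tick_seen;   /* jiffies at the last tick */
	int nohz;                       /* ticks may legitimately be skipped */
};

/* how many ticks the clock may jump forward since the last seen tick */
static uint64_t max_skipped_ticks(struct demo_rq *rq, unsigned long jiffies_now)
{
	return rq->nohz ? jiffies_now - rq->last_tick_seen + 2 : 1;
}

static uint64_t update_clock(struct demo_rq *rq, uint64_t delta,
			     unsigned long jiffies_now)
{
	uint64_t clock = rq->clock;
	uint64_t max_time = rq->tick_timestamp +
			    max_skipped_ticks(rq, jiffies_now) * TICK_NSEC;

	if (clock + delta > max_time)   /* too large a forward jump: clamp it */
		clock = (clock < max_time) ? max_time : clock + 1;
	else
		clock += delta;
	rq->clock = clock;
	return clock;
}

int main(void)
{
	struct demo_rq nohz_rq     = { .nohz = 1 };
	struct demo_rq periodic_rq = { .nohz = 0 };

	/* 50 jiffies pass while idle; only the nohz runqueue accepts the jump */
	printf("nohz:     %llu\n",
	       (unsigned long long)update_clock(&nohz_rq, 50 * TICK_NSEC, 50));
	printf("periodic: %llu\n",
	       (unsigned long long)update_clock(&periodic_rq, 50 * TICK_NSEC, 50));
	return 0;
}

Without the last_tick_seen bookkeeping, the nohz case would be mistaken for an overflow and clamped to a single tick, as the second output line shows.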
Signed-off-by: Guillaume Chazarain Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7fe334ead4f..d8456a9ac9a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -397,6 +397,7 @@ struct rq { unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ @@ -500,6 +501,32 @@ static inline int cpu_of(struct rq *rq) #endif } +#ifdef CONFIG_NO_HZ +static inline bool nohz_on(int cpu) +{ + return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; +} + +static inline u64 max_skipped_ticks(struct rq *rq) +{ + return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; +} + +static inline void update_last_tick_seen(struct rq *rq) +{ + rq->last_tick_seen = jiffies; +} +#else +static inline u64 max_skipped_ticks(struct rq *rq) +{ + return 1; +} + +static inline void update_last_tick_seen(struct rq *rq) +{ +} +#endif + /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: @@ -524,9 +551,12 @@ static void __update_rq_clock(struct rq *rq) /* * Catch too large forward jumps too: */ - if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { - if (clock < rq->tick_timestamp + TICK_NSEC) - clock = rq->tick_timestamp + TICK_NSEC; + u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; + u64 max_time = rq->tick_timestamp + max_jump; + + if (unlikely(clock + delta > max_time)) { + if (clock < max_time) + clock = max_time; else clock++; rq->clock_overflows++; @@ -3812,6 +3842,7 @@ void scheduler_tick(void) rq->clock_underflows++; } rq->tick_timestamp = rq->clock; + update_last_tick_seen(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); update_sched_rt_period(rq); @@ -7261,6 +7292,7 @@ void __init sched_init(void) lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; rq->clock = 1; + update_last_tick_seen(rq); init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3-70-g09d2 From 0bbd3336eee1e712a290e0dfd1a64cbbdd63a508 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:44:57 +0200 Subject: sched: fix wakeup granularity for buddies The wakeup buddy logic didn't use the same wakeup granularity logic as the wakeup preemption did, this might cause the ->next buddy to be selected past the point where we would have preempted had the task been a single running instance. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 69 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 86a93376282..b01f8e77f2a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -unsigned int sysctl_sched_wakeup_granularity = 5000000UL; +unsigned int sysctl_sched_wakeup_granularity = 10000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -629,20 +629,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 diff, gran; - if (!cfs_rq->next) return se; - diff = cfs_rq->next->vruntime - se->vruntime; - if (diff < 0) - return se; - - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); - if (diff > gran) + if (wakeup_preempt_entity(cfs_rq->next, se) != 0) return se; return cfs_rq->next; @@ -1101,6 +1097,48 @@ out: } #endif /* CONFIG_SMP */ +static unsigned long wakeup_gran(struct sched_entity *se) +{ + unsigned long gran = sysctl_sched_wakeup_granularity; + + /* + * More easily preempt - nice tasks, while not making + * it harder for + nice tasks. + */ + if (unlikely(se->load.weight > NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); + + return gran; +} + +/* + * Should 'se' preempt 'curr'. + * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff < 0) + return -1; + + gran = wakeup_gran(curr); + if (vdiff > gran) + return 1; + + return 0; +} /* * Preempt the current task with a newly woken task if needed: @@ -1110,7 +1148,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - unsigned long gran; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1140,15 +1177,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) pse = parent_entity(pse); } - gran = sysctl_sched_wakeup_granularity; - /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. - */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); - - if (pse->vruntime + gran < se->vruntime) + if (wakeup_preempt_entity(se, pse) == 1) resched_task(curr); } -- cgit v1.2.3-70-g09d2 From b85d0667268320072ccdeb07c27c25b300ab3724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Mar 2008 20:03:22 +0100 Subject: sched: introduce SCHED_FEAT_SYNC_WAKEUPS, turn it off turn off sync wakeups by default. They are not needed anymore - the buddy logic should be smart enough to keep the system from overscheduling. 
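The sched_feat() switch this relies on is just a bitmask over an integer (sysctl_sched_features in the kernel); a toy stand-alone version of the pattern, with all names made up for the example, looks like:

/* illustrative sketch only: cc feature_bits_sketch.c */
#include <stdio.h>

enum {
	DEMO_FEAT_NEW_FAIR_SLEEPERS = 1,
	DEMO_FEAT_WAKEUP_PREEMPT    = 2,
	DEMO_FEAT_SYNC_WAKEUPS      = 4,
};

/* compile-time default; the kernel's equivalent is sysctl_sched_features */
static unsigned int demo_sched_features =
	DEMO_FEAT_NEW_FAIR_SLEEPERS * 1 |
	DEMO_FEAT_WAKEUP_PREEMPT    * 1 |
	DEMO_FEAT_SYNC_WAKEUPS      * 0;        /* off by default, as here */

#define demo_feat(x) (demo_sched_features & DEMO_FEAT_##x)

static int wakeup_demo(int sync)
{
	if (!demo_feat(SYNC_WAKEUPS))
		sync = 0;       /* ignore the caller's hint while the bit is off */
	return sync;
}

int main(void)
{
	printf("sync hint honoured? %d\n", wakeup_demo(1));
	demo_sched_features |= DEMO_FEAT_SYNC_WAKEUPS;  /* flip it back on */
	printf("sync hint honoured? %d\n", wakeup_demo(1));
	return 0;
}

Keeping the heuristic behind a bit makes it trivially revertable, which is why a later commit in this series can reenable SYNC_WAKEUPS without touching the wakeup path again.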
Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d8456a9ac9a..263e25e1020 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -627,6 +627,7 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_HRTICK = 8, SCHED_FEAT_DOUBLE_TICK = 16, + SCHED_FEAT_SYNC_WAKEUPS = 32, }; const_debug unsigned int sysctl_sched_features = @@ -634,7 +635,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0; + SCHED_FEAT_DOUBLE_TICK * 0 | + SCHED_FEAT_SYNC_WAKEUPS * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -1916,6 +1918,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + if (!sched_feat(SYNC_WAKEUPS)) + sync = 0; + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; -- cgit v1.2.3-70-g09d2 From 1fc8afa4c820fcde3658238eab5c010476ede521 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Mar 2008 01:39:19 +0100 Subject: sched: feat affine wakeups Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 263e25e1020..7c5efad78c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -628,6 +628,7 @@ enum { SCHED_FEAT_HRTICK = 8, SCHED_FEAT_DOUBLE_TICK = 16, SCHED_FEAT_SYNC_WAKEUPS = 32, + SCHED_FEAT_AFFINE_WAKEUPS = 64, }; const_debug unsigned int sysctl_sched_features = @@ -636,7 +637,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_HRTICK * 1 | SCHED_FEAT_DOUBLE_TICK * 0 | - SCHED_FEAT_SYNC_WAKEUPS * 0; + SCHED_FEAT_SYNC_WAKEUPS * 0 | + SCHED_FEAT_AFFINE_WAKEUPS * 1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) -- cgit v1.2.3-70-g09d2 From d25ce4cd499a21aab89ff8755f8c4a2800eae25f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 17 Mar 2008 09:36:53 +0100 Subject: sched: cache hot buddy Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7c5efad78c0..42d2f1155d3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -625,20 +625,22 @@ enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_WAKEUP_PREEMPT = 2, SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_HRTICK = 8, - SCHED_FEAT_DOUBLE_TICK = 16, - SCHED_FEAT_SYNC_WAKEUPS = 32, - SCHED_FEAT_AFFINE_WAKEUPS = 64, + SCHED_FEAT_AFFINE_WAKEUPS = 8, + SCHED_FEAT_CACHE_HOT_BUDDY = 16, + SCHED_FEAT_HRTICK = 32, + SCHED_FEAT_DOUBLE_TICK = 64, + SCHED_FEAT_SYNC_WAKEUPS = 128, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | + SCHED_FEAT_AFFINE_WAKEUPS * 1 | + SCHED_FEAT_CACHE_HOT_BUDDY * 1 | SCHED_FEAT_HRTICK * 1 | SCHED_FEAT_DOUBLE_TICK * 0 | - SCHED_FEAT_SYNC_WAKEUPS * 0 | - SCHED_FEAT_AFFINE_WAKEUPS * 1; + SCHED_FEAT_SYNC_WAKEUPS * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -1519,7 +1521,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (&p->se == cfs_rq_of(&p->se)->next) + if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) return 1; if (p->sched_class != &fair_sched_class) -- cgit v1.2.3-70-g09d2 From 
02e2b83bd25bb05ac2e69cb31458b7d1b3c70707 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Mar 2008 01:37:10 +0100 Subject: sched: reenable sync wakeups Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 42d2f1155d3..770449bee6d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -627,9 +627,9 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_AFFINE_WAKEUPS = 8, SCHED_FEAT_CACHE_HOT_BUDDY = 16, - SCHED_FEAT_HRTICK = 32, - SCHED_FEAT_DOUBLE_TICK = 64, - SCHED_FEAT_SYNC_WAKEUPS = 128, + SCHED_FEAT_SYNC_WAKEUPS = 32, + SCHED_FEAT_HRTICK = 64, + SCHED_FEAT_DOUBLE_TICK = 128, }; const_debug unsigned int sysctl_sched_features = @@ -638,9 +638,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_AFFINE_WAKEUPS * 1 | SCHED_FEAT_CACHE_HOT_BUDDY * 1 | + SCHED_FEAT_SYNC_WAKEUPS * 1 | SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0 | - SCHED_FEAT_SYNC_WAKEUPS * 0; + SCHED_FEAT_DOUBLE_TICK * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) -- cgit v1.2.3-70-g09d2 From 50df5d6aea6694ca481b8005900401e8c95c2603 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 16:09:59 +0100 Subject: sched: remove sysctl_sched_batch_wakeup_granularity it's unused. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 1 - kernel/sched_debug.c | 1 - kernel/sched_fair.c | 10 ---------- kernel/sysctl.c | 11 ----------- 5 files changed, 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6a1e7afb099..15f05ff453d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1551,7 +1551,6 @@ static inline void wake_up_idle_cpu(int cpu) { } extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; diff --git a/kernel/sched.c b/kernel/sched.c index 770449bee6d..e813e845d9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5396,7 +5396,6 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; - sysctl_sched_batch_wakeup_granularity *= factor; } #ifdef CONFIG_SMP diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ef358ba0768..3d09106990c 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -214,7 +214,6 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_batch_wakeup_granularity); PN(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b01f8e77f2a..bedda18f37a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -61,16 +61,6 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; */ unsigned int __read_mostly sysctl_sched_compat_yield; -/* - * SCHED_BATCH wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. 
- */ -unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; - /* * SCHED_OTHER wake-up granularity. * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2a2d6889ba..be332e1a0c2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -268,17 +268,6 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_batch_wakeup_granularity_ns", - .data = &sysctl_sched_batch_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", -- cgit v1.2.3-70-g09d2 From 19fb518c2a0c5d88ed22bba7083b7e7bc2a9c231 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Sun, 17 Feb 2008 22:34:07 +0100 Subject: latencytop: optimize LT_BACKTRACEDEPTH loops a bit There is no need to loop any longer when 'same == 0'. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/latencytop.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b4e3c85abe7..7c74dab0d21 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record return; for (i = 0; i < MAXLR; i++) { - int q; - int same = 1; + int q, same = 1; + /* Nothing stored: */ if (!latency_record[i].backtrace[0]) { if (firstnonnull > i) @@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record continue; } for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (latency_record[i].backtrace[q] != - lat->backtrace[q]) + unsigned long record = lat->backtrace[q]; + + if (latency_record[i].backtrace[q] != record) { same = 0; - if (same && lat->backtrace[q] == 0) break; - if (same && lat->backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { @@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) for (i = 0; i < LT_SAVECOUNT ; i++) { struct latency_record *mylat; int same = 1; + mylat = &tsk->latency_record[i]; for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (mylat->backtrace[q] != - lat.backtrace[q]) + unsigned long record = lat.backtrace[q]; + + if (mylat->backtrace[q] != record) { same = 0; - if (same && lat.backtrace[q] == 0) break; - if (same && lat.backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { -- cgit v1.2.3-70-g09d2 From 79b3feffb10417f197d2ab48dd4fa3c0c9e7d788 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Feb 2008 13:39:37 +0100 Subject: sched: fix regression with sched yield Balbir Singh reported: > 1:mon> t > [c0000000e7677da0] c000000000067de0 .sys_sched_yield+0x6c/0xbc > [c0000000e7677e30] c000000000008748 syscall_exit+0x0/0x40 > --- Exception: c01 (System Call) at 00000400001d09e4 > SP (4000664cb10) is in userspace > 1:mon> r > cpu 0x1: Vector: 300 (Data Access) at [c0000000e7677aa0] > pc: c000000000068e50: .yield_task_fair+0x94/0xc4 > lr: c000000000067de0: .sys_sched_yield+0x6c/0xbc the check that should have avoided that is: /* * Are we the 
only task in the tree? */ if (unlikely(rq->load.weight == curr->se.load.weight)) return; But I guess that overlooks rt tasks, they also increase the load. So I guess something like this ought to fix it.. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bedda18f37a..290cf770b71 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -904,7 +904,7 @@ static void yield_task_fair(struct rq *rq) /* * Already in the rightmost position? */ - if (unlikely(rightmost->vruntime < se->vruntime)) + if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) return; /* -- cgit v1.2.3-70-g09d2 From d0b27fa77854b149ad4af08b0fe47fe712a47ade Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:44:57 +0200 Subject: sched: rt-group: synchonised bandwidth period Various SMP balancing algorithms require that the bandwidth period run in sync. Possible improvements are moving the rt_bandwidth thing into root_domain and keeping a span per rt_bandwidth which marks throttled cpus. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++ kernel/sched.c | 260 ++++++++++++++++++++++++++++++++++++++--------- kernel/sched_rt.c | 104 +++++++++++++------ kernel/sysctl.c | 4 +- kernel/time/tick-sched.c | 5 - kernel/user.c | 28 +++++ 6 files changed, 320 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 15f05ff453d..be5d31752db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1563,6 +1563,10 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + extern unsigned int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES @@ -2052,6 +2056,9 @@ extern unsigned long sched_group_shares(struct task_group *tg); extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); extern long sched_group_rt_runtime(struct task_group *tg); +extern int sched_group_set_rt_period(struct task_group *tg, + long rt_period_us); +extern long sched_group_rt_period(struct task_group *tg); #endif #endif diff --git a/kernel/sched.c b/kernel/sched.c index e813e845d9c..bb20323f7d0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -115,6 +115,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. 
+ */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -156,6 +161,80 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + #ifdef CONFIG_GROUP_SCHED #include @@ -182,7 +261,7 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; @@ -407,8 +486,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -592,23 +669,6 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -664,10 +724,18 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * single value that denotes runtime == period, ie unlimited time. 
- */ -#define RUNTIME_INF ((u64)~0ULL) +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_period < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} static const unsigned long long time_sync_thresh = 100000; @@ -3854,7 +3922,6 @@ void scheduler_tick(void) update_last_tick_seen(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -4689,7 +4756,7 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -7288,6 +7355,14 @@ void __init sched_init(void) init_defrootdomain(); #endif + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7312,15 +7387,11 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), &per_cpu(init_sched_rt_entity, i), i, 1); #endif - rq->rt_period_expire = 0; - rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7506,8 +7577,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_GROUP_SCHED - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7596,6 +7665,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; + destroy_rt_bandwidth(&tg->rt_bandwidth); + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -7621,7 +7692,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!tg->rt_se) goto err; - tg->rt_runtime = 0; + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7674,6 +7746,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif +#ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -7775,6 +7848,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -7871,16 +7945,15 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) struct task_group *tgi; unsigned long total = 0; unsigned long global_ratio = - to_ratio(sysctl_sched_rt_period, - sysctl_sched_rt_runtime < 0 ? 
- RUNTIME_INF : sysctl_sched_rt_runtime); + to_ratio(global_rt_period(), global_rt_runtime()); rcu_read_lock(); list_for_each_entry_rcu(tgi, &task_groups, list) { if (tgi == tg) continue; - total += to_ratio(period, tgi->rt_runtime); + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); } rcu_read_unlock(); @@ -7898,16 +7971,11 @@ static inline int tg_has_rt_tasks(struct task_group *tg) return 0; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) { - u64 rt_runtime, rt_period; int err = 0; - rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us == -1) - rt_runtime = RUNTIME_INF; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { @@ -7918,7 +7986,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) err = -EINVAL; goto unlock; } - tg->rt_runtime = rt_runtime; + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -7926,19 +7995,96 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return err; } +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; - if (tg->rt_runtime == RUNTIME_INF) + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) return -1; - rt_runtime_us = tg->rt_runtime; + rt_runtime_us = tg->rt_bandwidth.rt_runtime; do_div(rt_runtime_us, NSEC_PER_USEC); return rt_runtime_us; } + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(NULL, 1, 0)) + ret = -EINVAL; + mutex_unlock(&rt_constraints_mutex); + + return ret; +} +#else +static int sched_rt_global_constraints(void) +{ + return 0; +} #endif -#endif /* CONFIG_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} #ifdef CONFIG_CGROUP_SCHED @@ -7988,7 
+8134,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, { #ifdef CONFIG_RT_GROUP_SCHED /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -8066,6 +8212,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} #endif static struct cftype cpu_files[] = { @@ -8082,6 +8239,11 @@ static struct cftype cpu_files[] = { .read = cpu_rt_runtime_read, .write = cpu_rt_runtime_write, }, + { + .name = "rt_period_us", + .read_uint = cpu_rt_period_read_uint, + .write_uint = cpu_rt_period_write_uint, + }, #endif }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e51642..8bc17613666 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -62,7 +62,7 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) if (!rt_rq->tg) return RUNTIME_INF; - return rt_rq->tg->rt_runtime; + return rt_rq->tg->rt_bandwidth.rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -127,14 +127,29 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } +#ifdef CONFIG_SMP +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} #else +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} +#endif -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { - if (sysctl_sched_rt_runtime == -1) - return RUNTIME_INF; + return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +#else + +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ + return def_rt_bandwidth.rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -173,8 +188,55 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) { return rt_rq->rt_throttled; } + +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + #endif +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime = rt_b->rt_runtime; + + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED @@ -198,11 +260,7 @@ static int 
sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return rt_rq_throttled(rt_rq); if (rt_rq->rt_time > runtime) { - struct rq *rq = rq_of_rt_rq(rt_rq); - - rq->rt_throttled = 1; rt_rq->rt_throttled = 1; - if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -212,29 +270,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -static void update_sched_rt_period(struct rq *rq) -{ - struct rt_rq *rt_rq; - u64 period; - - while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rq->rt_period_expire += period; - - for_each_leaf_rt_rq(rt_rq, rq) { - u64 runtime = sched_rt_runtime(rt_rq); - - rt_rq->rt_time -= min(rt_rq->rt_time, runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - sched_rt_rq_enqueue(rt_rq); - } - } - - rq->rt_throttled = 0; - } -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -284,6 +319,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +#else + start_rt_bandwidth(&def_rt_bandwidth); #endif } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index be332e1a0c2..fd3364827cc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, @@ -315,7 +315,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69dba0c7172..d358d4e3a95 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -191,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -243,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - rt_jiffies = rt_needs_cpu(cpu); - if (rt_jiffies && rt_jiffies < delta_jiffies) - delta_jiffies = rt_jiffies; - if (rcu_needs_cpu(cpu)) delta_jiffies = 1; /* diff --git a/kernel/user.c b/kernel/user.c index 7132022a040..5925c6887c1 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, static struct kobj_attribute cpu_rt_runtime_attr = __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); + +static ssize_t cpu_rt_period_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); +} + +static ssize_t cpu_rt_period_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_period; + int rc; + + sscanf(buf, "%lu", 
&rt_period); + + rc = sched_group_set_rt_period(up->tg, rt_period); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_period_attr = + __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); #endif /* default attributes per uid directory */ @@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = { #endif #ifdef CONFIG_RT_GROUP_SCHED &cpu_rt_runtime_attr.attr, + &cpu_rt_period_attr.attr, #endif NULL }; -- cgit v1.2.3-70-g09d2 From ac086bc22997a2be24fc40fc8d46522fe7e03d11 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:44:58 +0200 Subject: sched: rt-group: smp balancing Currently the rt group scheduling does a per cpu runtime limit, however the rt load balancer makes no guarantees about an equal spread of real- time tasks, just that at any one time, the highest priority tasks run. Solve this by making the runtime limit a global property by borrowing excessive runtime from the other cpus once the local limit runs out. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 +++++++++++++++++++++++-- kernel/sched_rt.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bb20323f7d0..313cd4f057c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -164,6 +164,7 @@ struct rt_prio_array { struct rt_bandwidth { ktime_t rt_period; u64 rt_runtime; + spinlock_t rt_runtime_lock; struct hrtimer rt_period_timer; }; @@ -198,6 +199,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period = ns_to_ktime(period); rt_b->rt_runtime = runtime; + spin_lock_init(&rt_b->rt_runtime_lock); + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; @@ -414,6 +417,8 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + u64 rt_runtime; + spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -7299,6 +7304,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -7335,6 +7342,7 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; if (add) list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); @@ -7391,6 +7399,8 @@ void __init sched_init(void) init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), &per_cpu(init_sched_rt_entity, i), i, 1); +#else + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -7974,11 +7984,11 @@ static inline int tg_has_rt_tasks(struct task_group *tg) static int tg_set_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime) { - int err = 0; + int i, err = 0; mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { err = -EBUSY; goto unlock; } @@ -7986,8 +7996,19 @@ static int tg_set_bandwidth(struct task_group *tg, err = -EINVAL; goto unlock; } + + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + 
spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -8052,6 +8073,19 @@ static int sched_rt_global_constraints(void) #else static int sched_rt_global_constraints(void) { + unsigned long flags; + int i; + + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + return 0; } #endif @@ -8168,7 +8202,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) #endif #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, +static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8bc17613666..6928ded24da 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) if (!rt_rq->tg) return RUNTIME_INF; - return rt_rq->tg->rt_bandwidth.rt_runtime; + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -145,11 +150,21 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; } +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &rt_rq->tg->rt_bandwidth; +} + #else static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - return def_rt_bandwidth.rt_runtime; + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(def_rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -200,6 +215,11 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &def_rt_bandwidth; +} + #endif static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) @@ -218,8 +238,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) spin_lock(&rq->lock); if (rt_rq->rt_time) { - u64 runtime = rt_b->rt_runtime; + u64 runtime; + spin_lock(&rt_rq->rt_runtime_lock); + runtime = rt_rq->rt_runtime; rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; @@ -227,6 +249,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; + spin_unlock(&rt_rq->rt_runtime_lock); } if (enqueue) @@ -237,6 +260,47 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return idle; } +#ifdef CONFIG_SMP +static int balance_runtime(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int i, weight, more = 0; + u64 rt_period; + + weight = cpus_weight(rd->span); + + spin_lock(&rt_b->rt_runtime_lock); + rt_period = ktime_to_ns(rt_b->rt_period); + 
for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + diff = iter->rt_runtime - iter->rt_time; + if (diff > 0) { + do_div(diff, weight); + if (rt_rq->rt_runtime + diff > rt_period) + diff = rt_period - rt_rq->rt_runtime; + iter->rt_runtime -= diff; + rt_rq->rt_runtime += diff; + more = 1; + if (rt_rq->rt_runtime == rt_period) { + spin_unlock(&iter->rt_runtime_lock); + break; + } + } + spin_unlock(&iter->rt_runtime_lock); + } + spin_unlock(&rt_b->rt_runtime_lock); + + return more; +} +#endif + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED @@ -259,6 +323,22 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); + if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) + return 0; + +#ifdef CONFIG_SMP + if (rt_rq->rt_time > runtime) { + int more; + + spin_unlock(&rt_rq->rt_runtime_lock); + more = balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + + if (more) + runtime = sched_rt_runtime(rt_rq); + } +#endif + if (rt_rq->rt_time > runtime) { rt_rq->rt_throttled = 1; if (rt_rq_throttled(rt_rq)) { @@ -294,9 +374,11 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); } static inline -- cgit v1.2.3-70-g09d2 From 48f20a9a9488c432fc86df1ff4b7f4fa895d1183 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Tue, 4 Mar 2008 15:23:25 -0800 Subject: tasklets: execute tasklets in the same order they were queued I noticed this when looking at an openswan issue. Openswan (ab?)uses the tasklet API to defer processing of packets in some situations, with one packet per tasklet_action(). I started noticing sequences of backwards-ordered sequence numbers coming over the wire, since new tasklets are always queued at the head of the list but processed sequentially. Convert it to instead append new entries to the tail of the list. As an extra bonus, the splicing code in takeover_tasklets() no longer has to iterate over the list. Signed-off-by: Olof Johansson Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/softirq.c | 63 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 31e9f2a4792..3c44956ee7e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) /* Tasklets */ struct tasklet_head { - struct tasklet_struct *list; + struct tasklet_struct *head; + struct tasklet_struct **tail; }; /* Some compilers disobey section attribute on statics when not @@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_vec).tail = t; + __get_cpu_var(tasklet_vec).tail = &(t->next); raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = t; + __get_cpu_var(tasklet_hi_vec).tail = &(t->next); raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } @@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = NULL; + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; local_irq_enable(); while (list) { @@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a) } local_irq_disable(); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_vec).tail = t; + __get_cpu_var(tasklet_vec).tail = &(t->next); __raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_enable(); } @@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = NULL; + list = __get_cpu_var(tasklet_hi_vec).head; + __get_cpu_var(tasklet_hi_vec).head = NULL; + __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; local_irq_enable(); while (list) { @@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a) } local_irq_disable(); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = t; + __get_cpu_var(tasklet_hi_vec).tail = &(t->next); __raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); } @@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill); void __init softirq_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(tasklet_vec, cpu).tail = + &per_cpu(tasklet_vec, cpu).head; + per_cpu(tasklet_hi_vec, cpu).tail = + &per_cpu(tasklet_hi_vec, cpu).head; + } + open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } @@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) return; /* CPU is dead, so no lock needed. 
*/ - for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { + for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { if (*i == t) { *i = t->next; + /* If this was the tail element, move the tail ptr */ + if (*i == NULL) + per_cpu(tasklet_vec, cpu).tail = i; return; } } @@ -566,20 +585,20 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) static void takeover_tasklets(unsigned int cpu) { - struct tasklet_struct **i; - /* CPU is dead, so no lock needed. */ local_irq_disable(); /* Find end, append list for that CPU. */ - for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); - *i = per_cpu(tasklet_vec, cpu).list; - per_cpu(tasklet_vec, cpu).list = NULL; + *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; + __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; + per_cpu(tasklet_vec, cpu).head = NULL; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; raise_softirq_irqoff(TASKLET_SOFTIRQ); - for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); - *i = per_cpu(tasklet_hi_vec, cpu).list; - per_cpu(tasklet_hi_vec, cpu).list = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; + __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; + per_cpu(tasklet_hi_vec, cpu).head = NULL; + per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); -- cgit v1.2.3-70-g09d2 From 32cd756a80aaef657ac09c76e6eff3ba65567790 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 29 Feb 2008 10:02:43 +0530 Subject: sched: cleanup cpuacct variable names Change the variable names to the common convention for the cpuacct subsystem. Signed-off-by: Dhaval Giani Acked-by: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 313cd4f057c..e2f85c7a747 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8318,9 +8318,9 @@ struct cpuacct { struct cgroup_subsys cpuacct_subsys; /* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cont) +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), struct cpuacct, css); } @@ -8333,7 +8333,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cont) + struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -8351,18 +8351,18 @@ static struct cgroup_subsys_state *cpuacct_create( /* destroy an existing cpu accounting group */ static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); free_percpu(ca->cpuusage); kfree(ca); } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); u64 totalcpuusage = 0; int i; @@ -8388,9 +8388,9 @@ static struct 
cftype files[] = { }, }; -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) { - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); } /* -- cgit v1.2.3-70-g09d2 From 0297b80339d545045490716fa8591b215fdd9458 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 29 Feb 2008 10:02:44 +0530 Subject: sched: allow cpuacct stats to be reset Currently the schedstats implementation does not allow the statistics to be reset. This patch aims to allow that. echo 0 > cpuacct.usage resets the usage. Any other value is not allowed and returns -EINVAL. Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e2f85c7a747..e4bf4477aee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8381,10 +8381,34 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) return totalcpuusage; } +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_possible_cpu(i) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + + spin_lock_irq(&cpu_rq(i)->lock); + *cpuusage = 0; + spin_unlock_irq(&cpu_rq(i)->lock); + } +out: + return err; +} + static struct cftype files[] = { { .name = "usage", .read_uint = cpuusage_read, + .write_uint = cpuusage_write, }, }; -- cgit v1.2.3-70-g09d2 From 9f0e738f492522a2f70ad9a2a0287e4e966c633a Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Tue, 12 Feb 2008 13:30:05 -0500 Subject: sched: fix cpus_allowed settings Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f9..25241d6ec8c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) wait_task_inactive(k); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); + k->rt.nr_cpus_allowed = 1; } EXPORT_SYMBOL(kthread_bind); -- cgit v1.2.3-70-g09d2 From d366f8cbc16882e93538d9a52423c2f50dad7c06 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:12 -0700 Subject: cpumask: Cleanup more uses of CPU_MASK and NODE_MASK * Replace usages of CPU_MASK_NONE, CPU_MASK_ALL, NODE_MASK_NONE, NODE_MASK_ALL to reduce stack requirements for large NR_CPUS and MAXNODES counts. * In some cases, the cpumask variable was initialized but then overwritten with another value. 
This is the case for changes like this: - cpumask_t oldmask = CPU_MASK_ALL; + cpumask_t oldmask; Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 2 +- kernel/irq/chip.c | 2 +- mm/allocpercpu.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index b54464b2665..9ba11d07920 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -785,7 +785,7 @@ static void __clear_irq_vector(int irq) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cfg->domain = CPU_MASK_NONE; + cpus_clear(cfg->domain); } void __setup_vector_irq(int cpu) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fdb3fbe2b0c..964964baefa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - desc->affinity = CPU_MASK_ALL; + cpus_setall(desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index b0012e27fea..f4026bae6ee 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -82,9 +82,10 @@ EXPORT_SYMBOL_GPL(percpu_populate); int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, cpumask_t *mask) { - cpumask_t populated = CPU_MASK_NONE; + cpumask_t populated; int cpu; + cpus_clear(populated); for_each_cpu_mask(cpu, *mask) if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { __percpu_depopulate_mask(__pdata, &populated); -- cgit v1.2.3-70-g09d2 From 434d53b00d6bb7be0a1d3dcc0d0d5df6c042e164 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:04 -0700 Subject: sched: remove fixed NR_CPUS sized arrays in kernel_sched_c * Change fixed size arrays to per_cpu variables or dynamically allocated arrays in sched_init() and sched_init_smp(). (1) static struct sched_entity *init_sched_entity_p[NR_CPUS]; (1) static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; (1) static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; (1) static struct rt_rq *init_rt_rq_p[NR_CPUS]; static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; (1) - these arrays are allocated via alloc_bootmem_low() * Change sched_domain_debug_one() to use cpulist_scnprintf instead of cpumask_scnprintf. This reduces the output buffer required and improves readability when large NR_CPU count machines arrive. * In sched_create_group() we allocate new arrays based on nr_cpu_ids. 
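(Illustrative sketch only, not part of the patch: the pattern this change applies is to size per-cpu pointer arrays by nr_cpu_ids, the count of CPU ids actually possible on the running system, rather than the compile-time maximum NR_CPUS. The struct foo names below are hypothetical.)

    #include <linux/slab.h>
    #include <linux/cpumask.h>
    #include <linux/errno.h>

    struct foo { int val; };

    static struct foo **foo_ptrs;   /* one pointer per possible CPU */

    static int foo_alloc(void)
    {
            /* nr_cpu_ids <= NR_CPUS, so this never over-allocates */
            foo_ptrs = kzalloc(nr_cpu_ids * sizeof(struct foo *), GFP_KERNEL);
            if (!foo_ptrs)
                    return -ENOMEM;
            return 0;
    }
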
Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 80 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e4bf4477aee..ef3f28b334e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -276,17 +277,11 @@ struct task_group { static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; - -static struct sched_entity *init_sched_entity_p[NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; #endif #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - -static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; -static struct rt_rq *init_rt_rq_p[NR_CPUS]; #endif /* task_group_lock serializes add/remove of task groups and also changes to @@ -310,17 +305,7 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group = { -#ifdef CONFIG_FAIR_GROUP_SCHED - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - .rt_se = init_sched_rt_entity_p, - .rt_rq = init_rt_rq_p, -#endif -}; +struct task_group init_task_group; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -3720,7 +3705,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) */ int ilb = first_cpu(nohz.cpu_mask); - if (ilb != NR_CPUS) + if (ilb < nr_cpu_ids) resched_cpu(ilb); } } @@ -5671,11 +5656,11 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) + if (dest_cpu >= nr_cpu_ids) dest_cpu = any_online_cpu(p->cpus_allowed); /* No more Mr. Nice Guy. */ - if (dest_cpu == NR_CPUS) { + if (dest_cpu >= nr_cpu_ids) { cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); /* * Try to stay on the same cpuset, where the @@ -6134,9 +6119,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) { struct sched_group *group = sd->groups; cpumask_t groupmask; - char str[NR_CPUS]; + char str[256]; - cpumask_scnprintf(str, NR_CPUS, sd->span); + cpulist_scnprintf(str, sizeof(str), sd->span); cpus_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6189,7 +6174,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) cpus_or(groupmask, groupmask, group->cpumask); - cpumask_scnprintf(str, NR_CPUS, group->cpumask); + cpulist_scnprintf(str, sizeof(str), group->cpumask); printk(KERN_CONT " %s", str); group = group->next; @@ -6601,7 +6586,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) * gets dynamically allocated. 
*/ static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; +static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); @@ -7244,6 +7229,11 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif get_online_cpus(); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); @@ -7261,6 +7251,11 @@ void __init sched_init_smp(void) #else void __init sched_init_smp(void) { +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -7358,6 +7353,35 @@ void __init sched_init(void) { int highest_cpu = 0; int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). + */ + if (alloc_size) { + ptr = (unsigned long)alloc_bootmem_low(alloc_size); + +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.rt_rq = (struct rt_rq **)ptr; +#endif + } #ifdef CONFIG_SMP init_defrootdomain(); @@ -7610,10 +7634,10 @@ static int alloc_fair_sched_group(struct task_group *tg) struct rq *rq; int i; - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); if (!tg->se) goto err; @@ -7695,10 +7719,10 @@ static int alloc_rt_sched_group(struct task_group *tg) struct rq *rq; int i; - tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_rq) goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_se) goto err; -- cgit v1.2.3-70-g09d2 From f70316dace2bb99730800d47044acb818c6735f6 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:06 -0700 Subject: generic: use new set_cpus_allowed_ptr function * Use new set_cpus_allowed_ptr() function added by previous patch, which instead of passing the "newly allowed cpus" cpumask_t arg by value, pass it by pointer: -int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) * Modify CPU_MASK_ALL Depends on: [sched-devel]: sched: add new set_cpus_allowed_ptr function Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- drivers/acpi/processor_throttling.c | 10 +++++----- drivers/firmware/dcdbas.c | 4 ++-- drivers/pci/pci-driver.c | 9 ++++++--- kernel/cpu.c | 6 +++--- kernel/kmod.c | 2 +- kernel/rcutorture.c | 
15 +++++++++------ kernel/stop_machine.c | 2 +- 7 files changed, 27 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index 1b8e592a824..0bba3a914e8 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -838,10 +838,10 @@ static int acpi_processor_get_throttling(struct acpi_processor *pr) * Migrate task to the cpu pointed by pr. */ saved_mask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(pr->id)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id)); ret = pr->throttling.acpi_processor_get_throttling(pr); /* restore the previous state */ - set_cpus_allowed(current, saved_mask); + set_cpus_allowed_ptr(current, &saved_mask); return ret; } @@ -1025,7 +1025,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state) * it can be called only for the cpu pointed by pr. */ if (p_throttling->shared_type == DOMAIN_COORD_TYPE_SW_ANY) { - set_cpus_allowed(current, cpumask_of_cpu(pr->id)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(pr->id)); ret = p_throttling->acpi_processor_set_throttling(pr, t_state.target_state); } else { @@ -1056,7 +1056,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state) continue; } t_state.cpu = i; - set_cpus_allowed(current, cpumask_of_cpu(i)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); ret = match_pr->throttling. acpi_processor_set_throttling( match_pr, t_state.target_state); @@ -1074,7 +1074,7 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, int state) &t_state); } /* restore the previous state */ - set_cpus_allowed(current, saved_mask); + set_cpus_allowed_ptr(current, &saved_mask); return ret; } diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c index 1636806ec55..0ffef3b7c6c 100644 --- a/drivers/firmware/dcdbas.c +++ b/drivers/firmware/dcdbas.c @@ -265,7 +265,7 @@ static int smi_request(struct smi_cmd *smi_cmd) /* SMI requires CPU 0 */ old_mask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(0)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(0)); if (smp_processor_id() != 0) { dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", __FUNCTION__); @@ -285,7 +285,7 @@ static int smi_request(struct smi_cmd *smi_cmd) ); out: - set_cpus_allowed(current, old_mask); + set_cpus_allowed_ptr(current, &old_mask); return ret; } diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index e571c72e675..e8d94fafc28 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -182,15 +182,18 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, struct mempolicy *oldpol; cpumask_t oldmask = current->cpus_allowed; int node = pcibus_to_node(dev->bus); - if (node >= 0 && node_online(node)) - set_cpus_allowed(current, node_to_cpumask(node)); + + if (node >= 0) { + node_to_cpumask_ptr(nodecpumask, node); + set_cpus_allowed_ptr(current, nodecpumask); + } /* And set default memory allocation policy */ oldpol = current->mempolicy; current->mempolicy = NULL; /* fall back to system default policy */ #endif error = drv->probe(dev, id); #ifdef CONFIG_NUMA - set_cpus_allowed(current, oldmask); + set_cpus_allowed_ptr(current, &oldmask); current->mempolicy = oldpol; #endif return error; diff --git a/kernel/cpu.c b/kernel/cpu.c index 2eff3f63abe..2011ad8d269 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not 
runnable on dying cpu */ old_allowed = current->cpus_allowed; - tmp = CPU_MASK_ALL; + cpus_setall(tmp); cpu_clear(cpu, tmp); - set_cpus_allowed(current, tmp); + set_cpus_allowed_ptr(current, &tmp); p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); @@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) out_thread: err = kthread_stop(p); out_allowed: - set_cpus_allowed(current, old_allowed); + set_cpus_allowed_ptr(current, &old_allowed); out_release: cpu_hotplug_done(); return err; diff --git a/kernel/kmod.c b/kernel/kmod.c index 22be3ff3f36..e2764047ec0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data) } /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed(current, CPU_MASK_ALL); + set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); /* * Our parent is keventd, which runs with elevated scheduling priority. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index fd599829e72..47894f919d4 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask = CPU_MASK_ALL; + cpumask_t tmp_mask; int i; + cpus_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ @@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void) if (rcu_idle_cpu != -1) cpu_clear(rcu_idle_cpu, tmp_mask); - set_cpus_allowed(current, tmp_mask); + set_cpus_allowed_ptr(current, &tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) - set_cpus_allowed(reader_tasks[i], tmp_mask); + set_cpus_allowed_ptr(reader_tasks[i], + &tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) - set_cpus_allowed(fakewriter_tasks[i], tmp_mask); + set_cpus_allowed_ptr(fakewriter_tasks[i], + &tmp_mask); } if (writer_task) - set_cpus_allowed(writer_task, tmp_mask); + set_cpus_allowed_ptr(writer_task, &tmp_mask); if (stats_task) - set_cpus_allowed(stats_task, tmp_mask); + set_cpus_allowed_ptr(stats_task, &tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70..e1b2a5b1b10 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -35,7 +35,7 @@ static int stopmachine(void *cpu) int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ -- cgit v1.2.3-70-g09d2 From f9a86fcbbb1e5542eabf45c9144ac4b6330861a4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:07 -0700 Subject: cpuset: modify cpuset_set_cpus_allowed to use cpumask pointer * Modify cpuset_cpus_allowed to return the currently allowed cpuset via a pointer argument instead of as the function return value. * Use new set_cpus_allowed_ptr function. * Cleanup CPU_MASK_ALL and NODE_MASK_ALL uses. 
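(Caller-side sketch, illustrative only; tsk and allowed are hypothetical locals. The point of the new signature is that the mask is written through a pointer argument instead of being copied through the return value, which matters once cpumask_t grows large, e.g. 512 bytes at NR_CPUS=4096.)

    cpumask_t allowed;

    cpuset_cpus_allowed(tsk, &allowed);     /* was: allowed = cpuset_cpus_allowed(tsk); */
    set_cpus_allowed_ptr(tsk, &allowed);
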
Depends on: [sched-devel]: sched: add new set_cpus_allowed_ptr function Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 13 +++++++------ kernel/cpuset.c | 31 ++++++++++++------------------- kernel/sched.c | 8 +++++--- mm/pdflush.c | 4 ++-- 4 files changed, 26 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 0a26be353cb..726761e2400 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,8 +20,8 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); -extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p); +extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask); +extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -84,13 +84,14 @@ static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) +static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask) { - return cpu_possible_map; + *mask = cpu_possible_map; } -static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p) +static inline void cpuset_cpus_allowed_locked(struct task_struct *p, + cpumask_t *mask) { - return cpu_possible_map; + *mask = cpu_possible_map; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a1b61f41422..6b9ac296a05 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -729,7 +729,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) */ void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); + set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -1178,7 +1178,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed(tsk, cpus); + set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); from = oldcs->mems_allowed; @@ -1555,8 +1555,8 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cs->cpus_allowed = CPU_MASK_NONE; - cs->mems_allowed = NODE_MASK_NONE; + cpus_clear(cs->cpus_allowed); + nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); @@ -1625,8 +1625,8 @@ int __init cpuset_init(void) { int err = 0; - top_cpuset.cpus_allowed = CPU_MASK_ALL; - top_cpuset.mems_allowed = NODE_MASK_ALL; + cpus_setall(top_cpuset.cpus_allowed); + nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; @@ -1844,6 +1844,7 @@ void __init cpuset_init_smp(void) * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 
* * Description: Returns the cpumask_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -1851,35 +1852,27 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) +void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - mutex_lock(&callback_mutex); - mask = cpuset_cpus_allowed_locked(tsk); + cpuset_cpus_allowed_locked(tsk, pmask); mutex_unlock(&callback_mutex); - - return mask; } /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), &mask); + guarantee_online_cpus(task_cs(tsk), pmask); task_unlock(tsk); - - return mask; } void cpuset_init_current_mems_allowed(void) { - current->mems_allowed = NODE_MASK_ALL; + nodes_setall(current->mems_allowed); } /** diff --git a/kernel/sched.c b/kernel/sched.c index ef3f28b334e..ccc23a9cd26 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4941,13 +4941,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) if (retval) goto out_unlock; - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: retval = set_cpus_allowed(p, new_mask); if (!retval) { - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); if (!cpus_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset @@ -5661,7 +5661,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + cpumask_t cpus_allowed; + + cpuset_cpus_allowed_locked(p, &cpus_allowed); /* * Try to stay on the same cpuset, where the * current cpuset may be a subset of all cpus. diff --git a/mm/pdflush.c b/mm/pdflush.c index 8f6ee073c0e..0ceacff5645 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -187,8 +187,8 @@ static int pdflush(void *dummy) * This is needed as pdflush's are dynamically created and destroyed. * The boottime pdflush's are easily placed w/o these 2 lines. */ - cpus_allowed = cpuset_cpus_allowed(current); - set_cpus_allowed(current, cpus_allowed); + cpuset_cpus_allowed(current, &cpus_allowed); + set_cpus_allowed_ptr(current, &cpus_allowed); return __pdflush(&my_work); } -- cgit v1.2.3-70-g09d2 From b53e921ba1cff8453dc9a87a84052fa12d5b30bd Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:08 -0700 Subject: generic: reduce stack pressure in sched_affinity * Modify sched_affinity functions to pass cpumask_t variables by reference instead of by value. * Use new set_cpus_allowed_ptr function. 
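(Sketch of the resulting call convention, mirroring the hunks below; pid and the mask value are hypothetical.)

    cpumask_t new_mask = cpumask_of_cpu(0);

    sched_setaffinity(pid, &new_mask);      /* was: sched_setaffinity(pid, new_mask) */

sched_setaffinity() still takes its own private copy (cpumask_t new_mask = *in_mask), so the extra by-value copy at each call site goes away.
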
Depends on: [sched-devel]: sched: add new set_cpus_allowed_ptr function Cc: Paul Jackson Cc: Cliff Wickman Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 46 ++++++++++++++++----------------- include/linux/sched.h | 2 +- kernel/compat.c | 2 +- kernel/rcupreempt.c | 4 +-- kernel/sched.c | 5 ++-- 5 files changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 32671da8184..7c9a813e119 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -251,18 +251,18 @@ struct threshold_attr { ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static cpumask_t affinity_set(unsigned int cpu) +static void affinity_set(unsigned int cpu, cpumask_t *oldmask, + cpumask_t *newmask) { - cpumask_t oldmask = current->cpus_allowed; - cpumask_t newmask = CPU_MASK_NONE; - cpu_set(cpu, newmask); - set_cpus_allowed(current, newmask); - return oldmask; + *oldmask = current->cpus_allowed; + cpus_clear(*newmask); + cpu_set(cpu, *newmask); + set_cpus_allowed_ptr(current, newmask); } -static void affinity_restore(cpumask_t oldmask) +static void affinity_restore(const cpumask_t *oldmask) { - set_cpus_allowed(current, oldmask); + set_cpus_allowed_ptr(current, oldmask); } #define SHOW_FIELDS(name) \ @@ -277,15 +277,15 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask; + cpumask_t oldmask, newmask; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - oldmask = affinity_set(b->cpu); + affinity_set(b->cpu, &oldmask, &newmask); threshold_restart_bank(b, 0, 0); - affinity_restore(oldmask); + affinity_restore(&oldmask); return end - buf; } @@ -294,7 +294,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask; + cpumask_t oldmask, newmask; u16 old; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) @@ -306,9 +306,9 @@ static ssize_t store_threshold_limit(struct threshold_block *b, old = b->threshold_limit; b->threshold_limit = new; - oldmask = affinity_set(b->cpu); + affinity_set(b->cpu, &oldmask, &newmask); threshold_restart_bank(b, 0, old); - affinity_restore(oldmask); + affinity_restore(&oldmask); return end - buf; } @@ -316,10 +316,10 @@ static ssize_t store_threshold_limit(struct threshold_block *b, static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 high, low; - cpumask_t oldmask; - oldmask = affinity_set(b->cpu); + cpumask_t oldmask, newmask; + affinity_set(b->cpu, &oldmask, &newmask); rdmsr(b->address, low, high); - affinity_restore(oldmask); + affinity_restore(&oldmask); return sprintf(buf, "%x\n", (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); } @@ -327,10 +327,10 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask; - oldmask = affinity_set(b->cpu); + cpumask_t oldmask, newmask; + affinity_set(b->cpu, &oldmask, &newmask); threshold_restart_bank(b, 1, 0); - affinity_restore(oldmask); + affinity_restore(&oldmask); return 1; } @@ -468,7 +468,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask = 
CPU_MASK_NONE; + cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -519,10 +519,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - oldmask = affinity_set(cpu); + affinity_set(cpu, &oldmask, &newmask); err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(oldmask); + affinity_restore(&oldmask); if (err) goto out_free; diff --git a/include/linux/sched.h b/include/linux/sched.h index be5d31752db..383502dfda1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2034,7 +2034,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) } #endif -extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); +extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); extern int sched_mc_power_savings, sched_smt_power_savings; diff --git a/kernel/compat.c b/kernel/compat.c index 9c48abfcd4a..e1ef04870c2 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -445,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e9517014b57..e1cdf196a51 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1007,10 +1007,10 @@ void __synchronize_sched(void) if (sched_getaffinity(0, &oldmask) < 0) oldmask = cpu_possible_map; for_each_online_cpu(cpu) { - sched_setaffinity(0, cpumask_of_cpu(cpu)); + sched_setaffinity(0, &cpumask_of_cpu(cpu)); schedule(); } - sched_setaffinity(0, oldmask); + sched_setaffinity(0, &oldmask); } EXPORT_SYMBOL_GPL(__synchronize_sched); diff --git a/kernel/sched.c b/kernel/sched.c index ccc23a9cd26..1a8252385c4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4908,9 +4908,10 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, cpumask_t new_mask) +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { cpumask_t cpus_allowed; + cpumask_t new_mask = *in_mask; struct task_struct *p; int retval; @@ -4991,7 +4992,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } /* -- cgit v1.2.3-70-g09d2 From c5f59f0833df945eef7ff35f3dc6ba61c5f293dd Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:10 -0700 Subject: nodemask: use new node_to_cpumask_ptr function * Use new node_to_cpumask_ptr. This creates a pointer to the cpumask for a given node. This definition is in mm patch: asm-generic-add-node_to_cpumask_ptr-macro.patch * Use new set_cpus_allowed_ptr function. Depends on: [mm-patch]: asm-generic-add-node_to_cpumask_ptr-macro.patch [sched-devel]: sched: add new set_cpus_allowed_ptr function [x86/latest]: x86: add cpus_scnprintf function Cc: Greg Kroah-Hartman Cc: Greg Banks Cc: H. 
Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- drivers/base/node.c | 7 ++++--- kernel/sched.c | 29 ++++++++++++++--------------- mm/page_alloc.c | 6 +++--- mm/slab.c | 5 ++--- mm/vmscan.c | 18 ++++++++---------- net/sunrpc/svc.c | 16 +++++++++++----- 6 files changed, 42 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/drivers/base/node.c b/drivers/base/node.c index e59861f18ce..8e3f25bb8f8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -22,14 +22,15 @@ static struct sysdev_class node_class = { static ssize_t node_read_cpumap(struct sys_device * dev, char * buf) { struct node *node_dev = to_node(dev); - cpumask_t mask = node_to_cpumask(node_dev->sysdev.id); + node_to_cpumask_ptr(mask, node_dev->sysdev.id); int len; /* 2004/06/03: buf currently PAGE_SIZE, need > 1 char per 4 bits. */ BUILD_BUG_ON(MAX_NUMNODES/4 > PAGE_SIZE/2); - len = cpumask_scnprintf(buf, PAGE_SIZE-1, mask); - len += sprintf(buf + len, "\n"); + len = cpumask_scnprintf(buf, PAGE_SIZE-2, *mask); + buf[len++] = '\n'; + buf[len] = '\0'; return len; } diff --git a/kernel/sched.c b/kernel/sched.c index 1a8252385c4..9f7980f8ec0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6448,7 +6448,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, * * Should use nodemask_t. */ -static int find_next_best_node(int node, unsigned long *used_nodes) +static int find_next_best_node(int node, nodemask_t *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -6462,7 +6462,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) continue; /* Skip already used nodes */ - if (test_bit(n, used_nodes)) + if (node_isset(n, *used_nodes)) continue; /* Simple min distance search */ @@ -6474,14 +6474,13 @@ static int find_next_best_node(int node, unsigned long *used_nodes) } } - set_bit(best_node, used_nodes); + node_set(best_node, *used_nodes); return best_node; } /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span * * Given a node, construct a good cpumask for its sched_domain to span. 
It * should be one that prevents unnecessary balancing, but also spreads tasks @@ -6489,22 +6488,22 @@ static int find_next_best_node(int node, unsigned long *used_nodes) */ static cpumask_t sched_domain_node_span(int node) { - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - cpumask_t span, nodemask; + nodemask_t used_nodes; + cpumask_t span; + node_to_cpumask_ptr(nodemask, node); int i; cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); + nodes_clear(used_nodes); - nodemask = node_to_cpumask(node); - cpus_or(span, span, nodemask); - set_bit(node, used_nodes); + cpus_or(span, span, *nodemask); + node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); + int next_node = find_next_best_node(node, &used_nodes); - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); + node_to_cpumask_ptr_next(nodemask, next_node); + cpus_or(span, span, *nodemask); } return span; @@ -6901,6 +6900,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (j = 0; j < MAX_NUMNODES; j++) { cpumask_t tmp, notcovered; int n = (i + j) % MAX_NUMNODES; + node_to_cpumask_ptr(pnodemask, n); cpus_complement(notcovered, covered); cpus_and(tmp, notcovered, *cpu_map); @@ -6908,8 +6908,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(tmp)) break; - nodemask = node_to_cpumask(n); - cpus_and(tmp, tmp, nodemask); + cpus_and(tmp, tmp, *pnodemask); if (cpus_empty(tmp)) continue; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 402a504f122..32e796af12a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2029,6 +2029,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = -1; + node_to_cpumask_ptr(tmp, 0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -2037,7 +2038,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) } for_each_node_state(n, N_HIGH_MEMORY) { - cpumask_t tmp; /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) @@ -2050,8 +2050,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - tmp = node_to_cpumask(n); - if (!cpus_empty(tmp)) + node_to_cpumask_ptr_next(tmp, n); + if (!cpus_empty(*tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ diff --git a/mm/slab.c b/mm/slab.c index 04b308c3bc5..03927cb5ec9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1160,14 +1160,13 @@ static void __cpuinit cpuup_canceled(long cpu) struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); + node_to_cpumask_ptr(mask, node); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; struct array_cache *shared; struct array_cache **alien; - cpumask_t mask; - mask = node_to_cpumask(node); /* cpu is dead; no one can alloc from it. 
*/ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; @@ -1183,7 +1182,7 @@ static void __cpuinit cpuup_canceled(long cpu) if (nc) free_block(cachep, nc->entry, nc->avail, node); - if (!cpus_empty(mask)) { + if (!cpus_empty(*mask)) { spin_unlock_irq(&l3->list_lock); goto free_array_cache; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 4046434046e..f80a5b7c057 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1647,11 +1647,10 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - cpumask_t cpumask; + node_to_cpumask_ptr(cpumask, pgdat->node_id); - cpumask = node_to_cpumask(pgdat->node_id); - if (!cpus_empty(cpumask)) - set_cpus_allowed(tsk, cpumask); + if (!cpus_empty(*cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; /* @@ -1880,17 +1879,16 @@ out: static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - pg_data_t *pgdat; - cpumask_t mask; int nid; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { - pgdat = NODE_DATA(nid); - mask = node_to_cpumask(pgdat->node_id); - if (any_online_cpu(mask) != NR_CPUS) + pg_data_t *pgdat = NODE_DATA(nid); + node_to_cpumask_ptr(mask, pgdat->node_id); + + if (any_online_cpu(*mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ - set_cpus_allowed(pgdat->kswapd, mask); + set_cpus_allowed_ptr(pgdat->kswapd, mask); } } return NOTIFY_OK; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a290e152329..090af78d68b 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -301,7 +301,6 @@ static inline int svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) { struct svc_pool_map *m = &svc_pool_map; - unsigned int node; /* or cpu */ /* * The caller checks for sv_nrpools > 1, which @@ -314,16 +313,23 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) default: return 0; case SVC_POOL_PERCPU: - node = m->pool_to[pidx]; + { + unsigned int cpu = m->pool_to[pidx]; + *oldmask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(node)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); return 1; + } case SVC_POOL_PERNODE: - node = m->pool_to[pidx]; + { + unsigned int node = m->pool_to[pidx]; + node_to_cpumask_ptr(nodecpumask, node); + *oldmask = current->cpus_allowed; - set_cpus_allowed(current, node_to_cpumask(node)); + set_cpus_allowed_ptr(current, nodecpumask); return 1; } + } } /* -- cgit v1.2.3-70-g09d2 From 7c16ec585c558960a508ccf9a08fcb9ed49b3754 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:11 -0700 Subject: cpumask: reduce stack usage in SD_x_INIT initializers * Remove empty cpumask_t (and all non-zero/non-null) variables in SD_*_INIT macros. Use memset(0) to clear. Also, don't inline the initializer functions to save on stack space in build_sched_domains(). * Merge change to include/linux/topology.h that uses the new node_to_cpumask_ptr function in the nr_cpus_node macro into this patch. Depends on: [mm-patch]: asm-generic-add-node_to_cpumask_ptr-macro.patch [sched-devel]: sched: add new set_cpus_allowed_ptr function Cc: H. 
Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/asm-x86/topology.h | 5 - include/linux/topology.h | 46 +----- kernel/sched.c | 368 ++++++++++++++++++++++++++++++--------------- 3 files changed, 256 insertions(+), 163 deletions(-) (limited to 'kernel') diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index b167ca90f96..9ef74c5d5ad 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -154,10 +154,6 @@ extern unsigned long node_remap_size[]; /* sched_domains SD_NODE_INIT for NUMAQ machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 8, \ .max_interval = 32, \ .busy_factor = 32, \ @@ -175,7 +171,6 @@ extern unsigned long node_remap_size[]; | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #ifdef CONFIG_X86_64_ACPI_NUMA diff --git a/include/linux/topology.h b/include/linux/topology.h index bd14f8b30f0..4bb7074a2c3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -38,16 +38,15 @@ #endif #ifndef nr_cpus_node -#define nr_cpus_node(node) \ - ({ \ - cpumask_t __tmp__; \ - __tmp__ = node_to_cpumask(node); \ - cpus_weight(__tmp__); \ +#define nr_cpus_node(node) \ + ({ \ + node_to_cpumask_ptr(__tmp__, node); \ + cpus_weight(*__tmp__); \ }) #endif -#define for_each_node_with_cpus(node) \ - for_each_online_node(node) \ +#define for_each_node_with_cpus(node) \ + for_each_online_node(node) \ if (nr_cpus_node(node)) void arch_update_cpu_topology(void); @@ -80,7 +79,9 @@ void arch_update_cpu_topology(void); * by defining their own arch-specific initializer in include/asm/topology.h. * A definition there will automagically override these default initializers * and allow arch-specific performance tuning of sched_domains. + * (Only non-zero and non-null fields need be specified.) */ + #ifdef CONFIG_SCHED_SMT /* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, * so can't we drop this in favor of CONFIG_SCHED_SMT? @@ -89,20 +90,10 @@ void arch_update_cpu_topology(void); /* Common values for SMT siblings */ #ifndef SD_SIBLING_INIT #define SD_SIBLING_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 1, \ .max_interval = 2, \ .busy_factor = 64, \ .imbalance_pct = 110, \ - .cache_nice_tries = 0, \ - .busy_idx = 0, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_FORK \ @@ -112,7 +103,6 @@ void arch_update_cpu_topology(void); | SD_SHARE_CPUPOWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -121,18 +111,12 @@ void arch_update_cpu_topology(void); /* Common values for MC siblings. 
for now mostly derived from SD_CPU_INIT */ #ifndef SD_MC_INIT #define SD_MC_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 1, \ .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ @@ -144,7 +128,6 @@ void arch_update_cpu_topology(void); | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #endif #endif /* CONFIG_SCHED_MC */ @@ -152,10 +135,6 @@ void arch_update_cpu_topology(void); /* Common values for CPUs */ #ifndef SD_CPU_INIT #define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 1, \ .max_interval = 4, \ .busy_factor = 64, \ @@ -174,16 +153,11 @@ void arch_update_cpu_topology(void); | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #endif /* sched_domains SD_ALLNODES_INIT for NUMA machines */ #define SD_ALLNODES_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 64, \ .max_interval = 64*num_online_cpus(), \ .busy_factor = 128, \ @@ -191,14 +165,10 @@ void arch_update_cpu_topology(void); .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 3, \ - .newidle_idx = 0, /* unused */ \ - .wake_idx = 0, /* unused */ \ - .forkexec_idx = 0, /* unused */ \ .flags = SD_LOAD_BALANCE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ - .nr_balance_failed = 0, \ } #ifdef CONFIG_NUMA diff --git a/kernel/sched.c b/kernel/sched.c index 9f7980f8ec0..6809178eaa9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1869,17 +1869,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. 
*/ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + cpumask_t *tmp) { - cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(tmp, group->cpumask, p->cpus_allowed); + cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1918,7 +1918,7 @@ static int sched_balance_self(int cpu, int flag) } while (sd) { - cpumask_t span; + cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -1934,7 +1934,7 @@ static int sched_balance_self(int cpu, int flag) continue; } - new_cpu = find_idlest_cpu(group, t, cpu); + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2818,7 +2818,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + int *sd_idle, const cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3119,7 +3119,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, cpumask_t *cpus) + unsigned long imbalance, const cpumask_t *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -3158,15 +3158,16 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *balance, cpumask_t *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; - cpumask_t cpus = CPU_MASK_ALL; unsigned long flags; + cpus_setall(*cpus); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3181,7 +3182,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus, balance); + cpus, balance); if (*balance == 0) goto out_balanced; @@ -3191,7 +3192,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, &cpus); + busiest = find_busiest_queue(group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -3224,8 +3225,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; goto out_balanced; } @@ -3310,7 +3311,8 @@ out_one_pinned: * this_rq is locked. 
*/ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + cpumask_t *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3318,7 +3320,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; - cpumask_t cpus = CPU_MASK_ALL; + + cpus_setall(*cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3333,14 +3336,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, &cpus, NULL); + &sd_idle, cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, - &cpus); + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; @@ -3362,8 +3364,8 @@ redo: spin_unlock(&busiest->lock); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; } } @@ -3397,6 +3399,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; + cpumask_t tmpmask; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3406,8 +3409,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, - this_rq, sd); + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, &tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3566,6 +3569,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t tmp; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3589,7 +3593,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -4945,7 +4949,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: - retval = set_cpus_allowed(p, new_mask); + retval = set_cpus_allowed_ptr(p, &new_mask); if (!retval) { cpuset_cpus_allowed(p, &cpus_allowed); @@ -5700,7 +5704,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); unsigned long flags; local_irq_save(flags); @@ -6118,14 +6122,14 @@ EXPORT_SYMBOL(nr_cpu_ids); #ifdef CONFIG_SCHED_DEBUG -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + cpumask_t *groupmask) { 
struct sched_group *group = sd->groups; - cpumask_t groupmask; char str[256]; cpulist_scnprintf(str, sizeof(str), sd->span); - cpus_clear(groupmask); + cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6169,13 +6173,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) break; } - if (cpus_intersects(groupmask, group->cpumask)) { + if (cpus_intersects(*groupmask, group->cpumask)) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(groupmask, groupmask, group->cpumask); + cpus_or(*groupmask, *groupmask, group->cpumask); cpulist_scnprintf(str, sizeof(str), group->cpumask); printk(KERN_CONT " %s", str); @@ -6184,10 +6188,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, groupmask)) + if (!cpus_equal(sd->span, *groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6195,6 +6199,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) static void sched_domain_debug(struct sched_domain *sd, int cpu) { + cpumask_t *groupmask; int level = 0; if (!sd) { @@ -6204,14 +6209,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!groupmask) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + for (;;) { - if (sched_domain_debug_one(sd, cpu, level)) + if (sched_domain_debug_one(sd, cpu, level, groupmask)) break; level++; sd = sd->parent; if (!sd) break; } + kfree(groupmask); } #else # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6399,30 +6411,33 @@ __setup("isolcpus=", isolated_cpu_setup); * and ->cpu_power to 0. 
*/ static void -init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int (*group_fn)(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg)) + struct sched_group **sg, + cpumask_t *tmpmask), + cpumask_t *covered, cpumask_t *tmpmask) { struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; int i; - for_each_cpu_mask(i, span) { + cpus_clear(*covered); + + for_each_cpu_mask(i, *span) { struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg); + int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, covered)) + if (cpu_isset(i, *covered)) continue; - sg->cpumask = CPU_MASK_NONE; + cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map, NULL) != group) + for_each_cpu_mask(j, *span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, covered); + cpu_set(j, *covered); cpu_set(j, sg->cpumask); } if (!first) @@ -6520,7 +6535,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu); @@ -6538,19 +6554,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); if (sg) *sg = &per_cpu(sched_group_core, group); return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu); @@ -6562,17 +6581,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; #ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = cpu_coregroup_map(cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #else group = cpu; #endif @@ -6594,13 +6614,13 @@ static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) + struct sched_group **sg, cpumask_t *nodemask) { - cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); int group; - cpus_and(nodemask, nodemask, 
*cpu_map); - group = first_cpu(nodemask); + *nodemask = node_to_cpumask(cpu_to_node(cpu)); + cpus_and(*nodemask, *nodemask, *cpu_map); + group = first_cpu(*nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group); @@ -6636,7 +6656,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; @@ -6648,11 +6668,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) continue; for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; if (sg == NULL) @@ -6670,7 +6690,7 @@ next_sg: } } #else -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } #endif @@ -6727,6 +6747,65 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) } while (group != child->groups); } +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#define SD_INIT(sd, type) sd_init_##type(sd) +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +/* + * To minimize stack usage kmalloc room for cpumasks and share the + * space as the usage in build_sched_domains() dictates. Used only + * if the amount of space is significant. 
+ */ +struct allmasks { + cpumask_t tmpmask; /* make this one first */ + union { + cpumask_t nodemask; + cpumask_t this_sibling_map; + cpumask_t this_core_map; + }; + cpumask_t send_covered; + +#ifdef CONFIG_NUMA + cpumask_t domainspan; + cpumask_t covered; + cpumask_t notcovered; +#endif +}; + +#if NR_CPUS > 128 +#define SCHED_CPUMASK_ALLOC 1 +#define SCHED_CPUMASK_FREE(v) kfree(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#else +#define SCHED_CPUMASK_ALLOC 0 +#define SCHED_CPUMASK_FREE(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#endif + +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ + ((unsigned long)(a) + offsetof(struct allmasks, v)) + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -6735,6 +6814,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) { int i; struct root_domain *rd; + SCHED_CPUMASK_DECLARE(allmasks); + cpumask_t *tmpmask; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6748,38 +6829,60 @@ static int build_sched_domains(const cpumask_t *cpu_map) printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; } - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif return -ENOMEM; } +#if SCHED_CPUMASK_ALLOC + /* get space for all scratch cpumask variables */ + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + if (!allmasks) { + printk(KERN_WARNING "Cannot alloc cpumask array\n"); + kfree(rd); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + return -ENOMEM; + } +#endif + tmpmask = (cpumask_t *)allmasks; + + +#ifdef CONFIG_NUMA + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif + /* * Set up domains for cpus specified by the cpu_map. 
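To make the SCHED_CPUMASK_VAR() declarations in the loops below easier to read: using only the struct and macro defined above (no new names), the first use expands roughly to

	/* SCHED_CPUMASK_VAR(nodemask, allmasks) */
	cpumask_t *nodemask = (cpumask_t *)((unsigned long)(allmasks) +
				offsetof(struct allmasks, nodemask));

i.e. each scratch "variable" is just a pointer into the single struct allmasks, which is kmalloc()ed when NR_CPUS > 128 and sits on the stack otherwise, so the large cpumask_t temporaries never occupy build_sched_domains()'s own stack frame in the big-NR_CPUS case.
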
*/ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + SCHED_CPUMASK_VAR(nodemask, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); + *nodemask = node_to_cpumask(cpu_to_node(i)); + cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); - *sd = SD_ALLNODES_INIT; + SD_INIT(sd, ALLNODES); sd->span = *cpu_map; - cpu_to_allnodes_group(i, cpu_map, &sd->groups); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; } else p = NULL; sd = &per_cpu(node_domains, i); - *sd = SD_NODE_INIT; + SD_INIT(sd, NODE); sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; if (p) @@ -6789,94 +6892,114 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - *sd = SD_CPU_INIT; - sd->span = nodemask; + SD_INIT(sd, CPU); + sd->span = *nodemask; sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups); + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - *sd = SD_MC_INIT; + SD_INIT(sd, MC); sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups); + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - *sd = SD_SIBLING_INIT; + SD_INIT(sd, SIBLING); sd->span = per_cpu(cpu_sibling_map, i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups); + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); - if (i != first_cpu(this_sibling_map)) + SCHED_CPUMASK_VAR(this_sibling_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_sibling_map = per_cpu(cpu_sibling_map, i); + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); + if (i != first_cpu(*this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + &cpu_to_cpu_group, + send_covered, tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_core_map = cpu_coregroup_map(i); - cpus_and(this_core_map, this_core_map, *cpu_map); - if (i != first_cpu(this_core_map)) + SCHED_CPUMASK_VAR(this_core_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_core_map = cpu_coregroup_map(i); + cpus_and(*this_core_map, *this_core_map, *cpu_map); + if (i != first_cpu(*this_core_map)) continue; + init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + &cpu_to_core_group, + send_covered, tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); + 
init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) { + SCHED_CPUMASK_VAR(send_covered, allmasks); + + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - cpumask_t nodemask = node_to_cpumask(i); - cpumask_t domainspan; - cpumask_t covered = CPU_MASK_NONE; + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(domainspan, allmasks); + SCHED_CPUMASK_VAR(covered, allmasks); int j; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) { + *nodemask = node_to_cpumask(i); + cpus_clear(*covered); + + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) { sched_group_nodes[i] = NULL; continue; } - domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); + *domainspan = sched_domain_node_span(i); + cpus_and(*domainspan, *domainspan, *cpu_map); sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); if (!sg) { @@ -6885,31 +7008,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, nodemask) { + for_each_cpu_mask(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = nodemask; + sg->cpumask = *nodemask; sg->next = sg; - cpus_or(covered, covered, nodemask); + cpus_or(*covered, *covered, *nodemask); prev = sg; for (j = 0; j < MAX_NUMNODES; j++) { - cpumask_t tmp, notcovered; + SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % MAX_NUMNODES; node_to_cpumask_ptr(pnodemask, n); - cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); - cpus_and(tmp, tmp, domainspan); - if (cpus_empty(tmp)) + cpus_complement(*notcovered, *covered); + cpus_and(*tmpmask, *notcovered, *cpu_map); + cpus_and(*tmpmask, *tmpmask, *domainspan); + if (cpus_empty(*tmpmask)) break; - cpus_and(tmp, tmp, *pnodemask); - if (cpus_empty(tmp)) + cpus_and(*tmpmask, *tmpmask, *pnodemask); + if (cpus_empty(*tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group), @@ -6920,9 +7043,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sg->__cpu_power = 0; - sg->cpumask = tmp; + sg->cpumask = *tmpmask; sg->next = prev->next; - cpus_or(covered, covered, tmp); + cpus_or(*covered, *covered, *tmpmask); prev->next = sg; prev = sg; } @@ -6958,7 +7081,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + tmpmask); init_numa_sched_groups_power(sg); } #endif @@ -6976,11 +7100,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpu_attach_domain(sd, rd, i); } + SCHED_CPUMASK_FREE((void *)allmasks); return 0; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); + SCHED_CPUMASK_FREE((void *)allmasks); return -ENOMEM; #endif } @@ -7020,9 +7146,10 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map, + cpumask_t *tmpmask) { - free_sched_groups(cpu_map); + 
free_sched_groups(cpu_map, tmpmask); } /* @@ -7031,6 +7158,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) */ static void detach_destroy_domains(const cpumask_t *cpu_map) { + cpumask_t tmpmask; int i; unregister_sched_domain_sysctl(); @@ -7038,7 +7166,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map); + arch_destroy_sched_domains(cpu_map, &tmpmask); } /* @@ -7246,7 +7374,7 @@ void __init sched_init_smp(void) hotcpu_notifier(update_sched_domains, 0); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) BUG(); sched_init_granularity(); } -- cgit v1.2.3-70-g09d2 From 39106dcf85285e78f3b290022122c76f851379b8 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 8 Apr 2008 11:43:03 -0700 Subject: cpumask: use new cpus_scnprintf function * Cleaned up references to cpumask_scnprintf() and added new cpulist_scnprintf() interfaces where appropriate. * Fix some small bugs (or code efficiency improvments) for various uses of cpumask_scnprintf. * Clean up some checkpatch errors. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- drivers/base/node.c | 24 +++++++++++++++++++----- drivers/base/topology.c | 41 ++++++++++++++++++++++++++++++++++------- drivers/pci/pci-sysfs.c | 20 ++++++++++++++++++-- drivers/pci/probe.c | 27 +++++++++++++++++++++++---- kernel/cpuset.c | 8 ++++++++ kernel/sched_stats.h | 8 ++++++-- 6 files changed, 108 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/drivers/base/node.c b/drivers/base/node.c index 8e3f25bb8f8..12fde2d03d6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -19,22 +19,34 @@ static struct sysdev_class node_class = { }; -static ssize_t node_read_cpumap(struct sys_device * dev, char * buf) +static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf) { struct node *node_dev = to_node(dev); node_to_cpumask_ptr(mask, node_dev->sysdev.id); int len; - /* 2004/06/03: buf currently PAGE_SIZE, need > 1 char per 4 bits. */ - BUILD_BUG_ON(MAX_NUMNODES/4 > PAGE_SIZE/2); + /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */ + BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1)); - len = cpumask_scnprintf(buf, PAGE_SIZE-2, *mask); + len = type? 
+ cpulist_scnprintf(buf, PAGE_SIZE-2, *mask): + cpumask_scnprintf(buf, PAGE_SIZE-2, *mask); buf[len++] = '\n'; buf[len] = '\0'; return len; } -static SYSDEV_ATTR(cpumap, S_IRUGO, node_read_cpumap, NULL); +static inline ssize_t node_read_cpumask(struct sys_device *dev, char *buf) +{ + return node_read_cpumap(dev, 0, buf); +} +static inline ssize_t node_read_cpulist(struct sys_device *dev, char *buf) +{ + return node_read_cpumap(dev, 1, buf); +} + +static SYSDEV_ATTR(cpumap, S_IRUGO, node_read_cpumask, NULL); +static SYSDEV_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL); #define K(x) ((x) << (PAGE_SHIFT - 10)) static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) @@ -150,6 +162,7 @@ int register_node(struct node *node, int num, struct node *parent) if (!error){ sysdev_create_file(&node->sysdev, &attr_cpumap); + sysdev_create_file(&node->sysdev, &attr_cpulist); sysdev_create_file(&node->sysdev, &attr_meminfo); sysdev_create_file(&node->sysdev, &attr_numastat); sysdev_create_file(&node->sysdev, &attr_distance); @@ -167,6 +180,7 @@ int register_node(struct node *node, int num, struct node *parent) void unregister_node(struct node *node) { sysdev_remove_file(&node->sysdev, &attr_cpumap); + sysdev_remove_file(&node->sysdev, &attr_cpulist); sysdev_remove_file(&node->sysdev, &attr_meminfo); sysdev_remove_file(&node->sysdev, &attr_numastat); sysdev_remove_file(&node->sysdev, &attr_distance); diff --git a/drivers/base/topology.c b/drivers/base/topology.c index e1d3ad4db2f..fdf4044d2e7 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -40,15 +40,38 @@ static ssize_t show_##name(struct sys_device *dev, char *buf) \ return sprintf(buf, "%d\n", topology_##name(cpu)); \ } -#define define_siblings_show_func(name) \ -static ssize_t show_##name(struct sys_device *dev, char *buf) \ +static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) +{ + ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; + int n = 0; + + if (len > 1) { + n = type? 
+ cpulist_scnprintf(buf, len-2, *mask): + cpumask_scnprintf(buf, len-2, *mask); + buf[n++] = '\n'; + buf[n] = '\0'; + } + return n; +} + +#define define_siblings_show_map(name) \ +static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ - ssize_t len = -1; \ unsigned int cpu = dev->id; \ - len = cpumask_scnprintf(buf, NR_CPUS+1, topology_##name(cpu)); \ - return (len + sprintf(buf + len, "\n")); \ + return show_cpumap(0, &(topology_##name(cpu)), buf); \ } +#define define_siblings_show_list(name) \ +static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + return show_cpumap(1, &(topology_##name(cpu)), buf); \ +} + +#define define_siblings_show_func(name) \ + define_siblings_show_map(name); define_siblings_show_list(name) + #ifdef topology_physical_package_id define_id_show_func(physical_package_id); define_one_ro(physical_package_id); @@ -68,7 +91,9 @@ define_one_ro(core_id); #ifdef topology_thread_siblings define_siblings_show_func(thread_siblings); define_one_ro(thread_siblings); -#define ref_thread_siblings_attr &attr_thread_siblings.attr, +define_one_ro(thread_siblings_list); +#define ref_thread_siblings_attr \ + &attr_thread_siblings.attr, &attr_thread_siblings_list.attr, #else #define ref_thread_siblings_attr #endif @@ -76,7 +101,9 @@ define_one_ro(thread_siblings); #ifdef topology_core_siblings define_siblings_show_func(core_siblings); define_one_ro(core_siblings); -#define ref_core_siblings_attr &attr_core_siblings.attr, +define_one_ro(core_siblings_list); +#define ref_core_siblings_attr \ + &attr_core_siblings.attr, &attr_core_siblings_list.attr, #else #define ref_core_siblings_attr #endif diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 8dcf1458aa2..8d9d648daeb 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -73,8 +73,23 @@ static ssize_t local_cpus_show(struct device *dev, mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask); - strcat(buf,"\n"); - return 1+len; + buf[len++] = '\n'; + buf[len] = '\0'; + return len; +} + + +static ssize_t local_cpulist_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + cpumask_t mask; + int len; + + mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); + len = cpulist_scnprintf(buf, PAGE_SIZE-2, mask); + buf[len++] = '\n'; + buf[len] = '\0'; + return len; } /* show resources */ @@ -201,6 +216,7 @@ struct device_attribute pci_dev_attrs[] = { __ATTR_RO(class), __ATTR_RO(irq), __ATTR_RO(local_cpus), + __ATTR_RO(local_cpulist), __ATTR_RO(modalias), #ifdef CONFIG_NUMA __ATTR_RO(numa_node), diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 2db2e4bb0d1..4b3011a23ef 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -82,6 +82,7 @@ void pci_remove_legacy_files(struct pci_bus *bus) { return; } * PCI Bus Class Devices */ static ssize_t pci_bus_show_cpuaffinity(struct device *dev, + int type, struct device_attribute *attr, char *buf) { @@ -89,12 +90,30 @@ static ssize_t pci_bus_show_cpuaffinity(struct device *dev, cpumask_t cpumask; cpumask = pcibus_to_cpumask(to_pci_bus(dev)); - ret = cpumask_scnprintf(buf, PAGE_SIZE, cpumask); - if (ret < PAGE_SIZE) - buf[ret++] = '\n'; + ret = type? 
+ cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask): + cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask); + buf[ret++] = '\n'; + buf[ret] = '\0'; return ret; } -DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpuaffinity, NULL); + +static ssize_t inline pci_bus_show_cpumaskaffinity(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return pci_bus_show_cpuaffinity(dev, 0, attr, buf); +} + +static ssize_t inline pci_bus_show_cpulistaffinity(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return pci_bus_show_cpuaffinity(dev, 1, attr, buf); +} + +DEVICE_ATTR(cpuaffinity, S_IRUGO, pci_bus_show_cpumaskaffinity, NULL); +DEVICE_ATTR(cpulistaffinity, S_IRUGO, pci_bus_show_cpulistaffinity, NULL); /* * PCI Bus Class diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6b9ac296a05..b0c870b2ac3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2254,8 +2254,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, task->cpus_allowed); seq_printf(m, "\n"); + seq_printf(m, "Cpus_allowed_list:\t"); + m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, + task->cpus_allowed); + seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, task->mems_allowed); seq_printf(m, "\n"); + seq_printf(m, "Mems_allowed_list:\t"); + m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, + task->mems_allowed); + seq_printf(m, "\n"); } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5b32433e7ee..5bae2e0c3ff 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,6 +9,11 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; + int mask_len = NR_CPUS/32 * 9; + char *mask_str = kmalloc(mask_len, GFP_KERNEL); + + if (mask_str == NULL) + return -ENOMEM; seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); seq_printf(seq, "timestamp %lu\n", jiffies); @@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v) preempt_disable(); for_each_domain(cpu, sd) { enum cpu_idle_type itype; - char mask_str[NR_CPUS]; - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + cpumask_scnprintf(mask_str, mask_len, sd->span); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { -- cgit v1.2.3-70-g09d2 From 4bdbaad33d0f4d0e9818a38a825f5b75c0296a28 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Apr 2008 16:35:52 -0700 Subject: sched: remove another cpumask_t variable from stack * Remove another cpumask_t variable from stack that was missed in the last kernel_sched_c updates. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6809178eaa9..b56d98b0126 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6501,27 +6501,24 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. 
*/ -static cpumask_t sched_domain_node_span(int node) +static void sched_domain_node_span(int node, cpumask_t *span) { nodemask_t used_nodes; - cpumask_t span; node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(span); + cpus_clear(*span); nodes_clear(used_nodes); - cpus_or(span, span, *nodemask); + cpus_or(*span, *span, *nodemask); node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); node_to_cpumask_ptr_next(nodemask, next_node); - cpus_or(span, span, *nodemask); + cpus_or(*span, *span, *nodemask); } - - return span; } #endif @@ -6883,7 +6880,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); - sd->span = sched_domain_node_span(cpu_to_node(i)); + sched_domain_node_span(cpu_to_node(i), &sd->span); sd->parent = p; if (p) p->child = sd; @@ -6998,7 +6995,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) continue; } - *domainspan = sched_domain_node_span(i); + sched_domain_node_span(i, domainspan); cpus_and(*domainspan, *domainspan, *cpu_map); sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); -- cgit v1.2.3-70-g09d2 From e0982e90cd1ecf59818b137386b7f63debded9cc Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 26 Mar 2008 14:23:48 -0700 Subject: init: move setup of nr_cpu_ids to as early as possible Move the setting of nr_cpu_ids from sched_init() to start_kernel() so that it's available as early as possible. Note that an arch has the option of setting it even earlier if need be, but it should not result in a different value than the setup_nr_cpu_ids() function. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- init/main.c | 17 +++++++++++++++++ kernel/sched.c | 7 ------- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/init/main.c b/init/main.c index 2df3f0617fd..833a67df1f7 100644 --- a/init/main.c +++ b/init/main.c @@ -359,6 +359,7 @@ static void __init smp_init(void) #endif static inline void setup_per_cpu_areas(void) { } +static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } #else @@ -368,6 +369,21 @@ cpumask_t cpu_mask_all __read_mostly = CPU_MASK_ALL; EXPORT_SYMBOL(cpu_mask_all); #endif +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +static void __init setup_nr_cpu_ids(void) +{ + int cpu, highest_cpu = 0; + + for_each_possible_cpu(cpu) + highest_cpu = cpu; + + nr_cpu_ids = highest_cpu + 1; +} + #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; @@ -542,6 +558,7 @@ asmlinkage void __init start_kernel(void) setup_command_line(command_line); unwind_setup(); setup_per_cpu_areas(); + setup_nr_cpu_ids(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ /* diff --git a/kernel/sched.c b/kernel/sched.c index b56d98b0126..6ab0fcbf26e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6116,10 +6116,6 @@ void __init migration_init(void) #ifdef CONFIG_SMP -/* Number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; -EXPORT_SYMBOL(nr_cpu_ids); - #ifdef CONFIG_SCHED_DEBUG static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, @@ -7478,7 +7474,6 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, void __init sched_init(void) { - int highest_cpu = 0; int i, j; unsigned long 
alloc_size = 0, ptr; @@ -7569,7 +7564,6 @@ void __init sched_init(void) #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - highest_cpu = i; } set_load_weight(&init_task); @@ -7579,7 +7573,6 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); #endif -- cgit v1.2.3-70-g09d2 From cd8ba7cd9be0192348c2836cb6645d9b2cd2bfd2 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 26 Mar 2008 14:23:49 -0700 Subject: sched: add new set_cpus_allowed_ptr function Add a new function that accepts a pointer to the "newly allowed cpus" cpumask argument. int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) The current set_cpus_allowed() function is modified to use the above but this does not result in an ABI change. And with some compiler optimization help, it may not introduce any additional overhead. Additionally, to enforce the read only nature of the new_mask arg, the "const" property is migrated to sub-functions called by set_cpus_allowed. This silences compiler warnings. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/linux/sched.h | 15 +++++++++++---- kernel/sched.c | 16 ++++++++-------- kernel/sched_rt.c | 3 ++- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 383502dfda1..79c025c3b62 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -889,7 +889,8 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_new) (struct rq *rq, struct task_struct *p); - void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); + void (*set_cpus_allowed)(struct task_struct *p, + const cpumask_t *newmask); void (*join_domain)(struct rq *rq); void (*leave_domain)(struct rq *rq); @@ -1502,15 +1503,21 @@ static inline void put_task_struct(struct task_struct *t) #define used_math() tsk_used_math(current) #ifdef CONFIG_SMP -extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); +extern int set_cpus_allowed_ptr(struct task_struct *p, + const cpumask_t *new_mask); #else -static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +static inline int set_cpus_allowed_ptr(struct task_struct *p, + const cpumask_t *new_mask) { - if (!cpu_isset(0, new_mask)) + if (!cpu_isset(0, *new_mask)) return -EINVAL; return 0; } #endif +static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +{ + return set_cpus_allowed_ptr(p, &new_mask); +} extern unsigned long long sched_clock(void); diff --git a/kernel/sched.c b/kernel/sched.c index 6ab0fcbf26e..521b89b0148 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5486,7 +5486,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. 
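Caller conversions are mechanical; a typical before/after, mirroring the sunrpc hunk earlier in this series, is

	/* old: a full cpumask_t (NR_CPUS/8 bytes) is copied into the callee */
	set_cpus_allowed(current, cpumask_of_cpu(cpu));

	/* new: only a pointer is passed */
	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
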
*/ -int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) { struct migration_req req; unsigned long flags; @@ -5494,23 +5494,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + if (!cpus_intersects(*new_mask, cpu_online_map)) { ret = -EINVAL; goto out; } if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, &new_mask); + p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = new_mask; - p->rt.nr_cpus_allowed = cpus_weight(new_mask); + p->cpus_allowed = *new_mask; + p->rt.nr_cpus_allowed = cpus_weight(*new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), new_mask)) + if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(new_mask), &req)) { + if (migrate_task(p, any_online_cpu(*new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -5523,7 +5523,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Move (not current) task off this cpu, onto dest cpu. We're doing diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6928ded24da..8ff824565e0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1123,7 +1123,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) +static void set_cpus_allowed_rt(struct task_struct *p, + const cpumask_t *new_mask) { int weight = cpus_weight(*new_mask); -- cgit v1.2.3-70-g09d2 From 112f53f5d700589de741dca67c77439e96ea94a7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Mar 2008 11:43:36 +0100 Subject: sched: old sleeper bonus Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- kernel/sched_fair.c | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 521b89b0148..070eefdd90f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -680,6 +680,7 @@ enum { SCHED_FEAT_SYNC_WAKEUPS = 32, SCHED_FEAT_HRTICK = 64, SCHED_FEAT_DOUBLE_TICK = 128, + SCHED_FEAT_NORMALIZED_SLEEPER = 256, }; const_debug unsigned int sysctl_sched_features = @@ -690,7 +691,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_CACHE_HOT_BUDDY * 1 | SCHED_FEAT_SYNC_WAKEUPS * 1 | SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0; + SCHED_FEAT_DOUBLE_TICK * 0 | + SCHED_FEAT_NORMALIZED_SLEEPER * 1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 290cf770b71..022e036f2c3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -501,8 +501,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS)) { - vruntime -= calc_delta_fair(sysctl_sched_latency, - &cfs_rq->load); + if (sched_feat(NORMALIZED_SLEEPER)) + vruntime -= calc_delta_fair(sysctl_sched_latency, + &cfs_rq->load); + else + vruntime -= sysctl_sched_latency; } /* ensure we never gain time by being placed backwards. 
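As a rough worked example of the new NORMALIZED_SLEEPER branch (illustrative numbers only): calc_delta_fair(sysctl_sched_latency, &cfs_rq->load) scales the latency target by NICE_0_LOAD over the runqueue's load weight, so with, say, a 20 ms latency target and four runnable nice-0 tasks (load.weight about 4 * 1024) the sleeper credit subtracted from the waking task's vruntime is about 20 ms * 1024 / 4096 = 5 ms, where the non-normalized branch would subtract the full 20 ms.
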
*/ -- cgit v1.2.3-70-g09d2 From ea736ed5d353d7a3aa1cf8ce4cf8d947bc353fb2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Mar 2008 13:51:45 +0100 Subject: sched: fix checks Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 070eefdd90f..62830eaec52 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -163,10 +163,11 @@ struct rt_prio_array { }; struct rt_bandwidth { - ktime_t rt_period; - u64 rt_runtime; - spinlock_t rt_runtime_lock; - struct hrtimer rt_period_timer; + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; }; static struct rt_bandwidth def_rt_bandwidth; @@ -403,6 +404,7 @@ struct rt_rq { int rt_throttled; u64 rt_time; u64 rt_runtime; + /* Nests inside the rq lock: */ spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.2.3-70-g09d2 From 354d60c2ff72d86627dfe2089d186824abf4bb8e Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Sat, 19 Apr 2008 19:44:59 +0200 Subject: sched: mix tasks and groups This patch allows tasks and groups to exist in the same cfs_rq. With this change the CFS group scheduling follows a 1/(M+N) model from a 1/(1+N) fairness model where M tasks and N groups exist at the cfs_rq level. [a.p.zijlstra@chello.nl: rt bits and assorted fixes] Signed-off-by: Dhaval Giani Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched_rt.c | 15 +++++++++------ 3 files changed, 103 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 62830eaec52..1b7399dfa36 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -273,6 +273,7 @@ struct task_group { struct list_head list; }; +#ifdef CONFIG_USER_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); @@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif +#endif /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; + /* se could be NULL for init_task_group */ + if (!se) + return; + se->cfs_rq = &rq->cfs; se->my_q = cfs_rq; se->load.weight = tg->shares; @@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; + if (!rt_se) + return; + rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; rt_se->parent = NULL; @@ -7539,18 +7548,56 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. 
This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). + */ + init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1); +#elif defined CONFIG_USER_SCHED + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ init_tg_cfs_entry(rq, &init_task_group, &per_cpu(init_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1); #endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1); +#elif defined CONFIG_USER_SCHED init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), &per_cpu(init_sched_rt_entity, i), i, 1); -#else - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#endif #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 022e036f2c3..3dde0f0ec93 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) return 0; } +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; + int se_depth, pse_depth; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. 
+ */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(se); + pse_depth = depth_se(pse); + + while (se_depth > pse_depth) { + se_depth--; + se = parent_entity(se); + } + + while (pse_depth > se_depth) { + pse_depth--; + pse = parent_entity(pse); + } + while (!is_same_group(se, pse)) { se = parent_entity(se); pse = parent_entity(pse); @@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) static struct task_struct * __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) { - struct task_struct *p; + struct task_struct *p = NULL; + struct sched_entity *se; if (!curr) return NULL; - p = rb_entry(curr, struct task_struct, se.run_node); - cfs_rq->rb_load_balance_curr = rb_next(curr); + /* Skip over entities that are not tasks */ + do { + se = rb_entry(curr, struct sched_entity, run_node); + curr = rb_next(curr); + } while (curr && !entity_is_task(se)); + + cfs_rq->rb_load_balance_curr = curr; + + if (entity_is_task(se)) + p = task_of(se); return p; } @@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; -#ifdef CONFIG_FAIR_GROUP_SCHED - print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); -#endif rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8ff824565e0..201a69382a4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); - spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); - spin_unlock(&rt_rq->rt_runtime_lock); + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); + } } static inline @@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) * entries, we must remove entries top - down. * * XXX: O(1/2 h^2) because we can only walk up, not down the chain. - * doesn't matter much for now, as h=2 for GROUP_SCHED. */ static void dequeue_rt_stack(struct task_struct *p) { -- cgit v1.2.3-70-g09d2 From ec7dc8ac73e4a56ed03b673f026f08c0d547f597 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Sat, 19 Apr 2008 19:44:59 +0200 Subject: sched: allow the group scheduler to have multiple levels This patch makes the group scheduler multi hierarchy aware. 
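To make the reparented interface concrete, here is a rough sketch (not part of the patch) of how a caller can now build a two-level hierarchy; the variable names and the early-return error handling are illustrative, while sched_create_group() and init_task_group are the real symbols touched by the diff below:

	struct task_group *parent_tg, *child_tg;

	/* first level: a child of the root/init group, much as the
	 * cpu_cgroup_create() hunk below does via cgroup_tg(cgrp->parent) */
	parent_tg = sched_create_group(&init_task_group);
	if (IS_ERR(parent_tg))
		return PTR_ERR(parent_tg);

	/* second level: a group nested under the one just created,
	 * something the removed one-level-deep check used to reject */
	child_tg = sched_create_group(parent_tg);
	if (IS_ERR(child_tg))
		return PTR_ERR(child_tg);
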
[a.p.zijlstra@chello.nl: rt-parts and assorted fixes] Signed-off-by: Dhaval Giani Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 85 ++++++++++++++++++++++++++++++++------------------- kernel/user.c | 2 +- 3 files changed, 55 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 79c025c3b62..fa14781747c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2052,7 +2052,7 @@ extern void normalize_rt_tasks(void); extern struct task_group init_task_group; -extern struct task_group *sched_create_group(void); +extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched.c b/kernel/sched.c index 1b7399dfa36..f9c8da798bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7438,10 +7438,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, - struct cfs_rq *cfs_rq, struct sched_entity *se, - int cpu, int add) +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) { + struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; @@ -7453,19 +7454,25 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, if (!se) return; - se->cfs_rq = &rq->cfs; + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + se->my_q = cfs_rq; se->load.weight = tg->shares; se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); - se->parent = NULL; + se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, - struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, - int cpu, int add) +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) { + struct rq *rq = cpu_rq(cpu); + tg->rt_rq[cpu] = rt_rq; init_rt_rq(rt_rq, rq); rt_rq->tg = tg; @@ -7478,9 +7485,14 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, if (!rt_se) return; + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; - rt_se->parent = NULL; + rt_se->parent = parent; INIT_LIST_HEAD(&rt_se->run_list); } #endif @@ -7568,7 +7580,7 @@ void __init sched_init(void) * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ - init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1); + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED /* * In case of task-groups formed thr' the user id of tasks, @@ -7581,9 +7593,9 @@ void __init sched_init(void) * (init_cfs_rq) and having one entity represent this group of * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 
*/ - init_tg_cfs_entry(rq, &init_task_group, + init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1); + &per_cpu(init_sched_entity, i), i, 1, NULL); #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7592,11 +7604,11 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); #ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1); + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED - init_tg_rt_entry(rq, &init_task_group, + init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1); + &per_cpu(init_sched_rt_entity, i), i, 1, NULL); #endif #endif @@ -7798,10 +7810,11 @@ static void free_fair_sched_group(struct task_group *tg) kfree(tg->se); } -static int alloc_fair_sched_group(struct task_group *tg) +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se; + struct sched_entity *se, *parent_se; struct rq *rq; int i; @@ -7827,7 +7840,8 @@ static int alloc_fair_sched_group(struct task_group *tg) if (!se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + parent_se = parent ? parent->se[i] : NULL; + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); } return 1; @@ -7851,7 +7865,8 @@ static inline void free_fair_sched_group(struct task_group *tg) { } -static inline int alloc_fair_sched_group(struct task_group *tg) +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7883,10 +7898,11 @@ static void free_rt_sched_group(struct task_group *tg) kfree(tg->rt_se); } -static int alloc_rt_sched_group(struct task_group *tg) +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; + struct sched_rt_entity *rt_se, *parent_se; struct rq *rq; int i; @@ -7913,7 +7929,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!rt_se) goto err; - init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); + parent_se = parent ? parent->rt_se[i] : NULL; + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); } return 1; @@ -7937,7 +7954,8 @@ static inline void free_rt_sched_group(struct task_group *tg) { } -static inline int alloc_rt_sched_group(struct task_group *tg) +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7960,7 +7978,7 @@ static void free_sched_group(struct task_group *tg) } /* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; @@ -7970,10 +7988,10 @@ struct task_group *sched_create_group(void) if (!tg) return ERR_PTR(-ENOMEM); - if (!alloc_fair_sched_group(tg)) + if (!alloc_fair_sched_group(tg, parent)) goto err; - if (!alloc_rt_sched_group(tg)) + if (!alloc_rt_sched_group(tg, parent)) goto err; spin_lock_irqsave(&task_group_lock, flags); @@ -8084,6 +8102,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) int i; unsigned long flags; + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + /* * A weight of 0 or 1 can cause arithmetics problems. 
* (The default weight is 1024 - so there's no practical @@ -8327,7 +8351,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) static struct cgroup_subsys_state * cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct task_group *tg; + struct task_group *tg, *parent; if (!cgrp->parent) { /* This is early initialization for the top cgroup */ @@ -8335,11 +8359,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &init_task_group.css; } - /* we support only 1-level deep hierarchical scheduler atm */ - if (cgrp->parent->parent) - return ERR_PTR(-EINVAL); - - tg = sched_create_group(); + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); diff --git a/kernel/user.c b/kernel/user.c index 5925c6887c1..a28d9f99246 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) { int rc = 0; - up->tg = sched_create_group(); + up->tg = sched_create_group(NULL); if (IS_ERR(up->tg)) rc = -ENOMEM; -- cgit v1.2.3-70-g09d2 From eff766a65c60237bfa865160c3129de31fab591b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fix the task_group hierarchy for UID grouping UID grouping doesn't actually have a task_group representing the root of the task_group tree. Add one. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++-- kernel/user.c | 2 +- 3 files changed, 45 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index fa14781747c..ada24022d23 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2051,6 +2051,9 @@ extern void normalize_rt_tasks(void); #ifdef CONFIG_GROUP_SCHED extern struct task_group init_task_group; +#ifdef CONFIG_USER_SCHED +extern struct task_group root_task_group; +#endif extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); diff --git a/kernel/sched.c b/kernel/sched.c index f9c8da798bb..e03b45ccf78 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -274,6 +274,14 @@ struct task_group { }; #ifdef CONFIG_USER_SCHED + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. 
+ */ +struct task_group root_task_group; + #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); @@ -285,6 +293,8 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif +#else +#define root_task_group init_task_group #endif /* task_group_lock serializes add/remove of task groups and also changes to @@ -7507,6 +7517,9 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; #endif /* * As sched_init() is called before page_alloc is setup, @@ -7521,12 +7534,29 @@ void __init sched_init(void) init_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif #endif #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif #endif } @@ -7540,6 +7570,10 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&init_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif #endif #ifdef CONFIG_GROUP_SCHED @@ -7582,6 +7616,8 @@ void __init sched_init(void) */ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); /* * In case of task-groups formed thr' the user id of tasks, * init_task_group represents tasks belonging to root user. 
@@ -7595,7 +7631,8 @@ void __init sched_init(void) */ init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1, NULL); + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7606,9 +7643,11 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1, NULL); + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); #endif #endif diff --git a/kernel/user.c b/kernel/user.c index a28d9f99246..debce602bfd 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) { int rc = 0; - up->tg = sched_create_group(NULL); + up->tg = sched_create_group(&root_task_group); if (IS_ERR(up->tg)) rc = -ENOMEM; -- cgit v1.2.3-70-g09d2 From f473aa5e025bc8e0c5fe9352f65178a54adadec2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: task_group hierarchy Add the full parent<->child relation thing into task_groups as well. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e03b45ccf78..debb06a4a66 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -271,6 +271,10 @@ struct task_group { struct rcu_head rcu; struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; }; #ifdef CONFIG_USER_SCHED @@ -7578,6 +7582,13 @@ void __init sched_init(void) #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); + INIT_LIST_HEAD(&init_task_group.children); + +#ifdef CONFIG_USER_SCHED + INIT_LIST_HEAD(&root_task_group.children); + init_task_group.parent = &root_task_group; + list_add(&init_task_group.siblings, &root_task_group.children); +#endif #endif for_each_possible_cpu(i) { @@ -8039,6 +8050,12 @@ struct task_group *sched_create_group(struct task_group *parent) register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + list_add_rcu(&tg->siblings, &parent->children); + INIT_LIST_HEAD(&tg->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -8067,6 +8084,7 @@ void sched_destroy_group(struct task_group *tg) unregister_rt_sched_group(tg, i); } list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); /* wait for possible concurrent references to cfs_rqs complete */ @@ -8162,6 +8180,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); + list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ @@ -8182,6 +8201,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) register_fair_sched_group(tg, i); + list_add_rcu(&tg->siblings, &tg->parent->children); spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); -- cgit v1.2.3-70-g09d2 From 
b40b2e8eb52192a8a22d707ed37925792b7bdfd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: rt: multi level group constraints multi level rt constraints Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index debb06a4a66..475e3fcab73 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8228,6 +8228,38 @@ static unsigned long to_ratio(u64 period, u64 runtime) return div64_64(runtime << 16, period); } +#ifdef CONFIG_CGROUP_SCHED +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct task_group *tgi, *parent = tg->parent; + unsigned long total = 0; + + if (!parent) { + if (global_rt_period() < period) + return 0; + + return to_ratio(period, runtime) < + to_ratio(global_rt_period(), global_rt_runtime()); + } + + if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &parent->children, siblings) { + if (tgi == tg) + continue; + + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); + } + rcu_read_unlock(); + + return total + to_ratio(period, runtime) < + to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), + parent->rt_bandwidth.rt_runtime); +} +#elif defined CONFIG_USER_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; @@ -8247,6 +8279,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) return total + to_ratio(period, runtime) < global_ratio; } +#endif /* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) -- cgit v1.2.3-70-g09d2 From b758149c02638146a835f42097dd1950a6cae638 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: prepatory code movement Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 190 +++++++++++++++++++++++++--------------------------- 1 file changed, 92 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3dde0f0ec93..de4250c53a1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -77,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; * CFS operations on generic schedulable entities: */ +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -88,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on + * another cpu ('this_cpu') + */ +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int 
this_cpu) +{ + return cfs_rq->tg->cfs_rq[this_cpu]; +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + if (se->cfs_rq == pse->cfs_rq) + return 1; + + return 0; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -97,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) #define entity_is_task(se) 1 -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#define for_each_sched_entity(se) \ + for (; se; se = NULL) -static inline struct task_struct *task_of(struct sched_entity *se) +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) { - return container_of(se, struct task_struct, se); + return &task_rq(p)->cfs; } +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return &cpu_rq(this_cpu)->cfs; +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + return 1; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -699,101 +788,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * CFS operations on tasks: */ -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return cfs_rq->tg->cfs_rq[this_cpu]; -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) entities belong to the same group ? 
*/ -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - if (se->cfs_rq == pse->cfs_rq) - return 1; - - return 0; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return se->parent; -} - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return NULL; -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { -- cgit v1.2.3-70-g09d2 From 1d3504fcf5606579d60b649d19f44b3871c1ddae Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 15 Apr 2008 14:04:23 +0900 Subject: sched, cpuset: customize sched domains, core [rebased for sched-devel/latest] - Add a new cpuset file, having levels: sched_relax_domain_level - Modify partition_sched_domains() and build_sched_domains() to take attributes parameter passed from cpuset. - Fill newidle_idx for node domains which currently unused but might be required if sched_relax_domain_level become higher. - We can change the default level by boot option 'relax_domain_level='. 
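To make the meaning of the level value concrete, here is a small standalone sketch (not kernel code; the helper name is invented) of the per-domain decision that the new set_domain_attribute() logic in the diff performs, using the SD_LV_* enum added below (SIBLING=1, MC=2, CPU=3, NODE=4, ALLNODES=5):

	/* Illustrative only: idle-CPU searching and new-idle balancing are
	 * enabled in a domain only if its level does not exceed the requested
	 * relax level; a negative request means "leave the defaults alone". */
	static int relax_allows_idle_search(int domain_level, int requested_level)
	{
		if (requested_level < 0)
			return -1;	/* no request made */
		return domain_level <= requested_level;
	}

So, for example, writing 2 to a cpuset's sched_relax_domain_level file (or booting with relax_domain_level=2) permits searching for an idle CPU in the sibling and multi-core domains, while the CPU-, node- and all-node-level domains have SD_WAKE_IDLE and SD_BALANCE_NEWIDLE cleared.
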
Signed-off-by: Hidetoshi Seto Signed-off-by: Ingo Molnar --- include/asm-ia64/topology.h | 2 +- include/asm-sh/topology.h | 2 +- include/asm-x86/topology.h | 2 +- include/linux/sched.h | 23 ++++++++++++- kernel/cpuset.c | 61 ++++++++++++++++++++++++++++++++++- kernel/sched.c | 78 ++++++++++++++++++++++++++++++++++++++++++--- kernel/sched_fair.c | 4 ++- 7 files changed, 161 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index f929dde8534..f2f72ef2a89 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -93,7 +93,7 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 0, /* unused */ \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ diff --git a/include/asm-sh/topology.h b/include/asm-sh/topology.h index f402a3b1cfa..34cdb28e8f4 100644 --- a/include/asm-sh/topology.h +++ b/include/asm-sh/topology.h @@ -16,7 +16,7 @@ .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 0, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 9ef74c5d5ad..22073268b48 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -147,7 +147,7 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_NEWIDLE_IDX 0 +# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 1 #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index ada24022d23..11f47249cdd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -704,6 +704,7 @@ enum cpu_idle_type { #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ +#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ #define BALANCE_FOR_MC_POWER \ (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) @@ -733,6 +734,24 @@ struct sched_group { u32 reciprocal_cpu_power; }; +enum sched_domain_level { + SD_LV_NONE = 0, + SD_LV_SIBLING, + SD_LV_MC, + SD_LV_CPU, + SD_LV_NODE, + SD_LV_ALLNODES, + SD_LV_MAX +}; + +struct sched_domain_attr { + int relax_domain_level; +}; + +#define SD_ATTR_INIT (struct sched_domain_attr) { \ + .relax_domain_level = -1, \ +} + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -750,6 +769,7 @@ struct sched_domain { unsigned int wake_idx; unsigned int forkexec_idx; int flags; /* See SD_* */ + enum sched_domain_level level; /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. 
units in jiffies */ @@ -789,7 +809,8 @@ struct sched_domain { #endif }; -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new); extern int arch_reinit_sched_domains(void); #endif /* CONFIG_SMP */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b0c870b2ac3..8b35fbd8292 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -98,6 +98,9 @@ struct cpuset { /* partition number for rebuild_sched_domains() */ int pn; + /* for custom sched domain */ + int relax_domain_level; + /* used for walking a cpuset heirarchy */ struct list_head stack_list; }; @@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) return cpus_intersects(a->cpus_allowed, b->cpus_allowed); } +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (!dattr) + return; + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; + return; +} + /* * rebuild_sched_domains() * @@ -553,12 +566,14 @@ static void rebuild_sched_domains(void) int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ q = NULL; csa = NULL; doms = NULL; + dattr = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { @@ -566,6 +581,11 @@ static void rebuild_sched_domains(void) doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr(dattr, &top_cpuset); + } *doms = top_cpuset.cpus_allowed; goto rebuild; } @@ -622,6 +642,7 @@ restart: doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; @@ -644,12 +665,15 @@ restart: } cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; + update_domain_attr(dattr, b); } } nslot++; @@ -660,7 +684,7 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ get_online_cpus(); - partition_sched_domains(ndoms, doms); + partition_sched_domains(ndoms, doms, dattr); put_online_cpus(); done: @@ -668,6 +692,7 @@ done: kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ + /* Don't kfree(dattr) -- partition_sched_domains() does that. 
*/ } static inline int started_after_time(struct task_struct *t1, @@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) return 0; } +static int update_relax_domain_level(struct cpuset *cs, char *buf) +{ + int val = simple_strtol(buf, NULL, 10); + + if (val < 0) + val = -1; + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + rebuild_sched_domains(); + } + + return 0; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -1202,6 +1242,7 @@ typedef enum { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_SCHED_LOAD_BALANCE, + FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, @@ -1256,6 +1297,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, buffer); + break; case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); break; @@ -1354,6 +1398,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: *s++ = is_sched_load_balance(cs) ? '1' : '0'; break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + s += sprintf(s, "%d", cs->relax_domain_level); + break; case FILE_MEMORY_MIGRATE: *s++ = is_memory_migrate(cs) ? '1' : '0'; break; @@ -1424,6 +1471,13 @@ static struct cftype cft_sched_load_balance = { .private = FILE_SCHED_LOAD_BALANCE, }; +static struct cftype cft_sched_relax_domain_level = { + .name = "sched_relax_domain_level", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, +}; + static struct cftype cft_memory_migrate = { .name = "memory_migrate", .read = cpuset_common_file_read, @@ -1475,6 +1529,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) return err; if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) return err; + if ((err = cgroup_add_file(cont, ss, + &cft_sched_relax_domain_level)) < 0) + return err; if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) return err; if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) @@ -1559,6 +1616,7 @@ static struct cgroup_subsys_state *cpuset_create( nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; cs->parent = parent; number_of_cpusets++; @@ -1631,6 +1689,7 @@ int __init cpuset_init(void) fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); + top_cpuset.relax_domain_level = -1; err = register_filesystem(&cpuset_fs_type); if (err < 0) diff --git a/kernel/sched.c b/kernel/sched.c index 475e3fcab73..62d7481caca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6771,6 +6771,7 @@ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ } SD_INIT_FUNC(CPU) @@ -6819,11 +6820,42 @@ struct allmasks { #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ ((unsigned long)(a) + offsetof(struct allmasks, v)) +static int default_relax_domain_level = -1; + +static int __init setup_relax_domain_level(char *str) +{ + default_relax_domain_level = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("relax_domain_level=", 
setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + } +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int build_sched_domains(const cpumask_t *cpu_map) +static int __build_sched_domains(const cpumask_t *cpu_map, + struct sched_domain_attr *attr) { int i; struct root_domain *rd; @@ -6887,6 +6919,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); sd->span = *cpu_map; cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; @@ -6896,6 +6929,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); sd->parent = p; if (p) @@ -6906,6 +6940,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); sd->span = *nodemask; sd->parent = p; if (p) @@ -6916,6 +6951,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); + set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; @@ -6927,6 +6963,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(cpu_domains, i); SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; @@ -7124,8 +7161,15 @@ error: #endif } +static int build_sched_domains(const cpumask_t *cpu_map) +{ + return __build_sched_domains(cpu_map, NULL); +} + static cpumask_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; /* attribues of custom domains + in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of @@ -7153,6 +7197,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@ -7182,6 +7227,22 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) arch_destroy_sched_domains(cpu_map, &tmpmask); } +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + /* * Partition sched domains as specified by the 'ndoms_new' * cpumasks in the array doms_new[] of cpumasks. 
This compares @@ -7203,7 +7264,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new) { int i, j; @@ -7216,12 +7278,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; } /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < ndoms_new; j++) { - if (cpus_equal(doms_cur[i], doms_new[j])) + if (cpus_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ @@ -7233,11 +7297,13 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j])) + if (cpus_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - build_sched_domains(doms_new + i); + __build_sched_domains(doms_new + i, + dattr_new ? dattr_new + i : NULL); match2: ; } @@ -7245,7 +7311,9 @@ match2: /* Remember the new sched domains */ if (doms_cur != &fallback_doms) kfree(doms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; + dattr_cur = dattr_new; ndoms_cur = ndoms_new; register_sched_domain_sysctl(); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index de4250c53a1..b43748efaa7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -940,7 +940,9 @@ static int wake_idle(int cpu, struct task_struct *p) return cpu; for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_IDLE) { + if ((sd->flags & SD_WAKE_IDLE) + || ((sd->flags & SD_WAKE_IDLE_FAR) + && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) { -- cgit v1.2.3-70-g09d2 From 18d95a2832c1392a2d63227a7a6d433cb9f2037e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fair-group: SMP-nice for group scheduling Implement SMP nice support for the full group hierarchy. On each load-balance action, compile a sched_domain wide view of the full task_group tree. We compute the domain wide view when walking down the hierarchy, and readjust the weights when walking back up. After collecting and readjusting the domain wide view, we try to balance the tasks within the task_groups. The current approach is a naively balance each task group until we've moved the targeted amount of load. Inspired by Srivatsa Vaddsgiri's previous code and Abhishek Chandra's H-SMP paper. XXX: there will be some numerical issues due to the limited nature of SCHED_LOAD_SCALE wrt to representing a task_groups influence on the total weight. When the tree is deep enough, or the task weight small enough, we'll run out of bits. 
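As a rough numeric illustration of the aggregate share distribution introduced below (the helper name and the numbers are invented; the formula mirrors __update_group_shares_cpu() in the diff):

	/* shares[cpu] = group_shares * rq_weight[cpu] / sum(rq_weight) */
	static unsigned long cpu_shares(unsigned long group_shares,
					unsigned long rq_weight_cpu,
					unsigned long rq_weight_sum)
	{
		/* the +1 guards against an empty domain, as in the patch */
		return group_shares * rq_weight_cpu / (rq_weight_sum + 1);
	}

With group_shares = 1024 and per-cpu runqueue weights of 2048 and 1024 across a two-CPU balance domain, the group's weight is split roughly 682/341, so the busier CPU carries the larger part of the group's weight.
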
Signed-off-by: Peter Zijlstra CC: Abhishek Chandra CC: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 497 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched_fair.c | 124 ++++++++----- kernel/sched_rt.c | 4 + 4 files changed, 548 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 11f47249cdd..0a32059e6ed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -758,6 +758,7 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ + int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index 62d7481caca..ae1a3e936d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +#define MIN_SHARES 2 + static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -403,6 +405,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. 
+ * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. + */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. + */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Redistribute tg->shares amongst all tg->cfs_rq[]s. + */ +static void __aggregate_redistribute_shares(struct task_group *tg) +{ + int i, max_cpu = smp_processor_id(); + unsigned long rq_weight = 0; + unsigned long shares, max_shares = 0, shares_rem = tg->shares; + + for_each_possible_cpu(i) + rq_weight += tg->cfs_rq[i]->load.weight; + + for_each_possible_cpu(i) { + /* + * divide shares proportional to the rq_weights. + */ + shares = tg->shares * tg->cfs_rq[i]->load.weight; + shares /= rq_weight + 1; + + tg->cfs_rq[i]->shares = shares; + + if (shares > max_shares) { + max_shares = shares; + max_cpu = i; + } + shares_rem -= shares; + } + + /* + * Ensure it all adds up to tg->shares; we can loose a few + * due to rounding down when computing the per-cpu shares. + */ + if (shares_rem) + tg->cfs_rq[max_cpu]->shares += shares_rem; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + +again: + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + /* + * When the span doesn't have any shares assigned, but does have + * tasks to run do a machine wide rebalance (should be rare). + */ + if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) { + __aggregate_redistribute_shares(tg); + goto again; + } + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. 
+ */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. + */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. + */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. 
+ */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. + */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + +#else /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +} +#endif + #endif /* CONFIG_SMP */ #include "sched_stats.h" @@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define sched_class_highest (&rt_sched_class) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; + int unlock_aggregate; cpus_setall(*cpus); + unlock_aggregate = get_aggregate(sd); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. 
In this case, @@ -3303,8 +3712,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3319,8 +3729,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; + sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7633,6 +8050,7 @@ void __init sched_init(void) } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif @@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk) #endif #ifdef CONFIG_FAIR_GROUP_SCHED -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); - on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); +} - spin_unlock_irq(&rq->lock); +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * (The default weight is 1024 - so there's no practical * limitation 
from this.) */ - if (shares < 2) - shares = 2; + if (shares < MIN_SHARES) + shares = MIN_SHARES; mutex_lock(&shares_mutex); if (tg->shares == shares) @@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) - set_se_shares(tg->se[i], shares); + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); + set_se_shares(tg->se[i], shares/nr_cpu_ids); + } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b43748efaa7..b89fec93a23 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; } @@ -504,6 +521,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; } @@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); + struct rq_iterator cfs_rq_iterator; - p = task_of(curr); + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, 
list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; + + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) + continue; + + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + imbalance = (busiest_weight - this_weight) / 2; + + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + move_group_shares(tg, sd, busiest_cpu, this_cpu); - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 201a69382a4..736fb8fd897 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) */ for_each_sched_rt_entity(rt_se) enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) if (rt_rq && rt_rq->rt_nr_running) enqueue_rt_entity(rt_se); } + + dec_cpu_load(rq, p->se.load.weight); } /* -- cgit v1.2.3-70-g09d2 From d19ca30874f2ad343d054e0b5c0576744afeecd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: debug: add some debug code to handle the full hierarchy Add some extra debug output so we can get a better overview of the full hierarchy. We print the cgroup path after each cfs_rq, so we can see what group we're looking at. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3d09106990c..be42548b67b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif + +#ifdef CONFIG_CGROUP_SCHED + { + char path[64]; + + cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + SEQ_printf(m, " %s", path); + } +#endif + SEQ_printf(m, "\n"); } static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) @@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; - SEQ_printf(m, "\ncfs_rq\n"); +#ifndef CONFIG_CGROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#else + char path[128] = ""; + struct cgroup *cgroup = NULL; + struct task_group *tg = cfs_rq->tg; + + if (tg) + cgroup = tg->css.cgroup; + + if (cgroup) + cgroup_path(cgroup, path, sizeof(path)); + + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); +#endif +#endif } static void print_cpu(struct seq_file *m, int cpu) -- cgit v1.2.3-70-g09d2 From 58d6c2d72f8628f39e8689fbde8aa177fcf00a37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: rt-group: optimize dequeue_rt_stack Now that the group hierarchy can have an arbitrary depth the O(n^2) nature of RT task dequeues will really hurt. Optimize this by providing space to store the tree path, so we can walk it the other way. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched_rt.c | 27 +++++++++++---------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0a32059e6ed..887f5db8942 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1005,6 +1005,7 @@ struct sched_rt_entity { unsigned long timeout; int nr_cpus_allowed; + struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 736fb8fd897..c2730a5a4f0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -479,26 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) /* * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. - * - * XXX: O(1/2 h^2) because we can only walk up, not down the chain. 
*/ static void dequeue_rt_stack(struct task_struct *p) { - struct sched_rt_entity *rt_se, *top_se; + struct sched_rt_entity *rt_se, *back = NULL; - /* - * dequeue all, top - down. - */ - do { - rt_se = &p->rt; - top_se = NULL; - for_each_sched_rt_entity(rt_se) { - if (on_rt_rq(rt_se)) - top_se = rt_se; - } - if (top_se) - dequeue_rt_entity(top_se); - } while (top_se); + rt_se = &p->rt; + for_each_sched_rt_entity(rt_se) { + rt_se->back = back; + back = rt_se; + } + + for (rt_se = back; rt_se; rt_se = rt_se->back) { + if (on_rt_rq(rt_se)) + dequeue_rt_entity(rt_se); + } } /* -- cgit v1.2.3-70-g09d2 From ac884dec6d4a7df252150af875cffddf8f1d9c15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fair-group scheduling vs latency Currently FAIR_GROUP sched grows the scheduler latency outside of sysctl_sched_latency, invert this so it stays within. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 44 +++++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b89fec93a23..9e301a2bab6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -362,29 +362,47 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_mine(__sched_period(cfs_rq->nr_running), - se->load.weight, &cfs_rq->load); + u64 slice = __sched_period(cfs_rq->nr_running); + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); + } + + + return slice; } /* - * We calculate the vruntime slice. + * We calculate the vruntime slice of a to be inserted task * * vs = s/w = p/rw */ -static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 vslice = __sched_period(nr_running); + unsigned long nr_running = cfs_rq->nr_running; + unsigned long weight; + u64 vslice; - vslice *= NICE_0_LOAD; - do_div(vslice, rq_weight); + if (!se->on_rq) + nr_running++; - return vslice; -} + vslice = __sched_period(nr_running); -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return __sched_vslice(cfs_rq->load.weight + se->load.weight, - cfs_rq->nr_running + 1); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + weight = cfs_rq->load.weight; + if (!se->on_rq) + weight += se->load.weight; + + vslice *= NICE_0_LOAD; + do_div(vslice, weight); + } + + return vslice; } /* -- cgit v1.2.3-70-g09d2 From 4a55bd5e97b1775913f88f11108a4f144f590e89 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fair-group: de-couple load-balancing from the rb-trees De-couple load-balancing from the rb-trees, so that I can change their organization. 
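A minimal userspace sketch of the idea (names and layout are illustrative, not the kernel's): each entity carries a plain list link next to its rb-tree node, the list is maintained on enqueue/dequeue, and the balance iterator walks that list while skipping group entities, so the balancer never has to touch the tree at all.

#include <stdio.h>
#include <stddef.h>

struct entity {
        const char *name;
        int is_task;
        struct entity *next;    /* stand-in for the kernel's group_node list link */
};

struct run_queue {
        struct entity *tasks;   /* balance list, maintained on (de)queue */
};

static void rq_enqueue(struct run_queue *rq, struct entity *e)
{
        e->next = rq->tasks;    /* the rb-tree insertion happens elsewhere */
        rq->tasks = e;
}

/* iterator used by the balancer: skip entities that are not tasks */
static struct entity *balance_next(struct entity *e)
{
        while (e && !e->is_task)
                e = e->next;
        return e;
}

int main(void)
{
        struct run_queue rq = { NULL };
        struct entity group = { "group", 0, NULL };
        struct entity t1 = { "t1", 1, NULL }, t2 = { "t2", 1, NULL };
        struct entity *e;

        rq_enqueue(&rq, &t1);
        rq_enqueue(&rq, &group);
        rq_enqueue(&rq, &t2);

        for (e = balance_next(rq.tasks); e; e = balance_next(e->next))
                printf("balance sees %s\n", e->name);
        return 0;
}

The actual patch keeps a list_head (group_node) per sched_entity plus a per-cfs_rq balance_iterator cursor, and feeds it to the existing balance_tasks() machinery.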
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 3 +++ include/linux/sched.h | 1 + kernel/sched.c | 10 ++++++++-- kernel/sched_fair.c | 21 +++++++++++++-------- 4 files changed, 25 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f74e1d7415..37a6f5bc4a9 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -151,6 +151,9 @@ extern struct group_info init_groups; .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ + .se = { \ + .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ + }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ .time_slice = HZ, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 887f5db8942..be6914014c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -946,6 +946,7 @@ struct load_weight { struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; + struct list_head group_node; unsigned int on_rq; u64 exec_start; diff --git a/kernel/sched.c b/kernel/sched.c index ae1a3e936d2..3202462109f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -384,8 +384,12 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; - /* 'curr' points to currently running entity on this cfs_rq. + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr, *next; @@ -2525,6 +2529,7 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -7898,6 +7903,7 @@ int in_sched_functions(unsigned long addr) static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9e301a2bab6..ed8ce329899 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -533,6 +533,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -545,6 +546,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1289,21 +1291,24 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * the current task: */ static struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) +__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) { struct task_struct *p = NULL; struct sched_entity *se; - if (!curr) + if (next == &cfs_rq->tasks) return NULL; /* Skip over entities that are not tasks */ do { - se = rb_entry(curr, struct sched_entity, run_node); - curr = rb_next(curr); - } while (curr && !entity_is_task(se)); + se = list_entry(next, struct sched_entity, group_node); + next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); - cfs_rq->rb_load_balance_curr = curr; + if (next == 
&cfs_rq->tasks) + return NULL; + + cfs_rq->balance_iterator = next; if (entity_is_task(se)) p = task_of(se); @@ -1315,14 +1320,14 @@ static struct task_struct *load_balance_start_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); + return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); } static struct task_struct *load_balance_next_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); + return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } static unsigned long -- cgit v1.2.3-70-g09d2 From 8f1bc385cfbab474db6c27b5af1e439614f3025c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fair: weight calculations In order to level the hierarchy, we need to calculate load based on the root view. That is, each task's load is in the same unit. A / \ B 1 / \ 2 3 To compute 1's load we do: weight(1) -------------- rq_weight(A) To compute 2's load we do: weight(2) weight(B) ------------ * ----------- rq_weight(B) rw_weight(A) This yields load fractions in comparable units. The consequence is that it changes virtual time. We used to have: time_{i} vtime_{i} = ------------ weight_{i} vtime = \Sum vtime_{i} = time / rq_weight. But with the new way of load calculation we get that vtime equals time. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++--- kernel/sched_fair.c | 95 ++++++++++++++++++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3202462109f..6d55dfc56ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1320,6 +1320,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +/* + * delta *= weight / lw + */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) @@ -1342,12 +1345,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) -{ - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); -} - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ed8ce329899..d72e8b41b3e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, } #endif +/* + * delta *= w / rw + */ +static inline unsigned long +calc_delta_weight(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + se->load.weight, &cfs_rq_of(se)->load); + } + + return delta; +} + +/* + * delta *= rw / w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, &se->load); + } + + return delta; +} + /* * The idea is to set a period in which each task runs once. 
* @@ -362,47 +390,54 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - } - - - return slice; + return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s/w = p/rw + * vs = s*rw/w = p */ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long nr_running = cfs_rq->nr_running; - unsigned long weight; - u64 vslice; if (!se->on_rq) nr_running++; - vslice = __sched_period(nr_running); + return __sched_period(nr_running); +} + +/* + * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in + * that it favours >=0 over <0. + * + * -20 | + * | + * 0 --------+------- + * .' + * 19 .' + * + */ +static unsigned long +calc_delta_asym(unsigned long delta, struct sched_entity *se) +{ + struct load_weight lw = { + .weight = NICE_0_LOAD, + .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) + }; for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct load_weight *se_lw = &se->load; - weight = cfs_rq->load.weight; - if (!se->on_rq) - weight += se->load.weight; + if (se->load.weight < NICE_0_LOAD) + se_lw = &lw; - vslice *= NICE_0_LOAD; - do_div(vslice, weight); + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, se_lw); } - return vslice; + return delta; } /* @@ -419,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = delta_exec; - if (unlikely(curr->load.weight != NICE_0_LOAD)) { - delta_exec_weighted = calc_delta_fair(delta_exec_weighted, - &curr->load); - } + delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; } @@ -632,8 +663,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS)) { if (sched_feat(NORMALIZED_SLEEPER)) - vruntime -= calc_delta_fair(sysctl_sched_latency, - &cfs_rq->load); + vruntime -= calc_delta_weight(sysctl_sched_latency, se); else vruntime -= sysctl_sched_latency; } @@ -1132,11 +1162,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) unsigned long gran = sysctl_sched_wakeup_granularity; /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. + * More easily preempt - nice tasks, while not making it harder for + * + nice tasks. */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); return gran; } -- cgit v1.2.3-70-g09d2 From 7ba2e74ab5a0518bc953042952dd165724bc70c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: debug: show a weight tree Print a tree of weights. 
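The printout added below shows, per entity, its raw weight, a T (task) / G (group) marker and its effective weight scaled to SCHED_LOAD_SCALE (1024), indented by hierarchy depth. Purely as an illustration with made-up weights (rounding ignored), a root queue holding a 2048-weight group with two 1024-weight tasks, plus a single 1024-weight task at the root, would print something like:

Weight tree:
 2048 G 682
  1024 T 341
  1024 T 341
 1024 T 341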
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d72e8b41b3e..89fa32b4edf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1611,6 +1611,30 @@ static const struct sched_class fair_sched_class = { }; #ifdef CONFIG_SCHED_DEBUG +static void +print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) +{ + struct sched_entity *se; + + if (!cfs_rq) + return; + + list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { + int i; + + for (i = depth; i; i--) + seq_puts(m, " "); + + seq_printf(m, "%lu %s %lu\n", + se->load.weight, + entity_is_task(se) ? "T" : "G", + calc_delta_weight(SCHED_LOAD_SCALE, se) + ); + if (!entity_is_task(se)) + print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); + } +} + static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; @@ -1618,6 +1642,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); + + seq_printf(m, "\nWeight tree:\n"); + print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); rcu_read_unlock(); } #endif -- cgit v1.2.3-70-g09d2 From 06379aba522ebdabca37446ea988a23c43c03c67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 19 Apr 2008 09:25:58 +0200 Subject: sched: add SCHED_FEAT_DEADLINE unused at the moment. Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6d55dfc56ca..8f03817e7dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -742,6 +742,7 @@ enum { SCHED_FEAT_HRTICK = 64, SCHED_FEAT_DOUBLE_TICK = 128, SCHED_FEAT_NORMALIZED_SLEEPER = 256, + SCHED_FEAT_DEADLINE = 512, }; const_debug unsigned int sysctl_sched_features = @@ -753,7 +754,8 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_SYNC_WAKEUPS * 1 | SCHED_FEAT_HRTICK * 1 | SCHED_FEAT_DOUBLE_TICK * 0 | - SCHED_FEAT_NORMALIZED_SLEEPER * 1; + SCHED_FEAT_NORMALIZED_SLEEPER * 1 | + SCHED_FEAT_DEADLINE * 1; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) -- cgit v1.2.3-70-g09d2 From f00b45c145981b43c7e7f66315ac77534c938cbf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: /debug/sched_features provide a text based interface to the scheduler features; this saves the 'user' from setting bits using decimal arithmetic. 
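The patch builds the feature enum, the default bitmask and the name table from a single list by including the same header three times with a different SCHED_FEAT() definition each time. A self-contained userspace sketch of that trick (the feature names and the NO_ prefix here are illustrative, not the kernel's exact output):

#include <stdio.h>

#define FEATURES \
        FEAT(NEW_FAIR_SLEEPERS, 1) \
        FEAT(HRTICK, 1) \
        FEAT(DOUBLE_TICK, 0)

/* pass 1: enum of bit positions */
#define FEAT(name, enabled) __FEAT_##name,
enum { FEATURES __FEAT_NR };
#undef FEAT

/* pass 2: default bitmask */
#define FEAT(name, enabled) (1UL << __FEAT_##name) * (enabled) |
static unsigned long features = FEATURES 0;
#undef FEAT

/* pass 3: name strings */
#define FEAT(name, enabled) #name,
static const char *feat_names[] = { FEATURES };
#undef FEAT

int main(void)
{
        int i;

        for (i = 0; i < __FEAT_NR; i++)
                printf("%s%s ", (features & (1UL << i)) ? "" : "NO_",
                       feat_names[i]);
        printf("\n");
        return 0;
}

The debugfs read/write handlers then only need to map names in the table to bit positions, which is exactly what sched_feat_read()/sched_feat_write() below do.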
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 162 +++++++++++++++++++++++++++++++++++++++++------- kernel/sched_features.h | 10 +++ 2 files changed, 150 insertions(+), 22 deletions(-) create mode 100644 kernel/sched_features.h (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8f03817e7dd..b59a44e1ea4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -68,6 +68,8 @@ #include #include #include +#include +#include #include #include @@ -732,32 +734,148 @@ static void update_rq_clock(struct rq *rq) /* * Debugging: various feature bits */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + enum { - SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, - SCHED_FEAT_WAKEUP_PREEMPT = 2, - SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_AFFINE_WAKEUPS = 8, - SCHED_FEAT_CACHE_HOT_BUDDY = 16, - SCHED_FEAT_SYNC_WAKEUPS = 32, - SCHED_FEAT_HRTICK = 64, - SCHED_FEAT_DOUBLE_TICK = 128, - SCHED_FEAT_NORMALIZED_SLEEPER = 256, - SCHED_FEAT_DEADLINE = 512, +#include "sched_features.h" }; +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_AFFINE_WAKEUPS * 1 | - SCHED_FEAT_CACHE_HOT_BUDDY * 1 | - SCHED_FEAT_SYNC_WAKEUPS * 1 | - SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0 | - SCHED_FEAT_NORMALIZED_SLEEPER * 1 | - SCHED_FEAT_DEADLINE * 1; - -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +__read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +int sched_feat_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +static ssize_t +sched_feat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + for (i = 0; sched_feat_names[i]; i++) { + len += strlen(sched_feat_names[i]); + len += 4; + } + + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; sched_feat_names[i]; i++) { + if (sysctl_sched_features & (1UL << i)) + r += sprintf(buf + r, "%s ", sched_feat_names[i]); + else + r += sprintf(buf + r, "no_%s ", sched_feat_names[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "no_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .read = sched_feat_read, + .write = sched_feat_write, +}; + +static __init int sched_init_debug(void) +{ + int i, j, len; + + for (i = 0; sched_feat_names[i]; i++) { + len = 
strlen(sched_feat_names[i]); + + for (j = 0; j < len; j++) { + sched_feat_names[i][j] = + tolower(sched_feat_names[i][j]); + } + } + + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) /* * Number of tasks to iterate in a single balance run. diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 00000000000..1c7283cb958 --- /dev/null +++ b/kernel/sched_features.h @@ -0,0 +1,10 @@ +SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +SCHED_FEAT(WAKEUP_PREEMPT, 1) +SCHED_FEAT(START_DEBIT, 1) +SCHED_FEAT(AFFINE_WAKEUPS, 1) +SCHED_FEAT(CACHE_HOT_BUDDY, 1) +SCHED_FEAT(SYNC_WAKEUPS, 1) +SCHED_FEAT(HRTICK, 1) +SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(NORMALIZED_SLEEPER, 1) +SCHED_FEAT(DEADLINE, 1) -- cgit v1.2.3-70-g09d2 From c24b7c524421f9ea9d9ebab55f80cfb1f3fb77a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Apr 2008 10:55:34 +0200 Subject: sched: features fix Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b59a44e1ea4..57ba7ea9b74 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -792,7 +792,7 @@ sched_feat_read(struct file *filp, char __user *ubuf, if (sysctl_sched_features & (1UL << i)) r += sprintf(buf + r, "%s ", sched_feat_names[i]); else - r += sprintf(buf + r, "no_%s ", sched_feat_names[i]); + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); } r += sprintf(buf + r, "\n"); @@ -822,7 +822,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - if (strncmp(buf, "no_", 3) == 0) { + if (strncmp(buf, "NO_", 3) == 0) { neg = 1; cmp += 3; } @@ -855,17 +855,6 @@ static struct file_operations sched_feat_fops = { static __init int sched_init_debug(void) { - int i, j, len; - - for (i = 0; sched_feat_names[i]; i++) { - len = strlen(sched_feat_names[i]); - - for (j = 0; j < len; j++) { - sched_feat_names[i][j] = - tolower(sched_feat_names[i][j]); - } - } - debugfs_create_file("sched_features", 0644, NULL, NULL, &sched_feat_fops); -- cgit v1.2.3-70-g09d2 From 486fdae21458bd9f4e125099bb3c38a4064e450e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 19 Apr 2008 12:11:10 +0200 Subject: sched: build fix Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index be42548b67b..f3f4af4b8b0 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -119,7 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#ifndef CONFIG_CGROUP_SCHED +#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #else char path[128] = ""; -- cgit v1.2.3-70-g09d2 From 884525655d07fdee9245716b998ecdc45cdd8007 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sun, 30 Mar 2008 19:50:14 +0400 Subject: PCI: clean up resource alignment management Done per Linus' request and suggestions. 
Linus has explained that better than I'll be able to explain: On Thu, Mar 27, 2008 at 10:12:10AM -0700, Linus Torvalds wrote: > Actually, before we go any further, there might be a less intrusive > alternative: add just a couple of flags to the resource flags field (we > still have something like 8 unused bits on 32-bit), and use those to > implement a generic "resource_alignment()" routine. > > Two flags would do it: > > - IORESOURCE_SIZEALIGN: size indicates alignment (regular PCI device > resources) > > - IORESOURCE_STARTALIGN: start field is alignment (PCI bus resources > during probing) > > and then the case of both flags zero (or both bits set) would actually be > "invalid", and we would also clear the IORESOURCE_STARTALIGN flag when we > actually allocate the resource (so that we don't use the "start" field as > alignment incorrectly when it no longer indicates alignment). > > That wouldn't be totally generic, but it would have the nice property of > automatically at least add sanity checking for that whole "res->start has > the odd meaning of 'alignment' during probing" and remove the need for a > new field, and it would allow us to have a generic "resource_alignment()" > routine that just gets a resource pointer. Besides, I removed IORESOURCE_BUS_HAS_VGA flag which was unused for ages. Signed-off-by: Ivan Kokshaysky Cc: Linus Torvalds Cc: Gary Hade Signed-off-by: Greg Kroah-Hartman --- drivers/pci/probe.c | 5 +++-- drivers/pci/setup-bus.c | 3 +++ drivers/pci/setup-res.c | 42 +++++++++++++++++++++++------------------- include/linux/ioport.h | 5 ++++- kernel/resource.c | 18 ++++++++++++++++++ 5 files changed, 51 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c2e99fd87fa..33d9b8bea6e 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -235,7 +235,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) res->flags |= l & ~PCI_BASE_ADDRESS_IO_MASK; } res->end = res->start + (unsigned long) sz; - res->flags |= pci_calc_resource_flags(l); + res->flags |= pci_calc_resource_flags(l) | IORESOURCE_SIZEALIGN; if (is_64bit_memory(l)) { u32 szhi, lhi; @@ -288,7 +288,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) if (sz) { res->flags = (l & IORESOURCE_ROM_ENABLE) | IORESOURCE_MEM | IORESOURCE_PREFETCH | - IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + IORESOURCE_READONLY | IORESOURCE_CACHEABLE | + IORESOURCE_SIZEALIGN; res->start = l & PCI_ROM_ADDRESS_MASK; res->end = res->start + (unsigned long) sz; } diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index f7cb8e0758b..5cf84568c9e 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -65,6 +65,7 @@ static void pbus_assign_resources_sorted(struct pci_bus *bus) res = list->res; idx = res - &list->dev->resource[0]; if (pci_assign_resource(list->dev, idx)) { + /* FIXME: get rid of this */ res->start = 0; res->end = 0; res->flags = 0; @@ -327,6 +328,7 @@ static void pbus_size_io(struct pci_bus *bus) /* Alignment of the IO window is always 4K */ b_res->start = 4096; b_res->end = b_res->start + size - 1; + b_res->flags |= IORESOURCE_STARTALIGN; } /* Calculate the size of the bus and minimal alignment which @@ -401,6 +403,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long } b_res->start = min_align; b_res->end = size + min_align - 1; + b_res->flags |= IORESOURCE_STARTALIGN; return 1; } diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 
bad509e40fb..7d35cdf4579 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -137,10 +137,16 @@ int pci_assign_resource(struct pci_dev *dev, int resno) size = res->end - res->start + 1; min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM; - /* The bridge resources are special, as their - size != alignment. Sizing routines return - required alignment in the "start" field. */ - align = (resno < PCI_BRIDGE_RESOURCES) ? size : res->start; + + align = resource_alignment(res); + if (!align) { + printk(KERN_ERR "PCI: Cannot allocate resource (bogus " + "alignment) %d [%llx:%llx] (flags %lx) of %s\n", + resno, (unsigned long long)res->start, + (unsigned long long)res->end, res->flags, + pci_name(dev)); + return -EINVAL; + } /* First, try exact prefetching match.. */ ret = pci_bus_alloc_resource(bus, res, size, align, min, @@ -164,8 +170,10 @@ int pci_assign_resource(struct pci_dev *dev, int resno) res->flags & IORESOURCE_IO ? "I/O" : "mem", resno, (unsigned long long)size, (unsigned long long)res->start, pci_name(dev)); - } else if (resno < PCI_BRIDGE_RESOURCES) { - pci_update_resource(dev, res, resno); + } else { + res->flags &= ~IORESOURCE_STARTALIGN; + if (resno < PCI_BRIDGE_RESOURCES) + pci_update_resource(dev, res, resno); } return ret; @@ -226,29 +234,25 @@ void pdev_sort_resources(struct pci_dev *dev, struct resource_list *head) if (r->flags & IORESOURCE_PCI_FIXED) continue; - r_align = r->end - r->start; - if (!(r->flags) || r->parent) continue; + + r_align = resource_alignment(r); if (!r_align) { - printk(KERN_WARNING "PCI: Ignore bogus resource %d " - "[%llx:%llx] of %s\n", + printk(KERN_WARNING "PCI: bogus alignment of resource " + "%d [%llx:%llx] (flags %lx) of %s\n", i, (unsigned long long)r->start, - (unsigned long long)r->end, pci_name(dev)); + (unsigned long long)r->end, r->flags, + pci_name(dev)); continue; } - r_align = (i < PCI_BRIDGE_RESOURCES) ? r_align + 1 : r->start; for (list = head; ; list = list->next) { resource_size_t align = 0; struct resource_list *ln = list->next; - int idx; - if (ln) { - idx = ln->res - &ln->dev->resource[0]; - align = (idx < PCI_BRIDGE_RESOURCES) ? 
- ln->res->end - ln->res->start + 1 : - ln->res->start; - } + if (ln) + align = resource_alignment(ln->res); + if (r_align > align) { tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); if (!tmp) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 605d237364d..d5d40a9f792 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -44,7 +44,9 @@ struct resource_list { #define IORESOURCE_CACHEABLE 0x00004000 #define IORESOURCE_RANGELENGTH 0x00008000 #define IORESOURCE_SHADOWABLE 0x00010000 -#define IORESOURCE_BUS_HAS_VGA 0x00080000 + +#define IORESOURCE_SIZEALIGN 0x00020000 /* size indicates alignment */ +#define IORESOURCE_STARTALIGN 0x00040000 /* start field is alignment */ #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 @@ -110,6 +112,7 @@ extern int allocate_resource(struct resource *root, struct resource *new, void *alignf_data); int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); +resource_size_t resource_alignment(struct resource *res); /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name)) diff --git a/kernel/resource.c b/kernel/resource.c index 82aea814d40..cee12cc47ca 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -486,6 +486,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t EXPORT_SYMBOL(adjust_resource); +/** + * resource_alignment - calculate resource's alignment + * @res: resource pointer + * + * Returns alignment on success, 0 (invalid alignment) on failure. + */ +resource_size_t resource_alignment(struct resource *res) +{ + switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { + case IORESOURCE_SIZEALIGN: + return res->end - res->start + 1; + case IORESOURCE_STARTALIGN: + return res->start; + default: + return 0; + } +} + /* * This is compatibility stuff for IO resources. * -- cgit v1.2.3-70-g09d2 From 833df317f9ada91488898b005f4641bb674a3bae Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 18 Apr 2008 13:38:58 -0700 Subject: clockevents: fix typo in tick-broadcast.c braodcast -> broadcast Signed-off-by: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index fdfa0c745bb..57a1f02e5ec 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -262,7 +262,7 @@ out: void tick_broadcast_on_off(unsigned long reason, int *oncpu) { if (!cpu_isset(*oncpu, cpu_online_map)) - printk(KERN_ERR "tick-braodcast: ignoring broadcast for " + printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else smp_call_function_single(*oncpu, tick_do_broadcast_on_off, -- cgit v1.2.3-70-g09d2 From 833883d9ac4cfb31c1c4419335e68e6895a05b6b Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 18 Apr 2008 13:39:00 -0700 Subject: hrtimer: reduce calls to hrtimer_get_softirq_time() It seems that hrtimer_run_queues() is calling hrtimer_get_softirq_time() more often than it needs to. This can cause frequent contention on systems with large numbers of processors/cores. With this patch, hrtimer_run_queues only calls hrtimer_get_softirq_time() if there is a pending timer in one of the hrtimer bases, and only once. This also combines hrtimer_run_queues() and the inline run_hrtimer_queue() into one function. 
[ tglx@linutronix.de: coding style ] Signed-off-by: Dimitri Sivanich Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 64 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c642ef75069..70d4adc7463 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1238,51 +1238,51 @@ void hrtimer_run_pending(void) /* * Called from hardirq context every jiffy */ -static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, - int index) +void hrtimer_run_queues(void) { struct rb_node *node; - struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + int index, gettime = 1; - if (!base->first) + if (hrtimer_hres_active()) return; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - - spin_lock(&cpu_base->lock); + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; - while ((node = base->first)) { - struct hrtimer *timer; - - timer = rb_entry(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= timer->expires.tv64) - break; - - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); + if (!base->first) continue; + + if (gettime) { + hrtimer_get_softirq_time(cpu_base); + gettime = 0; } - __run_hrtimer(timer); - } - spin_unlock(&cpu_base->lock); -} + if (base->get_softirq_time) + base->softirq_time = base->get_softirq_time(); -void hrtimer_run_queues(void) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - int i; + spin_lock(&cpu_base->lock); - if (hrtimer_hres_active()) - return; + while ((node = base->first)) { + struct hrtimer *timer; - hrtimer_get_softirq_time(cpu_base); + timer = rb_entry(node, struct hrtimer, node); + if (base->softirq_time.tv64 <= timer->expires.tv64) + break; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - run_hrtimer_queue(cpu_base, i); + if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + continue; + } + + __run_hrtimer(timer); + } + spin_unlock(&cpu_base->lock); + } } /* -- cgit v1.2.3-70-g09d2 From 259aae864ceeb2b34e7bafa1ce18d096a357fab2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Apr 2008 21:31:26 +0200 Subject: hrtimer: optimize the softirq time optimization The previous optimization did not take the case into account where a clock provides its own softirq_get_time() function. Check for the availablitiy of the clock get time function first and then check if we need to retrieve the time for both clocks via hrtimer_softirq_gettime() to avoid a double evaluation of time in that case as well. 
Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 70d4adc7463..f78777abe76 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1254,14 +1254,13 @@ void hrtimer_run_queues(void) if (!base->first) continue; - if (gettime) { + if (base->get_softirq_time) + base->softirq_time = base->get_softirq_time(); + else if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - spin_lock(&cpu_base->lock); while ((node = base->first)) { -- cgit v1.2.3-70-g09d2 From f5264481c8049673e2cc8c7aca410931f571ba2d Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 21 Apr 2008 22:15:06 +0000 Subject: trivial: small cleanups These are small cleanups all over the tree. Trivial style and comment changes to fs/select.c, kernel/signal.c, kernel/stop_machine.c & mm/pdflush.c Signed-off-by: Pavel Machek Signed-off-by: Jesper Juhl --- fs/select.c | 2 +- kernel/signal.c | 4 ++-- kernel/stop_machine.c | 3 +-- mm/pdflush.c | 4 ++-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/select.c b/fs/select.c index 5633fe98078..00f58c5c7e0 100644 --- a/fs/select.c +++ b/fs/select.c @@ -260,7 +260,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) wait = NULL; if (retval || !*timeout || signal_pending(current)) break; - if(table.error) { + if (table.error) { retval = table.error; break; } diff --git a/kernel/signal.c b/kernel/signal.c index cc8303cd093..64ad0ed1599 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -220,7 +220,7 @@ void flush_signals(struct task_struct *t) unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t,TIF_SIGPENDING); + clear_tsk_thread_flag(t, TIF_SIGPENDING); flush_sigqueue(&t->pending); flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -424,7 +424,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } if (signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && - info->si_sys_private){ + info->si_sys_private) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. Note, we leave diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70..316283cb60c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -135,8 +135,7 @@ static void restart_machine(void) preempt_enable_no_resched(); } -struct stop_machine_data -{ +struct stop_machine_data { int (*fn)(void *); void *data; struct completion done; diff --git a/mm/pdflush.c b/mm/pdflush.c index 8f6ee073c0e..3931f716454 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -17,8 +17,8 @@ #include #include #include -#include // Needed by writeback.h -#include // Prototypes pdflush_operation() +#include /* Needed by writeback.h */ +#include /* Prototypes pdflush_operation() */ #include #include #include -- cgit v1.2.3-70-g09d2 From e16b27816462de700f9508d86954410c41105dc2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 20 Apr 2008 13:10:12 -0700 Subject: ptrace: compat_ptrace_request siginfo This adds support for PTRACE_GETSIGINFO and PTRACE_SETSIGINFO in compat_ptrace_request. It relies on existing arch definitions for copy_siginfo_to_user32 and copy_siginfo_from_user32. 
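For context, a minimal tracer sketch (illustrative, not part of the patch) of the call this wires up for compat tracers: once the tracee stops with a signal, PTRACE_GETSIGINFO copies out its siginfo_t, including si_addr for faults.

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                *(volatile int *)0 = 0;         /* fault so si_addr gets set */
                _exit(1);
        }

        if (child > 0) {
                siginfo_t si;
                int status;

                waitpid(child, &status, 0);     /* signal-delivery stop */
                if (WIFSTOPPED(status) &&
                    ptrace(PTRACE_GETSIGINFO, child, NULL, &si) == 0)
                        printf("signo=%d code=%d addr=%p\n",
                               si.si_signo, si.si_code, si.si_addr);

                kill(child, SIGKILL);
                waitpid(child, &status, 0);
        }
        return 0;
}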
On powerpc, this fixes a longstanding regression of 32-bit ptrace calls on 64-bit kernels vs native calls (64-bit calls or 32-bit kernels). This can be seen in a 32-bit call using PTRACE_GETSIGINFO to examine e.g. siginfo_t.si_addr from a signal that sets it. (This was broken as of 2.6.24 and, I presume, many or all prior versions.) Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index fdb34e86f92..67e392ed549 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -323,9 +323,8 @@ static int ptrace_setoptions(struct task_struct *child, long data) return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; } -static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) +static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) { - siginfo_t lastinfo; int error = -ESRCH; read_lock(&tasklist_lock); @@ -333,31 +332,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) error = -EINVAL; spin_lock_irq(&child->sighand->siglock); if (likely(child->last_siginfo != NULL)) { - lastinfo = *child->last_siginfo; + *info = *child->last_siginfo; error = 0; } spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!error) - return copy_siginfo_to_user(data, &lastinfo); return error; } -static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) +static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) { - siginfo_t newinfo; int error = -ESRCH; - if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) - return -EFAULT; - read_lock(&tasklist_lock); if (likely(child->sighand != NULL)) { error = -EINVAL; spin_lock_irq(&child->sighand->siglock); if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = newinfo; + *child->last_siginfo = *info; error = 0; } spin_unlock_irq(&child->sighand->siglock); @@ -424,6 +417,7 @@ int ptrace_request(struct task_struct *child, long request, long addr, long data) { int ret = -EIO; + siginfo_t siginfo; switch (request) { case PTRACE_PEEKTEXT: @@ -442,12 +436,22 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GETEVENTMSG: ret = put_user(child->ptrace_message, (unsigned long __user *) data); break; + case PTRACE_GETSIGINFO: - ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user((siginfo_t __user *) data, + &siginfo); break; + case PTRACE_SETSIGINFO: - ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); + if (copy_from_user(&siginfo, (siginfo_t __user *) data, + sizeof siginfo)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_DETACH: /* detach a process that was attached. 
*/ ret = ptrace_detach(child, data); break; @@ -616,6 +620,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, { compat_ulong_t __user *datap = compat_ptr(data); compat_ulong_t word; + siginfo_t siginfo; int ret; switch (request) { @@ -638,6 +643,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, ret = put_user((compat_ulong_t) child->ptrace_message, datap); break; + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user32( + (struct compat_siginfo __user *) datap, + &siginfo); + break; + + case PTRACE_SETSIGINFO: + memset(&siginfo, 0, sizeof siginfo); + if (copy_siginfo_from_user32( + &siginfo, (struct compat_siginfo __user *) datap)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); + break; + default: ret = ptrace_request(child, request, addr, data); } -- cgit v1.2.3-70-g09d2 From 7c3f944e29c02d71e13442e977cf4cec19c39e98 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 21 Apr 2008 19:45:12 -0700 Subject: time: Export set_normalized_timespec. Sorry I have just realized set_normalized_timespec() (used in timespec_sub()) is not exported, and link will fail because of it... Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- kernel/time.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index a5ec013b6c8..35d373a9878 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -379,6 +379,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) ts->tv_sec = sec; ts->tv_nsec = nsec; } +EXPORT_SYMBOL(set_normalized_timespec); /** * ns_to_timespec - Convert nanoseconds to timespec -- cgit v1.2.3-70-g09d2 From 73486722b70a83bba17be722677519b0855abedf Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 22 Apr 2008 10:07:22 -0700 Subject: kernel-doc: fix sched.c missing parameter Add missing kernel-doc in kernel/sched.c: Warning(linux-2.6.25-git3//kernel/sched.c:7044): No description found for parameter 'span' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 57ba7ea9b74..0014b03adac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7035,6 +7035,7 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing + * @span: resulting cpumask * * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks -- cgit v1.2.3-70-g09d2 From 1ec7f1ddbe5ba49f7b10c3b129d6d5c90c43526c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Apr 2008 05:35:42 -0400 Subject: [PATCH] get rid of __exit_files(), __exit_fs() and __put_fs_struct() The only reason to have separated __...() for those was to keep them inlined for local users in exit.c. Since Alexey removed the inline on those, there's no reason whatsoever to keep them around; just collapse with normal variants. 
Signed-off-by: Al Viro --- kernel/exit.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 073005b1cfb..cece89f80ab 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -521,7 +521,7 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) } EXPORT_SYMBOL(reset_files_struct); -static void __exit_files(struct task_struct *tsk) +void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -533,12 +533,7 @@ static void __exit_files(struct task_struct *tsk) } } -void exit_files(struct task_struct *tsk) -{ - __exit_files(tsk); -} - -static void __put_fs_struct(struct fs_struct *fs) +void put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { @@ -550,12 +545,7 @@ static void __put_fs_struct(struct fs_struct *fs) } } -void put_fs_struct(struct fs_struct *fs) -{ - __put_fs_struct(fs); -} - -static void __exit_fs(struct task_struct *tsk) +void exit_fs(struct task_struct *tsk) { struct fs_struct * fs = tsk->fs; @@ -563,15 +553,10 @@ static void __exit_fs(struct task_struct *tsk) task_lock(tsk); tsk->fs = NULL; task_unlock(tsk); - __put_fs_struct(fs); + put_fs_struct(fs); } } -void exit_fs(struct task_struct *tsk) -{ - __exit_fs(tsk); -} - EXPORT_SYMBOL_GPL(exit_fs); /* @@ -967,8 +952,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); exit_sem(tsk); - __exit_files(tsk); - __exit_fs(tsk); + exit_files(tsk); + exit_fs(tsk); check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); -- cgit v1.2.3-70-g09d2 From 472613b961affef0c73f1c797993678312e7c666 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Thu, 24 Apr 2008 13:16:59 -0500 Subject: [IA64] fix bootmem regression on Altix A recent change prevents SGI Altix from booting. This patch fixes the problem. The regresson was introduced in commit 434d53b00d6bb7be0a1d3dcc0d0d5df6c042e164 Signed-off-by: Russ Anderson Signed-off-by: Tony Luck --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0014b03adac..09ca69b2c17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8128,7 +8128,7 @@ void __init sched_init(void) * we use alloc_bootmem(). 
*/ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem_low(alloc_size); + ptr = (unsigned long)alloc_bootmem(alloc_size); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; -- cgit v1.2.3-70-g09d2 From 03970f065d9b4b156d0e879f82989440f7045396 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 22 Apr 2008 10:04:26 -0700 Subject: [PATCH] Build fix for CONFIG_NUMA=y && CONFIG_SMP=n Regression caused by 434d53b00d6bb7be0a1d3dcc0d0d5df6c042e164 Signed-off-by: Mike Travis Signed-off-by: Tony Luck --- kernel/sched.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 09ca69b2c17..781870da598 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7991,11 +7991,6 @@ void __init sched_init_smp(void) #else void __init sched_init_smp(void) { -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif sched_init_granularity(); } #endif /* CONFIG_SMP */ -- cgit v1.2.3-70-g09d2 From 126e01bf92dfc5f0ba91e88be02c473e1506d7d9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Apr 2008 00:25:08 +0200 Subject: softlockup: fix NOHZ wakeup David Miller reported: |---------------> the following commit: | commit 27ec4407790d075c325e1f4da0a19c56953cce23 | Author: Ingo Molnar | Date: Thu Feb 28 21:00:21 2008 +0100 | | sched: make cpu_clock() globally synchronous | | Alexey Zaytsev reported (and bisected) that the introduction of | cpu_clock() in printk made the timestamps jump back and forth. | | Make cpu_clock() more reliable while still keeping it fast when it's | called frequently. | | Signed-off-by: Ingo Molnar causes watchdog triggers when a cpu exits NOHZ state when it has been there for >= the soft lockup threshold, for example here are some messages from a 128 cpu Niagara2 box: [ 168.106406] BUG: soft lockup - CPU#11 stuck for 128s! [dd:3239] [ 168.989592] BUG: soft lockup - CPU#21 stuck for 86s! [swapper:0] [ 168.999587] BUG: soft lockup - CPU#29 stuck for 91s! [make:4511] [ 168.999615] BUG: soft lockup - CPU#2 stuck for 85s! [swapper:0] [ 169.020514] BUG: soft lockup - CPU#37 stuck for 91s! [swapper:0] [ 169.020514] BUG: soft lockup - CPU#45 stuck for 91s! [sh:4515] [ 169.020515] BUG: soft lockup - CPU#69 stuck for 92s! [swapper:0] [ 169.020515] BUG: soft lockup - CPU#77 stuck for 92s! [swapper:0] [ 169.020515] BUG: soft lockup - CPU#61 stuck for 92s! [swapper:0] [ 169.112554] BUG: soft lockup - CPU#85 stuck for 92s! [swapper:0] [ 169.112554] BUG: soft lockup - CPU#101 stuck for 92s! [swapper:0] [ 169.112554] BUG: soft lockup - CPU#109 stuck for 92s! [swapper:0] [ 169.112554] BUG: soft lockup - CPU#117 stuck for 92s! [swapper:0] [ 169.171483] BUG: soft lockup - CPU#40 stuck for 80s! [dd:3239] [ 169.331483] BUG: soft lockup - CPU#13 stuck for 86s! [swapper:0] [ 169.351500] BUG: soft lockup - CPU#43 stuck for 101s! [dd:3239] [ 169.531482] BUG: soft lockup - CPU#9 stuck for 129s! [mkdir:4565] [ 169.595754] BUG: soft lockup - CPU#20 stuck for 93s! [swapper:0] [ 169.626787] BUG: soft lockup - CPU#52 stuck for 93s! [swapper:0] [ 169.626787] BUG: soft lockup - CPU#84 stuck for 92s! [swapper:0] [ 169.636812] BUG: soft lockup - CPU#116 stuck for 94s! [swapper:0] It's simple enough to trigger this by doing a 10 minute sleep after a fresh bootup then starting a parallel kernel build. 
I suspect this might be reintroducing a problem we've had and fixed before, see the thread: http://marc.info/?l=linux-kernel&m=119546414004065&w=2 <---------------| touch the softlockup watchdog when exiting NOHZ state - we are obviously not locked up. Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d358d4e3a95..b854a895591 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -393,6 +393,7 @@ void tick_nohz_restart_sched_tick(void) sub_preempt_count(HARDIRQ_OFFSET); } + touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick */ -- cgit v1.2.3-70-g09d2 From 3f5087a2bae5d1ce10a3d698dec8f879a96f5419 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Apr 2008 00:25:08 +0200 Subject: sched: fix share (re)distribution fix __aggregate_redistribute_shares() related lockup reported by David S. Miller. The problem this code tries to solve is 'accurately' calculating the 'fair' share of the group weight for each cpu. The current code falls back to a global group rebalance in case the sched_domain's span it looks at has no shares, but does have tasks. The reason it gets stuck here is that it's inherently racy - if someone steals the last task after we compute the agg->rq_weight, but before we rebalance, we'll never get out of the loop. We could of course go fix that, but while looking at this issue I found that this 'fallback' wasn't nearly as rare as I'd hoped it to be. In fact it's quite common - and given it walks the whole machine, that's very bad. The new approach is simple (why didn't I think of it before?): we set the aggregate shares to the full task group weight, and each larger sched domain that encounters an aggregate shares value larger than the weight clips it (it already re-distributes anyway). This nicely converges to the desired global picture where the sum of all shares equals the task group weight. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 47 ++--------------------------------------------- 1 file changed, 2 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0014b03adac..85e1721594f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1656,42 +1656,6 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) aggregate(tg, sd)->task_weight = task_weight; } -/* - * Redistribute tg->shares amongst all tg->cfs_rq[]s. - */ -static void __aggregate_redistribute_shares(struct task_group *tg) -{ - int i, max_cpu = smp_processor_id(); - unsigned long rq_weight = 0; - unsigned long shares, max_shares = 0, shares_rem = tg->shares; - - for_each_possible_cpu(i) - rq_weight += tg->cfs_rq[i]->load.weight; - - for_each_possible_cpu(i) { - /* - * divide shares proportional to the rq_weights. - */ - shares = tg->shares * tg->cfs_rq[i]->load.weight; - shares /= rq_weight + 1; - - tg->cfs_rq[i]->shares = shares; - - if (shares > max_shares) { - max_shares = shares; - max_cpu = i; - } - shares_rem -= shares; - } - - /* - * Ensure it all adds up to tg->shares; we can loose a few - * due to rounding down when computing the per-cpu shares. - */ - if (shares_rem) - tg->cfs_rq[max_cpu]->shares += shares_rem; -} - /* * Compute the weight of this group on the given cpus.
*/ @@ -1701,18 +1665,11 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) unsigned long shares = 0; int i; -again: for_each_cpu_mask(i, sd->span) shares += tg->cfs_rq[i]->shares; - /* - * When the span doesn't have any shares assigned, but does have - * tasks to run do a machine wide rebalance (should be rare). - */ - if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) { - __aggregate_redistribute_shares(tg); - goto again; - } + if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + shares = tg->shares; aggregate(tg, sd)->shares = shares; } -- cgit v1.2.3-70-g09d2 From 5a9d3225a0d7060bdf3a18018992dc8cef958425 Mon Sep 17 00:00:00 2001 From: David Miller Date: Thu, 24 Apr 2008 20:46:20 -0700 Subject: sched: use alloc_bootmem() instead of alloc_bootmem_low() There is no guarantee that there is physical ram below 4GB, and in fact many boxes don't have exactly that. Signed-off-by: David S. Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0014b03adac..09ca69b2c17 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8128,7 +8128,7 @@ void __init sched_init(void) * we use alloc_bootmem(). */ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem_low(alloc_size); + ptr = (unsigned long)alloc_bootmem(alloc_size); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; -- cgit v1.2.3-70-g09d2 From 6b335d9c80d7f3c2a3f6545f664ae9007a0f3821 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Apr 2008 04:45:46 -0400 Subject: [PATCH] close race in unshare_files() updating current->files requires task_lock Signed-off-by: Al Viro --- kernel/fork.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 89fe414645e..76f05a08062 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -805,12 +805,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) goto out; } - /* - * Note: we may be using current for both targets (See exec.c) - * This works because we cache current->files (old) as oldf. Don't - * break this. - */ - tsk->files = NULL; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -855,7 +849,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) int unshare_files(void) { struct files_struct *files = current->files; - int rc; + struct files_struct *newf; + int error = 0; BUG_ON(!files); @@ -866,10 +861,13 @@ int unshare_files(void) atomic_inc(&files->count); return 0; } - rc = copy_files(0, current); - if(rc) - current->files = files; - return rc; + newf = dup_fd(files, &error); + if (newf) { + task_lock(current); + current->files = newf; + task_unlock(current); + } + return error; } EXPORT_SYMBOL(unshare_files); -- cgit v1.2.3-70-g09d2 From fd8328be874f4190a811c58cd4778ec2c74d2c05 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Apr 2008 05:11:59 -0400 Subject: [PATCH] sanitize handling of shared descriptor tables in failing execve() * unshare_files() can fail; doing it after irreversible actions is wrong and de_thread() is certainly irreversible. * since we do it unconditionally anyway, we might as well do it in do_execve() and save ourselves the PITA in binfmt handlers, etc. * while we are at it, binfmt_som actually leaked files_struct on failure. As a side benefit, unshare_files(), put_files_struct() and reset_files_struct() become unexported. 
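As an orientation aid only, here is a condensed sketch of the control flow do_execve() ends up with once the unshare is hoisted out of the binfmt handlers. do_execve_sketch() and load_binary() are names invented for this sketch; the files_struct helpers are the ones used in the diff that follows.

/*
 * Sketch of the reworked do_execve() error handling: the old files_struct
 * is kept only so it can be dropped on success or restored on failure.
 */
int do_execve_sketch(char *filename)
{
	struct files_struct *files = current->files;
	int retval;

	retval = unshare_files();	/* done before anything irreversible */
	if (retval)
		return retval;

	if (files == current->files) {	/* table was not shared after all */
		put_files_struct(files);
		files = NULL;
	}

	retval = load_binary(filename);	/* placeholder for bprm setup + binfmt search */
	if (retval >= 0) {
		if (files)
			put_files_struct(files);	/* success: old table goes away */
		return retval;
	}

	if (files)
		reset_files_struct(current, files);	/* failure: restore the old table */
	return retval;
}

The real patch keeps the bprm allocation, open_exec() and security hooks between those two points; only the files_struct ownership is shown here.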
Signed-off-by: Al Viro --- fs/binfmt_elf.c | 23 +---------------------- fs/binfmt_misc.c | 18 +----------------- fs/binfmt_som.c | 10 ---------- fs/exec.c | 34 ++++++++++++++++++---------------- kernel/exit.c | 3 --- kernel/fork.c | 2 -- 6 files changed, 20 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5e1a4fb5cac..9924581df6f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -543,7 +543,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc = 0; - struct files_struct *files; int executable_stack = EXSTACK_DEFAULT; unsigned long def_flags = 0; struct { @@ -593,20 +592,9 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_ph; } - files = current->files; /* Refcounted so ok */ - retval = unshare_files(); - if (retval < 0) - goto out_free_ph; - if (files == current->files) { - put_files_struct(files); - files = NULL; - } - - /* exec will make our files private anyway, but for the a.out - loader stuff we need to do it earlier */ retval = get_unused_fd(); if (retval < 0) - goto out_free_fh; + goto out_free_ph; get_file(bprm->file); fd_install(elf_exec_fileno = retval, bprm->file); @@ -728,12 +716,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) goto out_free_dentry; - /* Discard our unneeded old files struct */ - if (files) { - put_files_struct(files); - files = NULL; - } - /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; @@ -1016,9 +998,6 @@ out_free_interp: kfree(elf_interpreter); out_free_file: sys_close(elf_exec_fileno); -out_free_fh: - if (files) - reset_files_struct(current, files); out_free_ph: kfree(elf_phdata); goto out; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index b53c7e5f41b..dbf0ac0523d 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -110,7 +110,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) char *iname_addr = iname; int retval; int fd_binary = -1; - struct files_struct *files = NULL; retval = -ENOEXEC; if (!enabled) @@ -133,21 +132,13 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (fmt->flags & MISC_FMT_OPEN_BINARY) { - files = current->files; - retval = unshare_files(); - if (retval < 0) - goto _ret; - if (files == current->files) { - put_files_struct(files); - files = NULL; - } /* if the binary should be opened on behalf of the * interpreter than keep it open and assign descriptor * to it */ fd_binary = get_unused_fd(); if (fd_binary < 0) { retval = fd_binary; - goto _unshare; + goto _ret; } fd_install(fd_binary, bprm->file); @@ -205,10 +196,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; - if (files) { - put_files_struct(files); - files = NULL; - } _ret: return retval; _error: @@ -216,9 +203,6 @@ _error: sys_close(fd_binary); bprm->interp_flags = 0; bprm->interp_data = 0; -_unshare: - if (files) - reset_files_struct(current, files); goto _ret; } diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 14c63527c76..fdc36bfd6a7 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -194,7 +194,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) unsigned long som_entry; struct som_hdr *som_ex; struct som_exec_auxhdr *hpuxhdr; - struct files_struct *files; /* Get 
the exec-header */ som_ex = (struct som_hdr *) bprm->buf; @@ -221,15 +220,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free; } - files = current->files; /* Refcounted so ok */ - retval = unshare_files(); - if (retval < 0) - goto out_free; - if (files == current->files) { - put_files_struct(files); - files = NULL; - } - retval = get_unused_fd(); if (retval < 0) goto out_free; diff --git a/fs/exec.c b/fs/exec.c index 54a0a557b67..475543002f1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -953,7 +953,6 @@ int flush_old_exec(struct linux_binprm * bprm) { char * name; int i, ch, retval; - struct files_struct *files; char tcomm[sizeof(current->comm)]; /* @@ -964,27 +963,16 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; - /* - * Make sure we have private file handles. Ask the - * fork helper to do the work for us and the exit - * helper to do the cleanup of the old one. - */ - files = current->files; /* refcounted so safe to hold */ - retval = unshare_files(); - if (retval) - goto out; /* * Release all of the old mmap stuff */ retval = exec_mmap(bprm->mm); if (retval) - goto mmap_failed; + goto out; bprm->mm = NULL; /* We're using it now */ /* This is the point of no return */ - put_files_struct(files); - current->sas_ss_sp = current->sas_ss_size = 0; if (current->euid == current->uid && current->egid == current->gid) @@ -1034,8 +1022,6 @@ int flush_old_exec(struct linux_binprm * bprm) return 0; -mmap_failed: - reset_files_struct(current, files); out: return retval; } @@ -1283,12 +1269,23 @@ int do_execve(char * filename, struct linux_binprm *bprm; struct file *file; unsigned long env_p; + struct files_struct *files; int retval; + files = current->files; + retval = unshare_files(); + if (retval) + goto out_ret; + + if (files == current->files) { + put_files_struct(files); + files = NULL; + } + retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) - goto out_ret; + goto out_files; file = open_exec(filename); retval = PTR_ERR(file); @@ -1343,6 +1340,8 @@ int do_execve(char * filename, security_bprm_free(bprm); acct_update_integrals(current); kfree(bprm); + if (files) + put_files_struct(files); return retval; } @@ -1363,6 +1362,9 @@ out_file: out_kfree: kfree(bprm); +out_files: + if (files) + reset_files_struct(current, files); out_ret: return retval; } diff --git a/kernel/exit.c b/kernel/exit.c index cece89f80ab..3d320003cc0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -507,8 +507,6 @@ void put_files_struct(struct files_struct *files) } } -EXPORT_SYMBOL(put_files_struct); - void reset_files_struct(struct task_struct *tsk, struct files_struct *files) { struct files_struct *old; @@ -519,7 +517,6 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) task_unlock(tsk); put_files_struct(old); } -EXPORT_SYMBOL(reset_files_struct); void exit_files(struct task_struct *tsk) { diff --git a/kernel/fork.c b/kernel/fork.c index 76f05a08062..2fc11f2e2b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -870,8 +870,6 @@ int unshare_files(void) return error; } -EXPORT_SYMBOL(unshare_files); - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; -- cgit v1.2.3-70-g09d2 From 3b1253880b7a9e6db54b943b2d40bcf2202f58ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Apr 2008 05:31:30 -0400 Subject: [PATCH] sanitize unshare_files/reset_files_struct * let unshare_files() give caller the displaced files_struct * don't bother with grabbing reference only to drop it 
in the caller if it hadn't been shared in the first place * in that form unshare_files() is trivially implemented via unshare_fd(), so we eliminate the duplicate logics in fork.c * reset_files_struct() is not just only called for current; it will break the system if somebody ever calls it for anything else (we can't modify ->files of somebody else). Lose the task_struct * argument. Signed-off-by: Al Viro --- fs/exec.c | 18 ++++++------------ include/linux/file.h | 3 ++- include/linux/fs.h | 3 --- kernel/exit.c | 3 ++- kernel/fork.c | 54 +++++++++++++++++++++++----------------------------- 5 files changed, 34 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 475543002f1..b152029f18f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1269,19 +1269,13 @@ int do_execve(char * filename, struct linux_binprm *bprm; struct file *file; unsigned long env_p; - struct files_struct *files; + struct files_struct *displaced; int retval; - files = current->files; - retval = unshare_files(); + retval = unshare_files(&displaced); if (retval) goto out_ret; - if (files == current->files) { - put_files_struct(files); - files = NULL; - } - retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) @@ -1340,8 +1334,8 @@ int do_execve(char * filename, security_bprm_free(bprm); acct_update_integrals(current); kfree(bprm); - if (files) - put_files_struct(files); + if (displaced) + put_files_struct(displaced); return retval; } @@ -1363,8 +1357,8 @@ out_kfree: kfree(bprm); out_files: - if (files) - reset_files_struct(current, files); + if (displaced) + reset_files_struct(displaced); out_ret: return retval; } diff --git a/include/linux/file.h b/include/linux/file.h index 653477021e4..69baf5a4f0a 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -117,7 +117,8 @@ struct task_struct; struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); -void reset_files_struct(struct task_struct *, struct files_struct *); +void reset_files_struct(struct files_struct *); +int unshare_files(struct files_struct **); extern struct kmem_cache *files_cachep; diff --git a/include/linux/fs.h b/include/linux/fs.h index ad41d0bbcb4..e057438a05a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2033,9 +2033,6 @@ static inline ino_t parent_ino(struct dentry *dentry) return res; } -/* kernel/fork.c */ -extern int unshare_files(void); - /* Transaction based IO helpers */ /* diff --git a/kernel/exit.c b/kernel/exit.c index 3d320003cc0..97f609f574b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -507,8 +507,9 @@ void put_files_struct(struct files_struct *files) } } -void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +void reset_files_struct(struct files_struct *files) { + struct task_struct *tsk = current; struct files_struct *old; old = tsk->files; diff --git a/kernel/fork.c b/kernel/fork.c index 2fc11f2e2b2..efb618fc8ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -840,36 +840,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return 0; } -/* - * Helper to unshare the files of the current task. - * We don't want to expose copy_files internals to - * the exec layer of the kernel. 
- */ - -int unshare_files(void) -{ - struct files_struct *files = current->files; - struct files_struct *newf; - int error = 0; - - BUG_ON(!files); - - /* This can race but the race causes us to copy when we don't - need to and drop the copy */ - if(atomic_read(&files->count) == 1) - { - atomic_inc(&files->count); - return 0; - } - newf = dup_fd(files, &error); - if (newf) { - task_lock(current); - current->files = newf; - task_unlock(current); - } - return error; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -1807,3 +1777,27 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +/* + * Helper to unshare the files of the current task. + * We don't want to expose copy_files internals to + * the exec layer of the kernel. + */ + +int unshare_files(struct files_struct **displaced) +{ + struct task_struct *task = current; + struct files_struct *copy; + int error; + + error = unshare_fd(CLONE_FILES, ©); + if (error || !copy) { + *displaced = NULL; + return error; + } + *displaced = task->files; + task_lock(task); + task->files = copy; + task_unlock(task); + return 0; +} -- cgit v1.2.3-70-g09d2 From 50704516f334d5036c09b0ecc0064598f7c5596f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Apr 2008 05:25:00 +0100 Subject: Fix uninitialized 'copy' in unshare_files Arrgghhh... Sorry about that, I'd been sure I'd folded that one, but it actually got lost. Please apply - that breaks execve(). Signed-off-by: Al Viro Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index efb618fc8ff..cb46befdd3a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1787,7 +1787,7 @@ bad_unshare_out: int unshare_files(struct files_struct **displaced) { struct task_struct *task = current; - struct files_struct *copy; + struct files_struct *copy = NULL; int error; error = unshare_fd(CLONE_FILES, ©); -- cgit v1.2.3-70-g09d2 From 402b08622d9ac6e32e25289573272e0f21bb58a7 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Tue, 25 Mar 2008 18:47:10 +0100 Subject: s390: KVM preparation: provide hook to enable pgstes in user pagetable The SIE instruction on s390 uses the 2nd half of the page table page to virtualize the storage keys of a guest. This patch offers the s390_enable_sie function, which reorganizes the page tables of a single-threaded process to reserve space in the page table: s390_enable_sie makes sure that the process is single threaded and then uses dup_mm to create a new mm with reorganized page tables. The old mm is freed and the process has now a page status extended field after every page table. Code that wants to exploit pgstes should SELECT CONFIG_PGSTE. This patch has a small common code hit, namely making dup_mm non-static. Edit (Carsten): I've modified Martin's patch, following Jeremy Fitzhardinge's review feedback. Now we do have the prototype for dup_mm in include/linux/sched.h. Following Martin's suggestion, s390_enable_sie() does now call task_lock() to prevent race against ptrace modification of mm_users. 
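As a usage note, a minimal sketch of how a hypervisor is expected to consume the new hook; example_create_s390_guest() is an invented name, the real consumer is the s390 KVM code merged separately.

/*
 * SIE needs the page status extensions, so the host process has to switch
 * to pgste-enabled page tables before its first guest is created.
 */
static int example_create_s390_guest(void)
{
	int rc;

	rc = s390_enable_sie();	/* rebuilds current->mm via dup_mm() */
	if (rc)
		return rc;	/* -EINVAL for multi-threaded callers, -ENOMEM, ... */

	/* ... allocate the SIE control block and set up guest memory ... */
	return 0;
}

Because the helper replaces the whole mm, it is only valid for a single-threaded task, which is why s390_enable_sie() checks mm_users and active_mm under task_lock().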
Signed-off-by: Martin Schwidefsky Signed-off-by: Carsten Otte Acked-by: Andrew Morton Signed-off-by: Avi Kivity --- arch/s390/Kconfig | 4 +++ arch/s390/kernel/setup.c | 4 +++ arch/s390/mm/pgtable.c | 65 ++++++++++++++++++++++++++++++++++++++++-- include/asm-s390/mmu.h | 1 + include/asm-s390/mmu_context.h | 8 +++++- include/asm-s390/pgtable.h | 1 + include/linux/sched.h | 2 ++ kernel/fork.c | 2 +- 8 files changed, 82 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f6a68e178fc..513a0589e81 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -62,6 +62,10 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT +config PGSTE + bool + default y if KVM + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 7141147e6b6..2f35133ebc1 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -316,7 +316,11 @@ static int __init early_parse_ipldelay(char *p) early_param("ipldelay", early_parse_ipldelay); #ifdef CONFIG_S390_SWITCH_AMODE +#ifdef CONFIG_PGSTE +unsigned int switch_amode = 1; +#else unsigned int switch_amode = 0; +#endif EXPORT_SYMBOL_GPL(switch_amode); static void set_amode_and_uaccess(unsigned long user_amode, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index fd072013f88..5c1aea97cd1 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -30,11 +30,27 @@ #define TABLES_PER_PAGE 4 #define FRAG_MASK 15UL #define SECOND_HALVES 10UL + +void clear_table_pgstes(unsigned long *table) +{ + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); + memset(table + 256, 0, PAGE_SIZE/4); + clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); + memset(table + 768, 0, PAGE_SIZE/4); +} + #else #define ALLOC_ORDER 2 #define TABLES_PER_PAGE 2 #define FRAG_MASK 3UL #define SECOND_HALVES 2UL + +void clear_table_pgstes(unsigned long *table) +{ + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); + memset(table + 256, 0, PAGE_SIZE/2); +} + #endif unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) @@ -153,7 +169,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) unsigned long *table; unsigned long bits; - bits = mm->context.noexec ? 3UL : 1UL; + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; spin_lock(&mm->page_table_lock); page = NULL; if (!list_empty(&mm->context.pgtable_list)) { @@ -170,7 +186,10 @@ unsigned long *page_table_alloc(struct mm_struct *mm) pgtable_page_ctor(page); page->flags &= ~FRAG_MASK; table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); + if (mm->context.pgstes) + clear_table_pgstes(table); + else + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); spin_lock(&mm->page_table_lock); list_add(&page->lru, &mm->context.pgtable_list); } @@ -191,7 +210,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) struct page *page; unsigned long bits; - bits = mm->context.noexec ? 3UL : 1UL; + bits = (mm->context.noexec || mm->context.pgstes) ? 
3UL : 1UL; bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); page = pfn_to_page(__pa(table) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); @@ -228,3 +247,43 @@ void disable_noexec(struct mm_struct *mm, struct task_struct *tsk) mm->context.noexec = 0; update_mm(mm, tsk); } + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct task_struct *tsk = current; + struct mm_struct *mm; + int rc; + + task_lock(tsk); + + rc = 0; + if (tsk->mm->context.pgstes) + goto unlock; + + rc = -EINVAL; + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) + goto unlock; + + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ + mm = dup_mm(tsk); + tsk->mm->context.pgstes = 0; + + rc = -ENOMEM; + if (!mm) + goto unlock; + mmput(tsk->mm); + tsk->mm = tsk->active_mm = mm; + preempt_disable(); + update_mm(mm, tsk); + cpu_set(smp_processor_id(), mm->cpu_vm_mask); + preempt_enable(); + rc = 0; +unlock: + task_unlock(tsk); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); diff --git a/include/asm-s390/mmu.h b/include/asm-s390/mmu.h index 1698e29c5b2..5dd5e7b3476 100644 --- a/include/asm-s390/mmu.h +++ b/include/asm-s390/mmu.h @@ -7,6 +7,7 @@ typedef struct { unsigned long asce_bits; unsigned long asce_limit; int noexec; + int pgstes; } mm_context_t; #endif diff --git a/include/asm-s390/mmu_context.h b/include/asm-s390/mmu_context.h index b5a34c6f91a..4c2fbf48c9c 100644 --- a/include/asm-s390/mmu_context.h +++ b/include/asm-s390/mmu_context.h @@ -20,7 +20,13 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_64BIT mm->context.asce_bits |= _ASCE_TYPE_REGION3; #endif - mm->context.noexec = s390_noexec; + if (current->mm->context.pgstes) { + mm->context.noexec = 0; + mm->context.pgstes = 1; + } else { + mm->context.noexec = s390_noexec; + mm->context.pgstes = 0; + } mm->context.asce_limit = STACK_TOP_MAX; crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); return 0; diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h index 65154dc9a9e..8e9a629dc19 100644 --- a/include/asm-s390/pgtable.h +++ b/include/asm-s390/pgtable.h @@ -966,6 +966,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) extern int add_shared_memory(unsigned long start, unsigned long size); extern int remove_shared_memory(unsigned long start, unsigned long size); +extern int s390_enable_sie(void); /* * No page table caches to initialise diff --git a/include/linux/sched.h b/include/linux/sched.h index d0bd97044ab..9a4f3e63e3b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1798,6 +1798,8 @@ extern void mmput(struct mm_struct *); extern struct mm_struct *get_task_mm(struct task_struct *task); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +/* Allocate a new mm structure and copy contents from tsk->mm */ +extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); extern void flush_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index cb46befdd3a..c674aa8d3c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -521,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. 
*/ -static struct mm_struct *dup_mm(struct task_struct *tsk) +struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; -- cgit v1.2.3-70-g09d2 From d7b41a24bfb5d7fa02f7b49be1293d468814e424 Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Sat, 26 Apr 2008 14:10:16 -0700 Subject: hrtimer: timeout too long when using HRTIMER_CB_SOFTIRQ When using hrtimer with timer->cb_mode == HRTIMER_CB_SOFTIRQ, in some cases the clockevent is not programmed. This happens if: - a timer is rearmed while its state is HRTIMER_STATE_CALLBACK - hrtimer_reprogram() returns -ETIME, when it is called after CALLBACK is finished. This occurs if the new timer->expires is in the past when CALLBACK is done. In this case, the timer needs to be removed from the tree and put onto the pending list again. The patch is against 2.6.22.5, but AFAICS, it is relevant for 2.6.25 also (in run_hrtimer_pending()). Signed-off-by: Bodo Stroesser Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f78777abe76..e379ef0e9c2 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1080,8 +1080,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) * If the timer was rearmed on another CPU, reprogram * the event device. */ - if (timer->base->first == &timer->node) - hrtimer_reprogram(timer, timer->base); + struct hrtimer_clock_base *base = timer->base; + + if (base->first == &timer->node && + hrtimer_reprogram(timer, base)) { + /* + * Timer is expired. Thus move it from tree to + * pending list again. + */ + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + } } } spin_unlock_irq(&cpu_base->lock); -- cgit v1.2.3-70-g09d2 From dd1a239f6f2d4d3eedd318583ec319aa145b324c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:17 -0700 Subject: mm: have zonelist contains structs with both a zone pointer and zone_idx Filtering zonelists requires very frequent use of zone_idx(). This is costly as it involves a lookup of another structure and a subtraction operation. As the zone_idx is often required, it should be quickly accessible. The node idx could also be stored here if it was found that accessing zone->node is significant, which may be the case on workloads where nodemasks are heavily used. This patch introduces a struct zoneref to store a zone pointer and a zone index. The zonelist then consists of an array of these struct zonerefs which are looked up as necessary. Helpers are given for accessing the zone index as well as the node index.
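To make the conversion below easier to follow, a small sketch of a zonelist walk once zonerefs are in place; sketch_present_pages() is invented for illustration, the iterator and helpers are the ones added by this patch.

/*
 * Walk every zone in a zonelist at or below high_zoneidx using the
 * zoneref cursor introduced by this patch.
 */
static unsigned long sketch_present_pages(struct zonelist *zonelist,
					  enum zone_type high_zoneidx)
{
	struct zoneref *z;		/* cursor into zonelist->_zonerefs */
	struct zone *zone;
	unsigned long pages = 0;

	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
		pages += zone->present_pages;

	return pages;
}

The allocator fast path does the same thing by hand: first_zones_zonelist() returns the cursor, and zonelist_zone() / zonelist_zone_idx() give the preferred zone and the classzone index, exactly as in the get_page_from_freelist() hunk further down.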
[kamezawa.hiroyu@jp.fujitsu.com: Suggested struct zoneref instead of embedding information in pointers] [hugh@veritas.com: mm-have-zonelist: fix memcg ooms] [hugh@veritas.com: just return do_try_to_free_pages] [hugh@veritas.com: do_try_to_free_pages gfp_mask redundant] Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Christoph Lameter Cc: Nick Piggin Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/mm/init.c | 2 +- fs/buffer.c | 6 ++--- include/linux/mmzone.h | 64 +++++++++++++++++++++++++++++++++++++++-------- include/linux/oom.h | 4 +-- kernel/cpuset.c | 4 +-- mm/hugetlb.c | 3 ++- mm/mempolicy.c | 36 +++++++++++++++----------- mm/oom_kill.c | 45 ++++++++++++++++----------------- mm/page_alloc.c | 68 ++++++++++++++++++++++++++++---------------------- mm/slab.c | 2 +- mm/slub.c | 2 +- mm/vmscan.c | 22 ++++++++-------- 12 files changed, 158 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 9bb6136d77c..1f012843150 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -608,7 +608,7 @@ void show_mem(void) for (i = 0; i < npmem_ranges; i++) { zl = node_zonelist(i); for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone **z; + struct zoneref *z; struct zone *zone; printk("Zone list for zone %d on node %d: ", j, i); diff --git a/fs/buffer.c b/fs/buffer.c index 9b5434a8047..ac84cd13075 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -360,16 +360,16 @@ void invalidate_bdev(struct block_device *bdev) */ static void free_more_memory(void) { - struct zone **zones; + struct zoneref *zrefs; int nid; wakeup_pdflush(1024); yield(); for_each_online_node(nid) { - zones = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), gfp_zone(GFP_NOFS)); - if (*zones) + if (zrefs->zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d5c33a0b89e..d34b4c29001 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -468,6 +468,15 @@ struct zonelist_cache { struct zonelist_cache; #endif +/* + * This struct contains information about a zone in a zonelist. It is stored + * here to avoid dereferences into large structures and lookups of tables + */ +struct zoneref { + struct zone *zone; /* Pointer to actual zone */ + int zone_idx; /* zone_idx(zoneref->zone) */ +}; + /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the @@ -476,11 +485,18 @@ struct zonelist_cache; * * If zlcache_ptr is not NULL, then it is just the address of zlcache, * as explained above. If zlcache_ptr is NULL, there is no zlcache. + * * + * To speed the reading of the zonelist, the zonerefs contain the zone index + * of the entry being read. Helper functions to access information given + * a struct zoneref are + * + * zonelist_zone() - Return the struct zone * for an entry in _zonerefs + * zonelist_zone_idx() - Return the index of the zone for an entry + * zonelist_node_idx() - Return the index of the node for an entry */ - struct zonelist { struct zonelist_cache *zlcache_ptr; // NULL or &zlcache - struct zone *zones[MAX_ZONES_PER_ZONELIST + 1]; // NULL delimited + struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; #ifdef CONFIG_NUMA struct zonelist_cache zlcache; // optional ... 
#endif @@ -713,26 +729,52 @@ extern struct zone *next_zone(struct zone *zone); zone; \ zone = next_zone(zone)) +static inline struct zone *zonelist_zone(struct zoneref *zoneref) +{ + return zoneref->zone; +} + +static inline int zonelist_zone_idx(struct zoneref *zoneref) +{ + return zoneref->zone_idx; +} + +static inline int zonelist_node_idx(struct zoneref *zoneref) +{ +#ifdef CONFIG_NUMA + /* zone_to_nid not available in this context */ + return zoneref->zone->node; +#else + return 0; +#endif /* CONFIG_NUMA */ +} + +static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) +{ + zoneref->zone = zone; + zoneref->zone_idx = zone_idx(zone); +} + /* Returns the first zone at or below highest_zoneidx in a zonelist */ -static inline struct zone **first_zones_zonelist(struct zonelist *zonelist, +static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx) { - struct zone **z; + struct zoneref *z; /* Find the first suitable zone to use for the allocation */ - z = zonelist->zones; - while (*z && zone_idx(*z) > highest_zoneidx) + z = zonelist->_zonerefs; + while (zonelist_zone_idx(z) > highest_zoneidx) z++; return z; } /* Returns the next zone at or below highest_zoneidx in a zonelist */ -static inline struct zone **next_zones_zonelist(struct zone **z, +static inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx) { /* Find the next suitable zone to use for the allocation */ - while (*z && zone_idx(*z) > highest_zoneidx) + while (zonelist_zone_idx(z) > highest_zoneidx) z++; return z; @@ -748,9 +790,11 @@ static inline struct zone **next_zones_zonelist(struct zone **z, * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ - for (z = first_zones_zonelist(zlist, highidx), zone = *z++; \ + for (z = first_zones_zonelist(zlist, highidx), \ + zone = zonelist_zone(z++); \ zone; \ - z = next_zones_zonelist(z, highidx), zone = *z++) + z = next_zones_zonelist(z, highidx), \ + zone = zonelist_zone(z++)) #ifdef CONFIG_SPARSEMEM #include diff --git a/include/linux/oom.h b/include/linux/oom.h index 3852436b652..a7979baf1e3 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -23,8 +23,8 @@ enum oom_constraint { CONSTRAINT_MEMORY_POLICY, }; -extern int try_set_zone_oom(struct zonelist *zonelist); -extern void clear_zonelist_oom(struct zonelist *zonelist); +extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); extern int register_oom_notifier(struct notifier_block *nb); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8b35fbd8292..a220b13cbfa 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1967,8 +1967,8 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) { int i; - for (i = 0; zl->zones[i]; i++) { - int nid = zone_to_nid(zl->zones[i]); + for (i = 0; zl->_zonerefs[i].zone; i++) { + int nid = zonelist_node_idx(&zl->_zonerefs[i]); if (node_isset(nid, current->mems_allowed)) return 1; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ddd141cad77..4bced0d705c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -97,7 +97,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct mempolicy *mpol; struct zonelist *zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol); - struct zone *zone, **z; + struct 
zone *zone; + struct zoneref *z; for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) { nid = zone_to_nid(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5d20bf44062..90193a2a915 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -186,7 +186,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; if (z->present_pages > 0) - zl->zones[num++] = z; + zoneref_set_zone(z, &zl->_zonerefs[num++]); } if (k == 0) break; @@ -196,7 +196,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) kfree(zl); return ERR_PTR(-EINVAL); } - zl->zones[num] = NULL; + zl->_zonerefs[num].zone = NULL; + zl->_zonerefs[num].zone_idx = 0; return zl; } @@ -504,9 +505,11 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) nodes_clear(*nodes); switch (p->policy) { case MPOL_BIND: - for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(zone_to_nid(p->v.zonelist->zones[i]), - *nodes); + for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) { + struct zoneref *zref; + zref = &p->v.zonelist->_zonerefs[i]; + node_set(zonelist_node_idx(zref), *nodes); + } break; case MPOL_DEFAULT: break; @@ -1212,12 +1215,13 @@ unsigned slab_node(struct mempolicy *policy) case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: + case MPOL_BIND: { /* * Follow bind policy behavior and start allocation at the * first node. */ - return zone_to_nid(policy->v.zonelist->zones[0]); + return zonelist_node_idx(policy->v.zonelist->_zonerefs); + } case MPOL_PREFERRED: if (policy->v.preferred_node >= 0) @@ -1323,7 +1327,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = node_zonelist(nid, gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) + if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1463,10 +1467,14 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) return a->v.preferred_node == b->v.preferred_node; case MPOL_BIND: { int i; - for (i = 0; a->v.zonelist->zones[i]; i++) - if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) + for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) { + struct zone *za, *zb; + za = zonelist_zone(&a->v.zonelist->_zonerefs[i]); + zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]); + if (za != zb) return 0; - return b->v.zonelist->zones[i] == NULL; + } + return b->v.zonelist->_zonerefs[i].zone == NULL; } default: BUG(); @@ -1785,12 +1793,12 @@ static void mpol_rebind_policy(struct mempolicy *pol, break; case MPOL_BIND: { nodemask_t nodes; - struct zone **z; + struct zoneref *z; struct zonelist *zonelist; nodes_clear(nodes); - for (z = pol->v.zonelist->zones; *z; z++) - node_set(zone_to_nid(*z), nodes); + for (z = pol->v.zonelist->_zonerefs; z->zone; z++) + node_set(zonelist_node_idx(z), nodes); nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2c93502cfcb..e41504aa5da 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -176,7 +176,7 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, { #ifdef CONFIG_NUMA struct zone *zone; - struct zone **z; + struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); nodemask_t nodes = node_states[N_HIGH_MEMORY]; @@ -462,29 +462,29 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. 
Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zone_oom(struct zonelist *zonelist) +int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) { - struct zone **z; + struct zoneref *z; + struct zone *zone; int ret = 1; - z = zonelist->zones; - spin_lock(&zone_scan_mutex); - do { - if (zone_is_oom_locked(*z)) { + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + if (zone_is_oom_locked(zone)) { ret = 0; goto out; } - } while (*(++z) != NULL); + } + + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + /* + * Lock each zone in the zonelist under zone_scan_mutex so a + * parallel invocation of try_set_zone_oom() doesn't succeed + * when it shouldn't. + */ + zone_set_flag(zone, ZONE_OOM_LOCKED); + } - /* - * Lock each zone in the zonelist under zone_scan_mutex so a parallel - * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. - */ - z = zonelist->zones; - do { - zone_set_flag(*z, ZONE_OOM_LOCKED); - } while (*(++z) != NULL); out: spin_unlock(&zone_scan_mutex); return ret; @@ -495,16 +495,15 @@ out: * allocation attempts with zonelists containing them may now recall the OOM * killer, if necessary. */ -void clear_zonelist_oom(struct zonelist *zonelist) +void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { - struct zone **z; - - z = zonelist->zones; + struct zoneref *z; + struct zone *zone; spin_lock(&zone_scan_mutex); - do { - zone_clear_flag(*z, ZONE_OOM_LOCKED); - } while (*(++z) != NULL); + for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { + zone_clear_flag(zone, ZONE_OOM_LOCKED); + } spin_unlock(&zone_scan_mutex); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ccb8651cf2..6d94d04ea78 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1317,7 +1317,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) * We are low on memory in the second scan, and should leave no stone * unturned looking for a free page. */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, nodemask_t *allowednodes) { struct zonelist_cache *zlc; /* cached zonelist speedup info */ @@ -1328,7 +1328,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, if (!zlc) return 1; - i = z - zonelist->zones; + i = z - zonelist->_zonerefs; n = zlc->z_to_n[i]; /* This zone is worth trying if it is allowed but not full */ @@ -1340,7 +1340,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, * zlc->fullzones, so that subsequent attempts to allocate a page * from that zone don't waste time re-examining it. 
*/ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { struct zonelist_cache *zlc; /* cached zonelist speedup info */ int i; /* index of *z in zonelist zones */ @@ -1349,7 +1349,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) if (!zlc) return; - i = z - zonelist->zones; + i = z - zonelist->_zonerefs; set_bit(i, zlc->fullzones); } @@ -1361,13 +1361,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) return NULL; } -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, nodemask_t *allowednodes) { return 1; } -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } #endif /* CONFIG_NUMA */ @@ -1380,7 +1380,7 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags) { - struct zone **z; + struct zoneref *z; struct page *page = NULL; int classzone_idx; struct zone *zone, *preferred_zone; @@ -1389,8 +1389,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ z = first_zones_zonelist(zonelist, high_zoneidx); - classzone_idx = zone_idx(*z); - preferred_zone = *z; + classzone_idx = zonelist_zone_idx(z); + preferred_zone = zonelist_zone(z); zonelist_scan: /* @@ -1453,7 +1453,8 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, { const gfp_t wait = gfp_mask & __GFP_WAIT; enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zone **z; + struct zoneref *z; + struct zone *zone; struct page *page; struct reclaim_state reclaim_state; struct task_struct *p = current; @@ -1467,9 +1468,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, return NULL; restart: - z = zonelist->zones; /* the list of zones suitable for gfp_mask */ + z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ - if (unlikely(*z == NULL)) { + if (unlikely(!z->zone)) { /* * Happens if we have an empty zonelist as a result of * GFP_THISNODE being used on a memoryless node @@ -1493,8 +1494,8 @@ restart: if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; - for (z = zonelist->zones; *z; z++) - wakeup_kswapd(*z, order); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order); /* * OK, we're below the kswapd watermark and have kicked background @@ -1575,7 +1576,7 @@ nofail_alloc: if (page) goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - if (!try_set_zone_oom(zonelist)) { + if (!try_set_zone_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); goto restart; } @@ -1589,18 +1590,18 @@ nofail_alloc: page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) { - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto got_pg; } /* The OOM killer will not help higher order allocs so fail */ if (order > PAGE_ALLOC_COSTLY_ORDER) { - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto nopage; } out_of_memory(zonelist, gfp_mask, order); - clear_zonelist_oom(zonelist); + clear_zonelist_oom(zonelist, gfp_mask); goto restart; } @@ -1702,7 +1703,7 @@ EXPORT_SYMBOL(free_pages); static unsigned int 
nr_free_zone_pages(int offset) { - struct zone **z; + struct zoneref *z; struct zone *zone; /* Just pick one node, since fallback list is circular */ @@ -1896,7 +1897,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, zone_type--; zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { - zonelist->zones[nr_zones++] = zone; + zoneref_set_zone(zone, + &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } @@ -2072,11 +2074,12 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[0]; - for (j = 0; zonelist->zones[j] != NULL; j++) + for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j, MAX_NR_ZONES - 1); - zonelist->zones[j] = NULL; + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* @@ -2089,7 +2092,8 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) zonelist = &pgdat->node_zonelists[1]; j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); - zonelist->zones[j] = NULL; + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* @@ -2114,12 +2118,14 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; if (populated_zone(z)) { - zonelist->zones[pos++] = z; + zoneref_set_zone(z, + &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); } } } - zonelist->zones[pos] = NULL; + zonelist->_zonerefs[pos].zone = NULL; + zonelist->_zonerefs[pos].zone_idx = 0; } static int default_zonelist_order(void) @@ -2196,7 +2202,8 @@ static void build_zonelists(pg_data_t *pgdat) /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { zonelist = pgdat->node_zonelists + i; - zonelist->zones[0] = NULL; + zonelist->_zonerefs[0].zone = NULL; + zonelist->_zonerefs[0].zone_idx = 0; } /* NUMA-aware ordering of nodes */ @@ -2248,13 +2255,13 @@ static void build_zonelist_cache(pg_data_t *pgdat) { struct zonelist *zonelist; struct zonelist_cache *zlc; - struct zone **z; + struct zoneref *z; zonelist = &pgdat->node_zonelists[0]; zonelist->zlcache_ptr = zlc = &zonelist->zlcache; bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->zones; *z; z++) - zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); + for (z = zonelist->_zonerefs; z->zone; z++) + zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } @@ -2297,7 +2304,8 @@ static void build_zonelists(pg_data_t *pgdat) MAX_NR_ZONES - 1); } - zonelist->zones[j] = NULL; + zonelist->_zonerefs[j].zone = NULL; + zonelist->_zonerefs[j].zone_idx = 0; } /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ diff --git a/mm/slab.c b/mm/slab.c index 29851841da6..7bc4a136846 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3242,7 +3242,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { struct zonelist *zonelist; gfp_t local_flags; - struct zone **z; + struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; diff --git a/mm/slub.c b/mm/slub.c index 80d20cc1c0f..48fff83a1e9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1284,7 +1284,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) { #ifdef CONFIG_NUMA struct zonelist *zonelist; - struct zone **z; + struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); struct page *page; diff --git a/mm/vmscan.c b/mm/vmscan.c index 
0515b8f4489..eceac9f9032 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1251,7 +1251,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, { enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); unsigned long nr_reclaimed = 0; - struct zone **z; + struct zoneref *z; struct zone *zone; sc->all_unreclaimable = 1; @@ -1301,7 +1301,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, * allocation attempt will fail. */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - gfp_t gfp_mask, struct scan_control *sc) + struct scan_control *sc) { int priority; int ret = 0; @@ -1309,9 +1309,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; - struct zone **z; + struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); @@ -1339,7 +1339,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * over limit cgroups */ if (scan_global_lru(sc)) { - shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; @@ -1410,7 +1410,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .isolate_pages = isolate_pages_global, }; - return do_try_to_free_pages(zonelist, gfp_mask, &sc); + return do_try_to_free_pages(zonelist, &sc); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR @@ -1419,7 +1419,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask) { struct scan_control sc = { - .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, @@ -1429,12 +1428,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .isolate_pages = mem_cgroup_isolate_pages, }; struct zonelist *zonelist; - int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[target_zone]; - if (do_try_to_free_pages(zonelist, sc.gfp_mask, &sc)) - return 1; - return 0; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + zonelist = NODE_DATA(numa_node_id())->node_zonelists; + return do_try_to_free_pages(zonelist, &sc); } #endif -- cgit v1.2.3-70-g09d2 From 19770b32609b6bf97a3dece2529089494cbfc549 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Apr 2008 02:12:18 -0700 Subject: mm: filter based on a nodemask as well as a gfp_mask The MPOL_BIND policy creates a zonelist that is used for allocations controlled by that mempolicy. As the per-node zonelist is already being filtered based on a zone id, this patch adds a version of __alloc_pages() that takes a nodemask for further filtering. This eliminates the need for MPOL_BIND to create a custom zonelist. A positive benefit of this is that allocations using MPOL_BIND now use the local node's distance-ordered zonelist instead of a custom node-id-ordered zonelist. I.e., pages will be allocated from the closest allowed node with available memory. 
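A rough sketch of what the new entry point looks like from a caller that already holds an MPOL_BIND-style nodemask; sketch_alloc_from_nodes() is an invented wrapper, the two calls inside it are the interfaces added or reused by this patch.

/* Allocate an order-0 page restricted to the nodes set in 'allowed'. */
static struct page *sketch_alloc_from_nodes(gfp_t gfp_mask, nodemask_t *allowed)
{
	/* Start from the local node's distance-ordered zonelist ... */
	struct zonelist *zonelist = node_zonelist(numa_node_id(), gfp_mask);

	/* ... and let the allocator skip zones on nodes outside 'allowed'. */
	return __alloc_pages_nodemask(gfp_mask, 0, zonelist, allowed);
}

The unfiltered case stays cheap: the plain for_each_zone_zonelist() form simply becomes the NULL-nodemask variant of the new iterator, as the mmzone.h hunk below shows.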
[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments] [Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask] [Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework] Signed-off-by: Mel Gorman Acked-by: Christoph Lameter Signed-off-by: Lee Schermerhorn Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/numa_memory_policy.txt | 11 +- fs/buffer.c | 9 +- include/linux/cpuset.h | 4 +- include/linux/gfp.h | 4 + include/linux/mempolicy.h | 19 ++-- include/linux/mmzone.h | 80 ++++++++------ kernel/cpuset.c | 18 +--- mm/hugetlb.c | 6 +- mm/mempolicy.c | 184 +++++++++++++------------------- mm/mmzone.c | 30 ++++++ mm/page_alloc.c | 50 ++++++--- 11 files changed, 224 insertions(+), 191 deletions(-) (limited to 'kernel') diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt index dd498649799..1278e685d65 100644 --- a/Documentation/vm/numa_memory_policy.txt +++ b/Documentation/vm/numa_memory_policy.txt @@ -182,14 +182,9 @@ Components of Memory Policies The Default mode does not use the optional set of nodes. MPOL_BIND: This mode specifies that memory must come from the - set of nodes specified by the policy. - - The memory policy APIs do not specify an order in which the nodes - will be searched. However, unlike "local allocation", the Bind - policy does not consider the distance between the nodes. Rather, - allocations will fallback to the nodes specified by the policy in - order of numeric node id. Like everything in Linux, this is subject - to change. + set of nodes specified by the policy. Memory will be allocated from + the node in the set with sufficient free memory that is closest to + the node where the allocation takes place. MPOL_PREFERRED: This mode specifies that the allocation should be attempted from the single node specified in the policy. 
If that diff --git a/fs/buffer.c b/fs/buffer.c index ac84cd13075..7d51e649b19 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -360,16 +360,17 @@ void invalidate_bdev(struct block_device *bdev) */ static void free_more_memory(void) { - struct zoneref *zrefs; + struct zone *zone; int nid; wakeup_pdflush(1024); yield(); for_each_online_node(nid) { - zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS)); - if (zrefs->zone) + (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL, + &zone); + if (zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS); } diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 726761e2400..038578362b4 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -26,7 +26,7 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); @@ -103,7 +103,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) static inline void cpuset_init_current_mems_allowed(void) {} static inline void cpuset_update_task_memory_state(void) {} -static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) +static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e1c6064cb6c..898aa9d5b6c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -182,6 +182,10 @@ static inline void arch_alloc_page(struct page *page, int order) { } extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *); +extern struct page * +__alloc_pages_nodemask(gfp_t, unsigned int, + struct zonelist *, nodemask_t *nodemask); + static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 69160dc32d4..b8b3da7a331 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -54,19 +54,20 @@ struct mm_struct; * mmap_sem. * * Freeing policy: - * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. - * All other policies don't have any external state. mpol_free() handles this. + * Mempolicy objects are reference counted. A mempolicy will be freed when + * mpol_free() decrements the reference count to zero. * * Copying policy objects: - * For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this. + * mpol_copy() allocates a new mempolicy and copies the specified mempolicy + * to the new storage. The reference count of the new object is initialized + * to 1, representing the caller of mpol_copy(). 
*/ struct mempolicy { atomic_t refcnt; short policy; /* See MPOL_* above */ union { - struct zonelist *zonelist; /* bind */ short preferred_node; /* preferred */ - nodemask_t nodes; /* interleave */ + nodemask_t nodes; /* interleave/bind */ /* undefined for default */ } v; nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */ @@ -151,7 +152,8 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p); extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, - unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol); + unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask); extern unsigned slab_node(struct mempolicy *policy); extern enum zone_type policy_zone; @@ -239,8 +241,11 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p) } static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, - unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol) + unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { + *mpol = NULL; + *nodemask = NULL; return node_zonelist(0, gfp_flags); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d34b4c29001..498d6ceff2f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -749,36 +749,60 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) #endif /* CONFIG_NUMA */ } -static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) -{ - zoneref->zone = zone; - zoneref->zone_idx = zone_idx(zone); -} +/** + * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point + * @z - The cursor used as a starting point for the search + * @highest_zoneidx - The zone index of the highest zone to return + * @nodes - An optional nodemask to filter the zonelist with + * @zone - The first suitable zone found is returned via this parameter + * + * This function returns the next zone at or below a given zone index that is + * within the allowed nodemask using a cursor as the starting point for the + * search. The zoneref returned is a cursor that is used as the next starting + * point for future calls to next_zones_zonelist(). + */ +struct zoneref *next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone); -/* Returns the first zone at or below highest_zoneidx in a zonelist */ +/** + * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist + * @zonelist - The zonelist to search for a suitable zone + * @highest_zoneidx - The zone index of the highest zone to return + * @nodes - An optional nodemask to filter the zonelist with + * @zone - The first suitable zone found is returned via this parameter + * + * This function returns the first zone at or below a given zone index that is + * within the allowed nodemask. The zoneref returned is a cursor that can be + * used to iterate the zonelist with next_zones_zonelist. The cursor should + * not be used by the caller as it does not match the value of the zone + * returned. 
+ */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, - enum zone_type highest_zoneidx) + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone) { - struct zoneref *z; - - /* Find the first suitable zone to use for the allocation */ - z = zonelist->_zonerefs; - while (zonelist_zone_idx(z) > highest_zoneidx) - z++; - - return z; + return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, + zone); } -/* Returns the next zone at or below highest_zoneidx in a zonelist */ -static inline struct zoneref *next_zones_zonelist(struct zoneref *z, - enum zone_type highest_zoneidx) -{ - /* Find the next suitable zone to use for the allocation */ - while (zonelist_zone_idx(z) > highest_zoneidx) - z++; - - return z; -} +/** + * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask + * @zone - The current zone in the iterator + * @z - The current pointer within zonelist->zones being iterated + * @zlist - The zonelist being iterated + * @highidx - The zone index of the highest zone to return + * @nodemask - Nodemask allowed by the allocator + * + * This iterator iterates though all zones at or below a given zone index and + * within a given nodemask + */ +#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ + for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ + zone; \ + z = next_zones_zonelist(z, highidx, nodemask, &zone)) \ /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index @@ -790,11 +814,7 @@ static inline struct zoneref *next_zones_zonelist(struct zoneref *z, * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ - for (z = first_zones_zonelist(zlist, highidx), \ - zone = zonelist_zone(z++); \ - zone; \ - z = next_zones_zonelist(z, highidx), \ - zone = zonelist_zone(z++)) + for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) #ifdef CONFIG_SPARSEMEM #include diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a220b13cbfa..c9923e3c9a3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1958,22 +1958,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) } /** - * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed - * @zl: the zonelist to be checked + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed + * @nodemask: the nodemask to be checked * - * Are any of the nodes on zonelist zl allowed in current->mems_allowed? + * Are any of the nodes in the nodemask allowed in current->mems_allowed? 
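As a hypothetical in-kernel caller (illustration only, not added by this patch), the new iterator is meant to be used like this, mirroring the hugetlb conversion later in the patch:

/*
 * Hypothetical sketch: find the first zone of the local zonelist, at or
 * below the zone type allowed by @gfp_mask, whose node is set in @nodes.
 */
static struct zone *first_zone_in_nodemask(gfp_t gfp_mask, nodemask_t *nodes)
{
	struct zonelist *zonelist = node_zonelist(numa_node_id(), gfp_mask);
	enum zone_type highidx = gfp_zone(gfp_mask);
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist, highidx, nodes)
		return zone;		/* first zone passing both filters */

	return NULL;			/* every suitable zone was filtered out */
}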
*/ -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { - int i; - - for (i = 0; zl->_zonerefs[i].zone; i++) { - int nid = zonelist_node_idx(&zl->_zonerefs[i]); - - if (node_isset(nid, current->mems_allowed)) - return 1; - } - return 0; + return nodes_intersects(*nodemask, current->mems_allowed); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4bced0d705c..3737d82f522 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, int nid; struct page *page = NULL; struct mempolicy *mpol; + nodemask_t *nodemask; struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol); + htlb_alloc_mask, &mpol, &nodemask); struct zone *zone; struct zoneref *z; - for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && !list_empty(&hugepage_freelists[nid])) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 90193a2a915..acb5ee3587c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) return 0; } -/* Generate a custom zonelist for the BIND policy. */ -static struct zonelist *bind_zonelist(nodemask_t *nodes) +/* Check that the nodemask contains at least one populated zone */ +static int is_valid_nodemask(nodemask_t *nodemask) { - struct zonelist *zl; - int num, max, nd; - enum zone_type k; + int nd, k; - max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - max++; /* space for zlcache_ptr (see mmzone.h) */ - zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); - if (!zl) - return ERR_PTR(-ENOMEM); - zl->zlcache_ptr = NULL; - num = 0; - /* First put in the highest zones from all nodes, then all the next - lower zones etc. Avoid empty zones because the memory allocator - doesn't like them. If you implement node hot removal you - have to fix that. 
*/ - k = MAX_NR_ZONES - 1; - while (1) { - for_each_node_mask(nd, *nodes) { - struct zone *z = &NODE_DATA(nd)->node_zones[k]; - if (z->present_pages > 0) - zoneref_set_zone(z, &zl->_zonerefs[num++]); + /* Check that there is something useful in this mask */ + k = policy_zone; + + for_each_node_mask(nd, *nodemask) { + struct zone *z; + + for (k = 0; k <= policy_zone; k++) { + z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + return 1; } - if (k == 0) - break; - k--; - } - if (num == 0) { - kfree(zl); - return ERR_PTR(-EINVAL); } - zl->_zonerefs[num].zone = NULL; - zl->_zonerefs[num].zone_idx = 0; - return zl; + + return 0; } /* Create a new policy */ @@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) policy->v.preferred_node = -1; break; case MPOL_BIND: - policy->v.zonelist = bind_zonelist(nodes); - if (IS_ERR(policy->v.zonelist)) { - void *error_code = policy->v.zonelist; + if (!is_valid_nodemask(nodes)) { kmem_cache_free(policy_cache, policy); - return error_code; + return ERR_PTR(-EINVAL); } + policy->v.nodes = *nodes; break; } policy->policy = mode; @@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes) /* Fill a zone bitmap for a policy */ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) { - int i; - nodes_clear(*nodes); switch (p->policy) { - case MPOL_BIND: - for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) { - struct zoneref *zref; - zref = &p->v.zonelist->_zonerefs[i]; - node_set(zonelist_node_idx(zref), *nodes); - } - break; case MPOL_DEFAULT: break; + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; @@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task, return pol; } +/* Return a nodemask representing a mempolicy */ +static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy) +{ + /* Lower zones don't get a nodemask applied for MPOL_BIND */ + if (unlikely(policy->policy == MPOL_BIND) && + gfp_zone(gfp) >= policy_zone && + cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) + return &policy->v.nodes; + + return NULL; +} + /* Return a zonelist representing a mempolicy */ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) { @@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) nd = numa_node_id(); break; case MPOL_BIND: - /* Lower zones don't get a policy applied */ - /* Careful: current->mems_allowed might have moved */ - if (gfp_zone(gfp) >= policy_zone) - if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) - return policy->v.zonelist; - /*FALL THROUGH*/ + /* + * Normally, MPOL_BIND allocations node-local are node-local + * within the allowed nodemask. However, if __GFP_THISNODE is + * set and the current node is part of the mask, we use the + * the zonelist for the first node in the mask instead. + */ + nd = numa_node_id(); + if (unlikely(gfp & __GFP_THISNODE) && + unlikely(!node_isset(nd, policy->v.nodes))) + nd = first_node(policy->v.nodes); + break; case MPOL_INTERLEAVE: /* should not happen */ case MPOL_DEFAULT: nd = numa_node_id(); @@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy) * Follow bind policy behavior and start allocation at the * first node. 
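In condensed form (hypothetical helper, not added by this patch, but mirroring the call sites converted below), a mempolicy is now turned into allocator arguments as a (zonelist, nodemask) pair instead of a policy-owned zonelist:

/* Condensed illustration of the new calling pattern inside mm/mempolicy.c. */
static struct page *alloc_page_pol(gfp_t gfp, unsigned int order,
				   struct mempolicy *pol)
{
	struct zonelist *zl = zonelist_policy(gfp, pol);

	/*
	 * For MPOL_BIND at or above policy_zone this returns
	 * &pol->v.nodes; for everything else it returns NULL and the
	 * zonelist alone decides placement.
	 */
	nodemask_t *nodemask = nodemask_policy(gfp, pol);

	return __alloc_pages_nodemask(gfp, order, zl, nodemask);
}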
*/ - return zonelist_node_idx(policy->v.zonelist->_zonerefs); + struct zonelist *zonelist; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + (void)first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes, + &zone); + return zone->node; } case MPOL_PREFERRED: @@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @vma = virtual memory area whose policy is sought * @addr = address in @vma for shared policy lookup and interleave policy * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * @mpol = pointer to mempolicy pointer for reference counted mempolicy + * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a zonelist suitable for a huge page allocation. - * If the effective policy is 'BIND, returns pointer to policy's zonelist. + * If the effective policy is 'BIND, returns pointer to local node's zonelist, + * and a pointer to the mempolicy's @nodemask for filtering the zonelist. * If it is also a policy for which get_vma_policy() returns an extra - * reference, we must hold that reference until after allocation. + * reference, we must hold that reference until after the allocation. * In that case, return policy via @mpol so hugetlb allocation can drop - * the reference. For non-'BIND referenced policies, we can/do drop the + * the reference. For non-'BIND referenced policies, we can/do drop the * reference here, so the caller doesn't need to know about the special case * for default and current task policy. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol) + gfp_t gfp_flags, struct mempolicy **mpol, + nodemask_t **nodemask) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; *mpol = NULL; /* probably no unref needed */ - if (pol->policy == MPOL_INTERLEAVE) { + *nodemask = NULL; /* assume !MPOL_BIND */ + if (pol->policy == MPOL_BIND) { + *nodemask = &pol->v.nodes; + } else if (pol->policy == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); @@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted policy -- shared or vma */ - struct page *page = __alloc_pages(gfp, 0, zl); + struct page *page = __alloc_pages_nodemask(gfp, 0, + zl, nodemask_policy(gfp, pol)); __mpol_free(pol); return page; } /* * fast path: default or task policy */ - return __alloc_pages(gfp, 0, zl); + return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol)); } /** @@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); + return __alloc_pages_nodemask(gfp, order, + zonelist_policy(gfp, pol), nodemask_policy(gfp, pol)); } EXPORT_SYMBOL(alloc_pages_current); @@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) } *new = *old; atomic_set(&new->refcnt, 1); - if (new->policy == MPOL_BIND) { - int sz = ksize(old->v.zonelist); - new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); - if (!new->v.zonelist) { - kmem_cache_free(policy_cache, new); - return ERR_PTR(-ENOMEM); - } - } return new; } @@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct 
mempolicy *b) switch (a->policy) { case MPOL_DEFAULT: return 1; + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; - case MPOL_BIND: { - int i; - for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) { - struct zone *za, *zb; - za = zonelist_zone(&a->v.zonelist->_zonerefs[i]); - zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]); - if (za != zb) - return 0; - } - return b->v.zonelist->_zonerefs[i].zone == NULL; - } default: BUG(); return 0; @@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; - if (p->policy == MPOL_BIND) - kfree(p->v.zonelist); p->policy = MPOL_DEFAULT; kmem_cache_free(policy_cache, p); } @@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, switch (pol->policy) { case MPOL_DEFAULT: break; + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; @@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, *mpolmask, *newmask); *mpolmask = *newmask; break; - case MPOL_BIND: { - nodemask_t nodes; - struct zoneref *z; - struct zonelist *zonelist; - - nodes_clear(nodes); - for (z = pol->v.zonelist->_zonerefs; z->zone; z++) - node_set(zonelist_node_idx(z), nodes); - nodes_remap(tmp, nodes, *mpolmask, *newmask); - nodes = tmp; - - zonelist = bind_zonelist(&nodes); - - /* If no mem, then zonelist is NULL and we keep old zonelist. - * If that old zonelist has no remaining mems_allowed nodes, - * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. - */ - - if (!IS_ERR(zonelist)) { - /* Good - got mem - substitute new zonelist */ - kfree(pol->v.zonelist); - pol->v.zonelist = zonelist; - } - *mpolmask = *newmask; - break; - } default: BUG(); break; @@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) break; case MPOL_BIND: - get_zonemask(pol, &nodes); - break; - + /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; diff --git a/mm/mmzone.c b/mm/mmzone.c index eb5838634f1..486ed595ee6 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone) return zone; } +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) +{ +#ifdef CONFIG_NUMA + return node_isset(zonelist_node_idx(zref), *nodes); +#else + return 1; +#endif /* CONFIG_NUMA */ +} + +/* Returns the next zone at or below highest_zoneidx in a zonelist */ +struct zoneref *next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes, + struct zone **zone) +{ + /* + * Find the next suitable zone to use for the allocation. + * Only filter based on nodemask if it's set + */ + if (likely(nodes == NULL)) + while (zonelist_zone_idx(z) > highest_zoneidx) + z++; + else + while (zonelist_zone_idx(z) > highest_zoneidx || + (z->zone && !zref_in_nodemask(z, nodes))) + z++; + + *zone = zonelist_zone(z++); + return z; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d94d04ea78..b4beb3eea8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) * a page. 
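The cursor behaviour of next_zones_zonelist() can be modelled in a few lines of ordinary C. The following standalone sketch uses simplified stand-in types purely to illustrate the semantics: skip entries that are either above the requested zone index or on a node excluded by the optional mask, report the match, and leave the cursor one past it for the next call.

#include <stdio.h>

/* Simplified stand-ins for struct zoneref and a nodemask. */
struct zref { int zone_idx; int node; int valid; };	/* valid == 0 ends the list */

static struct zref *next_match(struct zref *z, int highest_idx,
			       unsigned long *nodes, int *match_node)
{
	/* Skip zones that are too high or whose node is filtered out. */
	while (z->valid &&
	       (z->zone_idx > highest_idx ||
		(nodes && !(*nodes & (1UL << z->node)))))
		z++;

	*match_node = z->valid ? z->node : -1;
	return z + 1;				/* cursor for the next call */
}

int main(void)
{
	/* Two nodes, zones listed highest index first, as in a zonelist. */
	struct zref list[] = {
		{ 2, 0, 1 }, { 1, 0, 1 }, { 0, 0, 1 },
		{ 2, 1, 1 }, { 1, 1, 1 }, { 0, 1, 1 },
		{ 0, 0, 0 },			/* terminator */
	};
	unsigned long only_node1 = 1UL << 1;
	int node;

	next_match(list, 1, &only_node1, &node);
	printf("first zone with idx <= 1 on an allowed node: node %d\n", node);
	return 0;				/* prints node 1 */
}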
*/ static struct page * -get_page_from_freelist(gfp_t gfp_mask, unsigned int order, +get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags) { struct zoneref *z; @@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - z = first_zones_zonelist(zonelist, high_zoneidx); - classzone_idx = zonelist_zone_idx(z); - preferred_zone = zonelist_zone(z); + (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, + &preferred_zone); + classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) { if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -1447,9 +1448,9 @@ try_next_zone: /* * This is the 'heart' of the zoned buddy allocator. */ -struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) +static struct page * +__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) { const gfp_t wait = gfp_mask & __GFP_WAIT; enum zone_type high_zoneidx = gfp_zone(gfp_mask); @@ -1478,7 +1479,7 @@ restart: return NULL; } - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -1523,7 +1524,7 @@ restart: * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - page = get_page_from_freelist(gfp_mask, order, zonelist, + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags); if (page) goto got_pg; @@ -1536,7 +1537,7 @@ rebalance: if (!(gfp_mask & __GFP_NOMEMALLOC)) { nofail_alloc: /* go through the zonelist yet again, ignoring mins */ - page = get_page_from_freelist(gfp_mask, order, + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); if (page) goto got_pg; @@ -1571,7 +1572,7 @@ nofail_alloc: drain_all_pages(); if (likely(did_some_progress)) { - page = get_page_from_freelist(gfp_mask, order, + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags); if (page) goto got_pg; @@ -1587,8 +1588,9 @@ nofail_alloc: * a parallel oom killing, we must fail if we're still * under heavy pressure. 
*/ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, + order, zonelist, high_zoneidx, + ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) { clear_zonelist_oom(zonelist, gfp_mask); goto got_pg; @@ -1637,6 +1639,20 @@ got_pg: return page; } +struct page * +__alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); +} + +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); +} + EXPORT_SYMBOL(__alloc_pages); /* @@ -1880,6 +1896,12 @@ void show_free_areas(void) show_swap_cache_info(); } +static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) +{ + zoneref->zone = zone; + zoneref->zone_idx = zone_idx(zone); +} + /* * Builds allocation fallback zone lists. * -- cgit v1.2.3-70-g09d2 From 1cdf25d704f7951d02a04064c97db547d6021872 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:44 -0700 Subject: kbuild: create a way to create preprocessor constants from C expressions The use of enums create constants that are not available to the preprocessor when building the kernel (f.e. MAX_NR_ZONES). Arch code already has a way to export constants calculated to the preprocessor through the asm-offsets.c file. Generate something similar for the core kernel through kbuild. Signed-off-by: Sam Ravnborg Signed-off-by: Christoph Lameter Cc: Andy Whitcroft Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Mel Gorman Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Kbuild | 56 ++++++++++++++++++++++++++++++++++++++++++++++++-------- kernel/bounds.c | 19 +++++++++++++++++++ 2 files changed, 67 insertions(+), 8 deletions(-) create mode 100644 kernel/bounds.c (limited to 'kernel') diff --git a/Kbuild b/Kbuild index 1570d248ad9..7136de7b6fc 100644 --- a/Kbuild +++ b/Kbuild @@ -1,19 +1,54 @@ # # Kbuild for top-level directory of the kernel # This file takes care of the following: -# 1) Generate asm-offsets.h -# 2) Check for missing system calls +# 1) Generate bounds.h +# 2) Generate asm-offsets.h (may need bounds.h) +# 3) Check for missing system calls ##### -# 1) Generate asm-offsets.h +# 1) Generate bounds.h + +bounds-file := include/linux/bounds.h + +always := $(bounds-file) +targets := $(bounds-file) kernel/bounds.s + +quiet_cmd_bounds = GEN $@ +define cmd_bounds + (set -e; \ + echo "#ifndef __LINUX_BOUNDS_H__"; \ + echo "#define __LINUX_BOUNDS_H__"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " *"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-y) $<; \ + echo ""; \ + echo "#endif" ) > $@ +endef + +# We use internal kbuild rules to avoid the "is up to date" message from make +kernel/bounds.s: kernel/bounds.c FORCE + $(Q)mkdir -p $(dir $@) + $(call if_changed_dep,cc_s_c) + +$(obj)/$(bounds-file): kernel/bounds.s Kbuild + $(Q)mkdir -p $(dir $@) + $(call cmd,bounds) + +##### +# 2) Generate asm-offsets.h # offsets-file := include/asm-$(SRCARCH)/asm-offsets.h -always := $(offsets-file) -targets := $(offsets-file) +always += $(offsets-file) +targets += $(offsets-file) targets += arch/$(SRCARCH)/kernel/asm-offsets.s -clean-files := $(addprefix $(objtree)/,$(targets)) + # Default sed regexp - multiline 
due to syntax constraints define sed-y @@ -40,7 +75,8 @@ define cmd_offsets endef # We use internal kbuild rules to avoid the "is up to date" message from make -arch/$(SRCARCH)/kernel/asm-offsets.s: arch/$(SRCARCH)/kernel/asm-offsets.c FORCE +arch/$(SRCARCH)/kernel/asm-offsets.s: arch/$(SRCARCH)/kernel/asm-offsets.c \ + $(obj)/$(bounds-file) FORCE $(Q)mkdir -p $(dir $@) $(call if_changed_dep,cc_s_c) @@ -49,7 +85,7 @@ $(obj)/$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s Kbuild $(call cmd,offsets) ##### -# 2) Check for missing system calls +# 3) Check for missing system calls # quiet_cmd_syscalls = CALL $< @@ -58,3 +94,7 @@ quiet_cmd_syscalls = CALL $< PHONY += missing-syscalls missing-syscalls: scripts/checksyscalls.sh FORCE $(call cmd,syscalls) + +# Delete all targets during make clean +clean-files := $(addprefix $(objtree)/,$(targets)) + diff --git a/kernel/bounds.c b/kernel/bounds.c new file mode 100644 index 00000000000..85bb281858c --- /dev/null +++ b/kernel/bounds.c @@ -0,0 +1,19 @@ +/* + * Generate definitions needed by the preprocessor. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#define __GENERATING_BOUNDS_H +/* Include headers that define the enum constants of interest */ + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +void foo(void) +{ + /* The enum constants to put into include/linux/bounds.h */ + /* End of constants */ +} -- cgit v1.2.3-70-g09d2 From 9223b4190fa1297a59f292f3419fc0285321d0ea Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:48 -0700 Subject: pageflags: get rid of FLAGS_RESERVED NR_PAGEFLAGS specifies the number of page flags we are using. From that we can calculate the number of bits leftover that can be used for zone, node (and maybe the sections id). There is no need anymore for FLAGS_RESERVED if we use NR_PAGEFLAGS. Use the new methods to make NR_PAGEFLAGS available via the preprocessor. NR_PAGEFLAGS is used to calculate field boundaries in the page flags fields. These field widths have to be available to the preprocessor. Signed-off-by: Christoph Lameter Cc: David Miller Cc: Andy Whitcroft Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Mel Gorman Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/mm/init.c | 16 ++++++++++++++-- include/linux/mm.h | 6 +++--- include/linux/mmzone.h | 19 ------------------- include/linux/page-flags.h | 19 ++++++++++++------- kernel/bounds.c | 2 ++ 5 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 177d8aaeec4..8c2b50e8abc 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1699,9 +1699,21 @@ void __init paging_init(void) * functions like clear_dcache_dirty_cpu use the cpu mask * in 13-bit signed-immediate instruction fields. */ - BUILD_BUG_ON(FLAGS_RESERVED != 32); + + /* + * Page flags must not reach into upper 32 bits that are used + * for the cpu number + */ + BUILD_BUG_ON(NR_PAGEFLAGS > 32); + + /* + * The bit fields placed in the high range must not reach below + * the 32 bit boundary. Otherwise we cannot place the cpu field + * at the 32 bit boundary. 
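For illustration of the bounds.h mechanism: compiling kernel/bounds.c to assembly makes each DEFINE() emit a marker line such as "->NR_PAGEFLAGS $23 __NR_PAGEFLAGS", which the sed-y rule rewrites into a #define. Once the following patches add DEFINE(NR_PAGEFLAGS, ...) and DEFINE(MAX_NR_ZONES, ...), the generated include/linux/bounds.h looks roughly like this (the numeric values are invented here and depend on the configuration):

#ifndef __LINUX_BOUNDS_H__
#define __LINUX_BOUNDS_H__
/*
 * DO NOT MODIFY.
 *
 * This file was generated by Kbuild
 *
 */

#define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS */
#define MAX_NR_ZONES 4 /* __MAX_NR_ZONES */

#endif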
+ */ BUILD_BUG_ON(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + - ilog2(roundup_pow_of_two(NR_CPUS)) > FLAGS_RESERVED); + ilog2(roundup_pow_of_two(NR_CPUS)) > 32); + BUILD_BUG_ON(NR_CPUS > 4096); kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL; diff --git a/include/linux/mm.h b/include/linux/mm.h index 4f3c1b2f44d..526f810367d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -407,7 +407,7 @@ static inline void set_compound_order(struct page *page, unsigned long order) #define ZONES_WIDTH ZONES_SHIFT -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS #define NODES_WIDTH NODES_SHIFT #else #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -455,8 +455,8 @@ static inline void set_compound_order(struct page *page, unsigned long order) #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED -#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED +#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS +#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS #endif #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0aece6d8937..c7a51dac441 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -820,25 +820,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #include #endif -#if BITS_PER_LONG == 32 -/* - * with 32 bit page->flags field, we reserve 9 bits for node/zone info. - * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes. - */ -#define FLAGS_RESERVED 9 - -#elif BITS_PER_LONG == 64 -/* - * with 64 bit flags field, there's plenty of room. - */ -#define FLAGS_RESERVED 32 - -#else - -#error BITS_PER_LONG not defined - -#endif - #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ !defined(CONFIG_ARCH_POPULATES_NODE_MAP) static inline unsigned long early_pfn_to_nid(unsigned long pfn) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d66971530ca..00e55e23b77 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -6,7 +6,10 @@ #define PAGE_FLAGS_H #include +#ifndef __GENERATING_BOUNDS_H #include +#include +#endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: @@ -59,13 +62,12 @@ * extends from the high bits downwards. * * | FIELD | ... | FLAGS | - * N-1 ^ 0 - * (N-FLAGS_RESERVED) + * N-1 ^ 0 + * (NR_PAGEFLAGS) * - * The fields area is reserved for fields mapping zone, node and SPARSEMEM - * section. The boundry between these two areas is defined by - * FLAGS_RESERVED which defines the width of the fields section - * (see linux/mmzone.h). New flags must _not_ overlap with this area. + * The fields area is reserved for fields mapping zone, node (for NUMA) and + * SPARSEMEM section (for variants of SPARSEMEM that require section ids like + * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. 
*/ @@ -101,9 +103,11 @@ enum pageflags { */ PG_uncached = 31, /* Page has been mapped as uncached */ #endif - NR_PAGEFLAGS + __NR_PAGEFLAGS }; +#ifndef __GENERATING_BOUNDS_H + /* * Manipulation of page state flags */ @@ -304,4 +308,5 @@ static inline void set_page_writeback(struct page *page) test_set_page_writeback(page); } +#endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/kernel/bounds.c b/kernel/bounds.c index 85bb281858c..9ca2bb30243 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -6,6 +6,7 @@ #define __GENERATING_BOUNDS_H /* Include headers that define the enum constants of interest */ +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -15,5 +16,6 @@ void foo(void) { /* The enum constants to put into include/linux/bounds.h */ + DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); /* End of constants */ } -- cgit v1.2.3-70-g09d2 From 97965478a66fbdf0f4ad5e4ecc4828f0cb548a45 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Apr 2008 02:12:54 -0700 Subject: mm: Get rid of __ZONE_COUNT It was used to compensate because MAX_NR_ZONES was not available to the #ifdefs. Export MAX_NR_ZONES via the new mechanism and get rid of __ZONE_COUNT. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 28 +++++++++++----------------- kernel/bounds.c | 2 ++ 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c7a51dac441..c3828497f41 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -3,6 +3,7 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ +#ifndef __GENERATING_BOUNDS_H #include #include @@ -15,6 +16,7 @@ #include #include #include +#include #include #include @@ -129,6 +131,8 @@ struct per_cpu_pageset { #define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) #endif +#endif /* !__GENERATING_BOUNDS.H */ + enum zone_type { #ifdef CONFIG_ZONE_DMA /* @@ -177,9 +181,11 @@ enum zone_type { ZONE_HIGHMEM, #endif ZONE_MOVABLE, - MAX_NR_ZONES + __MAX_NR_ZONES }; +#ifndef __GENERATING_BOUNDS_H + /* * When a memory allocation must conform to specific limitations (such * as being suitable for DMA) the caller will pass in hints to the @@ -188,28 +194,15 @@ enum zone_type { * match the requested limits. See gfp_zone() in include/linux/gfp.h */ -/* - * Count the active zones. Note that the use of defined(X) outside - * #if and family is not necessarily defined so ensure we cannot use - * it later. Use __ZONE_COUNT to work out how many shift bits we need. 
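Returning to the page->flags layout change above, a worked example with assumed numbers: on a 32-bit build with NR_PAGEFLAGS == 20, the zone/node/section fields packed at the top of page->flags may use at most 32 - 20 = 12 bits, which the new preprocessor checks express directly. A standalone sketch of the equivalent arithmetic (all values chosen for illustration):

#include <stdio.h>

#define BITS_PER_LONG	32		/* example: 32-bit build */
#define NR_PAGEFLAGS	20		/* example value from bounds.h */
#define SECTIONS_WIDTH	0		/* !SPARSEMEM in this example */
#define NODES_WIDTH	6
#define ZONES_WIDTH	2

#if SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error fields do not fit above the page flags
#endif

int main(void)
{
	printf("flags use bits 0..%d, fields use the top %d bits, %d bits spare\n",
	       NR_PAGEFLAGS - 1,
	       SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH,
	       BITS_PER_LONG - NR_PAGEFLAGS -
	       (SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH));
	return 0;
}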
- */ -#define __ZONE_COUNT ( \ - defined(CONFIG_ZONE_DMA) \ - + defined(CONFIG_ZONE_DMA32) \ - + 1 \ - + defined(CONFIG_HIGHMEM) \ - + 1 \ -) -#if __ZONE_COUNT < 2 +#if MAX_NR_ZONES < 2 #define ZONES_SHIFT 0 -#elif __ZONE_COUNT <= 2 +#elif MAX_NR_ZONES <= 2 #define ZONES_SHIFT 1 -#elif __ZONE_COUNT <= 4 +#elif MAX_NR_ZONES <= 4 #define ZONES_SHIFT 2 #else #error ZONES_SHIFT -- too many zones configured adjust calculation #endif -#undef __ZONE_COUNT struct zone { /* Fields commonly accessed by the page allocator */ @@ -1008,6 +1001,7 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #define pfn_valid_within(pfn) (1) #endif +#endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/kernel/bounds.c b/kernel/bounds.c index 9ca2bb30243..c3c55544db2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -7,6 +7,7 @@ #define __GENERATING_BOUNDS_H /* Include headers that define the enum constants of interest */ #include +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -17,5 +18,6 @@ void foo(void) { /* The enum constants to put into include/linux/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); + DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); /* End of constants */ } -- cgit v1.2.3-70-g09d2 From 122c7a59055c77434118d7dd4dff4b625d4a2c15 Mon Sep 17 00:00:00 2001 From: Ken'ichi Ohmichi Date: Mon, 28 Apr 2008 02:13:04 -0700 Subject: vmcoreinfo: add page flags values Add some values of page flags to the vmcoreinfo data. The vmcoreinfo data has the minimum debugging information only for dump filtering. makedumpfile (dump filtering command) gets it to distinguish unnecessary pages, and makedumpfile creates a small dumpfile. An old makedumpfile (v1.2.4 or before) had assumed some values of page flags internally, and this implementation could not follow the change of these values. For example, Christoph Lameter is changing these values by the follwing patch: http://lkml.org/lkml/2008/2/29/463 So a new makedumpfile (v1.2.5) came to need these values and I created this patch to let the kernel output them. Signed-off-by: Ken'ichi Ohmichi Cc: Christoph Lameter Cc: "Eric W. Biederman" Acked-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 6782dce93d0..cb85c79989b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1405,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); arch_crash_save_vmcoreinfo(); -- cgit v1.2.3-70-g09d2 From f0be3d32b05d3fea2fcdbbb81a39dac2a7163169 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:08 -0700 Subject: mempolicy: rename mpol_free to mpol_put This is a change that was requested some time ago by Mel Gorman. Makes sense to me, so here it is. Note: I retain the name "mpol_free_shared_policy()" because it actually does free the shared_policy, which is NOT a reference counted object. However, ... The mempolicy object[s] referenced by the shared_policy are reference counted, so mpol_put() is used to release the reference held by the shared_policy. 
The mempolicy might not be freed at this time, because some task attached to the shared object associated with the shared policy may be in the process of allocating a page based on the mempolicy. In that case, the task performing the allocation will hold a reference on the mempolicy, obtained via mpol_shared_policy_lookup(). The mempolicy will be freed when all tasks holding such a reference have called mpol_put() for the mempolicy. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 10 +++++----- kernel/exit.c | 2 +- kernel/fork.c | 2 +- mm/hugetlb.c | 2 +- mm/mempolicy.c | 26 +++++++++++++------------- mm/mmap.c | 6 +++--- mm/shmem.c | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 319fd342b1b..507bf5e29f2 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -71,7 +71,7 @@ struct mm_struct; * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when - * mpol_free() decrements the reference count to zero. + * mpol_put() decrements the reference count to zero. * * Copying policy objects: * mpol_copy() allocates a new mempolicy and copies the specified mempolicy @@ -98,11 +98,11 @@ struct mempolicy { * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ -extern void __mpol_free(struct mempolicy *pol); -static inline void mpol_free(struct mempolicy *pol) +extern void __mpol_put(struct mempolicy *pol); +static inline void mpol_put(struct mempolicy *pol) { if (pol) - __mpol_free(pol); + __mpol_put(pol); } extern struct mempolicy *__mpol_copy(struct mempolicy *pol); @@ -190,7 +190,7 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) return 1; } -static inline void mpol_free(struct mempolicy *p) +static inline void mpol_put(struct mempolicy *p) { } diff --git a/kernel/exit.c b/kernel/exit.c index 97f609f574b..2a9d98c641a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -967,7 +967,7 @@ NORET_TYPE void do_exit(long code) proc_exit_connector(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA - mpol_free(tsk->mempolicy); + mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX diff --git a/kernel/fork.c b/kernel/fork.c index c674aa8d3c3..1a5ae208457 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1374,7 +1374,7 @@ bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA - mpol_free(p->mempolicy); + mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8deae4eb969..53afa8c76ad 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -116,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, break; } } - mpol_free(mpol); /* unref if mpol !NULL */ + mpol_put(mpol); /* unref if mpol !NULL */ return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c1b907789d8..ce2c5b6bf9f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -529,7 +529,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) if (!err) { mpol_get(new); vma->vm_policy = new; - mpol_free(old); + mpol_put(old); } return err; } @@ -595,7 +595,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) return PTR_ERR(new); - 
mpol_free(current->mempolicy); + mpol_put(current->mempolicy); current->mempolicy = new; mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE && @@ -948,7 +948,7 @@ static long do_mbind(unsigned long start, unsigned long len, } up_write(&mm->mmap_sem); - mpol_free(new); + mpol_put(new); return err; } @@ -1446,14 +1446,14 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); if (unlikely(pol != &default_policy && pol != current->mempolicy)) - __mpol_free(pol); /* finished with pol */ + __mpol_put(pol); /* finished with pol */ return node_zonelist(nid, gfp_flags); } zl = zonelist_policy(GFP_HIGHUSER, pol); if (unlikely(pol != &default_policy && pol != current->mempolicy)) { if (pol->policy != MPOL_BIND) - __mpol_free(pol); /* finished with pol */ + __mpol_put(pol); /* finished with pol */ else *mpol = pol; /* unref needed after allocation */ } @@ -1512,7 +1512,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); if (unlikely(pol != &default_policy && pol != current->mempolicy)) - __mpol_free(pol); /* finished with pol */ + __mpol_put(pol); /* finished with pol */ return alloc_page_interleave(gfp, 0, nid); } zl = zonelist_policy(gfp, pol); @@ -1522,7 +1522,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) */ struct page *page = __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol)); - __mpol_free(pol); + __mpol_put(pol); return page; } /* @@ -1624,7 +1624,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) } /* Slow path of a mpol destructor. */ -void __mpol_free(struct mempolicy *p) +void __mpol_put(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; @@ -1720,7 +1720,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_free(n->policy); + mpol_put(n->policy); kmem_cache_free(sn_cache, n); } @@ -1780,7 +1780,7 @@ restart: sp_insert(sp, new); spin_unlock(&sp->lock); if (new2) { - mpol_free(new2->policy); + mpol_put(new2->policy); kmem_cache_free(sn_cache, new2); } return 0; @@ -1805,7 +1805,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy, /* Policy covers entire file */ pvma.vm_end = TASK_SIZE; mpol_set_shared_policy(info, &pvma, newpol); - mpol_free(newpol); + mpol_put(newpol); } } } @@ -1848,7 +1848,7 @@ void mpol_free_shared_policy(struct shared_policy *p) n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); rb_erase(&n->nd, &p->root); - mpol_free(n->policy); + mpol_put(n->policy); kmem_cache_free(sn_cache, n); } spin_unlock(&p->lock); @@ -2068,7 +2068,7 @@ int show_numa_map(struct seq_file *m, void *v) * unref shared or other task's mempolicy */ if (pol != &default_policy && pol != current->mempolicy) - __mpol_free(pol); + __mpol_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/mm/mmap.c b/mm/mmap.c index 6aaf657adb8..36c85e04fa9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -232,7 +232,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) vma->vm_ops->close(vma); if (vma->vm_file) fput(vma->vm_file); - mpol_free(vma_policy(vma)); + mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); return next; } @@ -626,7 +626,7 @@ again: remove_next = 1 + (end > next->vm_end); if (file) fput(file); mm->map_count--; - mpol_free(vma_policy(next)); + 
mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); /* * In mprotect's case 6 (see comments on vma_merge), @@ -1182,7 +1182,7 @@ munmap_back: if (file && vma_merge(mm, prev, addr, vma->vm_end, vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - mpol_free(vma_policy(vma)); + mpol_put(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); fput(file); } else { diff --git a/mm/shmem.c b/mm/shmem.c index 177c7a7d2bb..5326876d814 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1196,7 +1196,7 @@ static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); page = swapin_readahead(entry, gfp, &pvma, 0); - mpol_free(pvma.vm_policy); + mpol_put(pvma.vm_policy); return page; } @@ -1212,7 +1212,7 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); page = alloc_page_vma(gfp, &pvma, 0); - mpol_free(pvma.vm_policy); + mpol_put(pvma.vm_policy); return page; } #else /* !CONFIG_NUMA */ -- cgit v1.2.3-70-g09d2 From 846a16bf0fc80dc95a414ffce465e3cbf9680247 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 28 Apr 2008 02:13:09 -0700 Subject: mempolicy: rename mpol_copy to mpol_dup This patch renames mpol_copy() to mpol_dup() because, well, that's what it does. Like, e.g., strdup() for strings, mpol_dup() takes a pointer to an existing mempolicy, allocates a new one and copies the contents. In a later patch, I want to use the name mpol_copy() to copy the contents from one mempolicy to another like, e.g., strcpy() does for strings. Signed-off-by: Lee Schermerhorn Cc: Christoph Lameter Cc: David Rientjes Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 14 +++++++------- kernel/cpuset.c | 4 ++-- kernel/fork.c | 4 ++-- mm/mempolicy.c | 6 +++--- mm/mmap.c | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 507bf5e29f2..5e19c2275a6 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -73,10 +73,10 @@ struct mm_struct; * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * - * Copying policy objects: - * mpol_copy() allocates a new mempolicy and copies the specified mempolicy + * Duplicating policy objects: + * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized - * to 1, representing the caller of mpol_copy(). + * to 1, representing the caller of mpol_dup(). 
*/ struct mempolicy { atomic_t refcnt; @@ -105,11 +105,11 @@ static inline void mpol_put(struct mempolicy *pol) __mpol_put(pol); } -extern struct mempolicy *__mpol_copy(struct mempolicy *pol); -static inline struct mempolicy *mpol_copy(struct mempolicy *pol) +extern struct mempolicy *__mpol_dup(struct mempolicy *pol); +static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) - pol = __mpol_copy(pol); + pol = __mpol_dup(pol); return pol; } @@ -198,7 +198,7 @@ static inline void mpol_get(struct mempolicy *pol) { } -static inline struct mempolicy *mpol_copy(struct mempolicy *old) +static inline struct mempolicy *mpol_dup(struct mempolicy *old) { return NULL; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c9923e3c9a3..024888bb981 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -941,7 +941,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); - cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ + cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ @@ -992,7 +992,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * rebind the vma mempolicies of each mm in mmarray[] to their * new cpuset, and release that mm. The mpol_rebind_mm() * call takes mmap_sem, which we couldn't take while holding - * tasklist_lock. Forks can happen again now - the mpol_copy() + * tasklist_lock. Forks can happen again now - the mpol_dup() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global * cgroup_mutex, we know that no other rebind effort will diff --git a/kernel/fork.c b/kernel/fork.c index 1a5ae208457..6067e429f28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -279,7 +279,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; - pol = mpol_copy(vma_policy(mpnt)); + pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; @@ -1116,7 +1116,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); + p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ce2c5b6bf9f..e9fc1c1ae66 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1566,15 +1566,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) EXPORT_SYMBOL(alloc_pages_current); /* - * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). 
*/ -/* Slow path of a mempolicy copy */ -struct mempolicy *__mpol_copy(struct mempolicy *old) +/* Slow path of a mempolicy duplicate */ +struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); diff --git a/mm/mmap.c b/mm/mmap.c index 36c85e04fa9..677d184b0d4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1810,7 +1810,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_copy(vma_policy(vma)); + pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { kmem_cache_free(vm_area_cachep, new); return PTR_ERR(pol); @@ -2126,7 +2126,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { *new_vma = *vma; - pol = mpol_copy(vma_policy(vma)); + pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { kmem_cache_free(vm_area_cachep, new_vma); return NULL; -- cgit v1.2.3-70-g09d2 From 3898b1b4ebff8dcfbcf1807e0661585e06c9a91c Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Mon, 28 Apr 2008 02:13:40 -0700 Subject: capabilities: implement per-process securebits Filesystem capability support makes it possible to do away with (set)uid-0 based privilege and use capabilities instead. That is, with filesystem support for capabilities but without this present patch, it is (conceptually) possible to manage a system with capabilities alone and never need to obtain privilege via (set)uid-0. Of course, conceptually isn't quite the same as currently possible since few user applications, certainly not enough to run a viable system, are currently prepared to leverage capabilities to exercise privilege. Further, many applications exist that may never get upgraded in this way, and the kernel will continue to want to support their setuid-0 base privilege needs. Where pure-capability applications evolve and replace setuid-0 binaries, it is desirable that there be a mechanisms by which they can contain their privilege. In addition to leveraging the per-process bounding and inheritable sets, this should include suppressing the privilege of the uid-0 superuser from the process' tree of children. The feature added by this patch can be leveraged to suppress the privilege associated with (set)uid-0. This suppression requires CAP_SETPCAP to initiate, and only immediately affects the 'current' process (it is inherited through fork()/exec()). This reimplementation differs significantly from the historical support for securebits which was system-wide, unwieldy and which has ultimately withered to a dead relic in the source of the modern kernel. With this patch applied a process, that is capable(CAP_SETPCAP), can now drop all legacy privilege (through uid=0) for itself and all subsequently fork()'d/exec()'d children with: prctl(PR_SET_SECUREBITS, 0x2f); This patch represents a no-op unless CONFIG_SECURITY_FILE_CAPABILITIES is enabled at configure time. [akpm@linux-foundation.org: fix uninitialised var warning] [serue@us.ibm.com: capabilities: use cap_task_prctl when !CONFIG_SECURITY] Signed-off-by: Andrew G. Morgan Acked-by: Serge Hallyn Reviewed-by: James Morris Cc: Stephen Smalley Cc: Paul Moore Signed-off-by: Serge E. 
Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/capability.h | 3 +- include/linux/init_task.h | 3 +- include/linux/prctl.h | 9 +++- include/linux/sched.h | 3 +- include/linux/securebits.h | 25 ++++++++--- include/linux/security.h | 16 ++++--- kernel/sys.c | 27 +----------- security/capability.c | 1 + security/commoncap.c | 103 +++++++++++++++++++++++++++++++++++++++++---- security/dummy.c | 2 +- security/security.c | 4 +- security/selinux/hooks.c | 5 ++- 12 files changed, 141 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 7d50ff6d269..eaab759b146 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -155,6 +155,7 @@ typedef struct kernel_cap_struct { * Add any capability from current's capability bounding set * to the current process' inheritable set * Allow taking bits out of capability bounding set + * Allow modification of the securebits for a process */ #define CAP_SETPCAP 8 @@ -490,8 +491,6 @@ extern const kernel_cap_t __cap_init_eff_set; int capable(int cap); int __capable(struct task_struct *t, int cap); -extern long cap_prctl_drop(unsigned long cap); - #endif /* __KERNEL__ */ #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 37a6f5bc4a9..bf6b8a61f8d 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #define INIT_FDTABLE \ @@ -172,7 +173,7 @@ extern struct group_info init_groups; .cap_inheritable = CAP_INIT_INH_SET, \ .cap_permitted = CAP_FULL_SET, \ .cap_bset = CAP_INIT_BSET, \ - .keep_capabilities = 0, \ + .securebits = SECUREBITS_DEFAULT, \ .user = INIT_USER, \ .comm = "swapper", \ .thread = INIT_THREAD, \ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 5c80b193963..5ad79198d6f 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -16,7 +16,8 @@ # define PR_UNALIGN_NOPRINT 1 /* silently fix up unaligned user accesses */ # define PR_UNALIGN_SIGBUS 2 /* generate SIGBUS on unaligned user access */ -/* Get/set whether or not to drop capabilities on setuid() away from uid 0 */ +/* Get/set whether or not to drop capabilities on setuid() away from + * uid 0 (as per security/commoncap.c) */ #define PR_GET_KEEPCAPS 7 #define PR_SET_KEEPCAPS 8 @@ -63,7 +64,7 @@ #define PR_GET_SECCOMP 21 #define PR_SET_SECCOMP 22 -/* Get/set the capability bounding set */ +/* Get/set the capability bounding set (as per security/commoncap.c) */ #define PR_CAPBSET_READ 23 #define PR_CAPBSET_DROP 24 @@ -73,4 +74,8 @@ # define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */ # define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */ +/* Get/set securebits (as per security/commoncap.c) */ +#define PR_GET_SECUREBITS 27 +#define PR_SET_SECUREBITS 28 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9a4f3e63e3b..024d72b47a0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -1133,7 +1132,7 @@ struct task_struct { gid_t gid,egid,sgid,fsgid; struct group_info *group_info; kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - unsigned keep_capabilities:1; + unsigned securebits; struct user_struct *user; #ifdef CONFIG_KEYS struct key *request_key_auth; /* assumed request_key authority */ diff --git 
a/include/linux/securebits.h b/include/linux/securebits.h index 5b0617840fa..c1f19dbceb0 100644 --- a/include/linux/securebits.h +++ b/include/linux/securebits.h @@ -3,28 +3,39 @@ #define SECUREBITS_DEFAULT 0x00000000 -extern unsigned securebits; - /* When set UID 0 has no special privileges. When unset, we support inheritance of root-permissions and suid-root executable under compatibility mode. We raise the effective and inheritable bitmasks *of the executable file* if the effective uid of the new process is 0. If the real uid is 0, we raise the inheritable bitmask of the executable file. */ -#define SECURE_NOROOT 0 +#define SECURE_NOROOT 0 +#define SECURE_NOROOT_LOCKED 1 /* make bit-0 immutable */ /* When set, setuid to/from uid 0 does not trigger capability-"fixes" to be compatible with old programs relying on set*uid to loose privileges. When unset, setuid doesn't change privileges. */ -#define SECURE_NO_SETUID_FIXUP 2 +#define SECURE_NO_SETUID_FIXUP 2 +#define SECURE_NO_SETUID_FIXUP_LOCKED 3 /* make bit-2 immutable */ + +/* When set, a process can retain its capabilities even after + transitioning to a non-root user (the set-uid fixup suppressed by + bit 2). Bit-4 is cleared when a process calls exec(); setting both + bit 4 and 5 will create a barrier through exec that no exec()'d + child can use this feature again. */ +#define SECURE_KEEP_CAPS 4 +#define SECURE_KEEP_CAPS_LOCKED 5 /* make bit-4 immutable */ /* Each securesetting is implemented using two bits. One bit specify whether the setting is on or off. The other bit specify whether the setting is fixed or not. A setting which is fixed cannot be changed from user-level. */ +#define issecure_mask(X) (1 << (X)) +#define issecure(X) (issecure_mask(X) & current->securebits) -#define issecure(X) ( (1 << (X+1)) & SECUREBITS_DEFAULT ? \ - (1 << (X)) & SECUREBITS_DEFAULT : \ - (1 << (X)) & securebits ) +#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ + issecure_mask(SECURE_NO_SETUID_FIXUP) | \ + issecure_mask(SECURE_KEEP_CAPS)) +#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) #endif /* !_LINUX_SECUREBITS_H */ diff --git a/include/linux/security.h b/include/linux/security.h index 53a34539382..e6299e50e21 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -34,8 +34,6 @@ #include #include -extern unsigned securebits; - /* Maximum number of letters for an LSM name string */ #define SECURITY_NAME_MAX 10 @@ -61,6 +59,8 @@ extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); extern int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); extern void cap_task_reparent_to_init (struct task_struct *p); +extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, long *rc_p); extern int cap_task_setscheduler (struct task_struct *p, int policy, struct sched_param *lp); extern int cap_task_setioprio (struct task_struct *p, int ioprio); extern int cap_task_setnice (struct task_struct *p, int nice); @@ -720,7 +720,9 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @arg3 contains a argument. * @arg4 contains a argument. * @arg5 contains a argument. - * Return 0 if permission is granted. + * @rc_p contains a pointer to communicate back the forced return code + * Return 0 if permission is granted, and non-zero if the security module + * has taken responsibility (setting *rc_p) for the prctl call. 
* @task_reparent_to_init: * Set the security attributes in @p->security for a kernel thread that * is being reparented to the init task. @@ -1420,7 +1422,7 @@ struct security_operations { int (*task_wait) (struct task_struct * p); int (*task_prctl) (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5); + unsigned long arg5, long *rc_p); void (*task_reparent_to_init) (struct task_struct * p); void (*task_to_inode)(struct task_struct *p, struct inode *inode); @@ -1684,7 +1686,7 @@ int security_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid); int security_task_wait(struct task_struct *p); int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5); + unsigned long arg4, unsigned long arg5, long *rc_p); void security_task_reparent_to_init(struct task_struct *p); void security_task_to_inode(struct task_struct *p, struct inode *inode); int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag); @@ -2271,9 +2273,9 @@ static inline int security_task_wait (struct task_struct *p) static inline int security_task_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5) + unsigned long arg5, long *rc_p) { - return 0; + return cap_task_prctl(option, arg2, arg3, arg3, arg5, rc_p); } static inline void security_task_reparent_to_init (struct task_struct *p) diff --git a/kernel/sys.c b/kernel/sys.c index 6a0cc71ee88..f2a45136695 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1632,10 +1632,9 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long error; + long uninitialized_var(error); - error = security_task_prctl(option, arg2, arg3, arg4, arg5); - if (error) + if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; switch (option) { @@ -1688,17 +1687,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = -EINVAL; break; - case PR_GET_KEEPCAPS: - if (current->keep_capabilities) - error = 1; - break; - case PR_SET_KEEPCAPS: - if (arg2 != 0 && arg2 != 1) { - error = -EINVAL; - break; - } - current->keep_capabilities = arg2; - break; case PR_SET_NAME: { struct task_struct *me = current; unsigned char ncomm[sizeof(me->comm)]; @@ -1732,17 +1720,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2); break; - - case PR_CAPBSET_READ: - if (!cap_valid(arg2)) - return -EINVAL; - return !!cap_raised(current->cap_bset, arg2); - case PR_CAPBSET_DROP: -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES - return cap_prctl_drop(arg2); -#else - return -EINVAL; -#endif case PR_GET_TSC: error = GET_TSC_CTL(arg2); break; diff --git a/security/capability.c b/security/capability.c index 2c6e06d18fa..38ac54e3aed 100644 --- a/security/capability.c +++ b/security/capability.c @@ -44,6 +44,7 @@ static struct security_operations capability_ops = { .task_setioprio = cap_task_setioprio, .task_setnice = cap_task_setnice, .task_post_setuid = cap_task_post_setuid, + .task_prctl = cap_task_prctl, .task_reparent_to_init = cap_task_reparent_to_init, .syslog = cap_syslog, diff --git a/security/commoncap.c b/security/commoncap.c index 852905789ca..e8c3f5e4670 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -24,11 +24,8 @@ #include #include #include - -/* Global security state */ - -unsigned securebits = 
SECUREBITS_DEFAULT; /* systemwide security settings */ -EXPORT_SYMBOL(securebits); +#include +#include int cap_netlink_send(struct sock *sk, struct sk_buff *skb) { @@ -368,7 +365,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe) /* AUD: Audit candidate if current->cap_effective is set */ - current->keep_capabilities = 0; + current->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); } int cap_bprm_secureexec (struct linux_binprm *bprm) @@ -448,7 +445,7 @@ static inline void cap_emulate_setxuid (int old_ruid, int old_euid, { if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) && (current->uid != 0 && current->euid != 0 && current->suid != 0) && - !current->keep_capabilities) { + !issecure(SECURE_KEEP_CAPS)) { cap_clear (current->cap_permitted); cap_clear (current->cap_effective); } @@ -547,7 +544,7 @@ int cap_task_setnice (struct task_struct *p, int nice) * this task could get inconsistent info. There can be no * racing writer bc a task can only change its own caps. */ -long cap_prctl_drop(unsigned long cap) +static long cap_prctl_drop(unsigned long cap) { if (!capable(CAP_SETPCAP)) return -EPERM; @@ -556,6 +553,7 @@ long cap_prctl_drop(unsigned long cap) cap_lower(current->cap_bset, cap); return 0; } + #else int cap_task_setscheduler (struct task_struct *p, int policy, struct sched_param *lp) @@ -572,12 +570,99 @@ int cap_task_setnice (struct task_struct *p, int nice) } #endif +int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5, long *rc_p) +{ + long error = 0; + + switch (option) { + case PR_CAPBSET_READ: + if (!cap_valid(arg2)) + error = -EINVAL; + else + error = !!cap_raised(current->cap_bset, arg2); + break; +#ifdef CONFIG_SECURITY_FILE_CAPABILITIES + case PR_CAPBSET_DROP: + error = cap_prctl_drop(arg2); + break; + + /* + * The next four prctl's remain to assist with transitioning a + * system from legacy UID=0 based privilege (when filesystem + * capabilities are not in use) to a system using filesystem + * capabilities only - as the POSIX.1e draft intended. + * + * Note: + * + * PR_SET_SECUREBITS = + * issecure_mask(SECURE_KEEP_CAPS_LOCKED) + * | issecure_mask(SECURE_NOROOT) + * | issecure_mask(SECURE_NOROOT_LOCKED) + * | issecure_mask(SECURE_NO_SETUID_FIXUP) + * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED) + * + * will ensure that the current process and all of its + * children will be locked into a pure + * capability-based-privilege environment. 
+ */ + case PR_SET_SECUREBITS: + if ((((current->securebits & SECURE_ALL_LOCKS) >> 1) + & (current->securebits ^ arg2)) /*[1]*/ + || ((current->securebits & SECURE_ALL_LOCKS + & ~arg2)) /*[2]*/ + || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ + || (cap_capable(current, CAP_SETPCAP) != 0)) { /*[4]*/ + /* + * [1] no changing of bits that are locked + * [2] no unlocking of locks + * [3] no setting of unsupported bits + * [4] doing anything requires privilege (go read about + * the "sendmail capabilities bug") + */ + error = -EPERM; /* cannot change a locked bit */ + } else { + current->securebits = arg2; + } + break; + case PR_GET_SECUREBITS: + error = current->securebits; + break; + +#endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */ + + case PR_GET_KEEPCAPS: + if (issecure(SECURE_KEEP_CAPS)) + error = 1; + break; + case PR_SET_KEEPCAPS: + if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ + error = -EINVAL; + else if (issecure(SECURE_KEEP_CAPS_LOCKED)) + error = -EPERM; + else if (arg2) + current->securebits |= issecure_mask(SECURE_KEEP_CAPS); + else + current->securebits &= + ~issecure_mask(SECURE_KEEP_CAPS); + break; + + default: + /* No functionality available - continue with default */ + return 0; + } + + /* Functionality provided */ + *rc_p = error; + return 1; +} + void cap_task_reparent_to_init (struct task_struct *p) { cap_set_init_eff(p->cap_effective); cap_clear(p->cap_inheritable); cap_set_full(p->cap_permitted); - p->keep_capabilities = 0; + p->securebits = SECUREBITS_DEFAULT; return; } diff --git a/security/dummy.c b/security/dummy.c index b0232bbf427..58d4dd1af5c 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -604,7 +604,7 @@ static int dummy_task_kill (struct task_struct *p, struct siginfo *info, } static int dummy_task_prctl (int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5) + unsigned long arg4, unsigned long arg5, long *rc_p) { return 0; } diff --git a/security/security.c b/security/security.c index 8a285c7b996..d5cb5898d96 100644 --- a/security/security.c +++ b/security/security.c @@ -733,9 +733,9 @@ int security_task_wait(struct task_struct *p) } int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5) + unsigned long arg4, unsigned long arg5, long *rc_p) { - return security_ops->task_prctl(option, arg2, arg3, arg4, arg5); + return security_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p); } void security_task_reparent_to_init(struct task_struct *p) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 308e2cf17d7..04acb5af831 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3303,12 +3303,13 @@ static int selinux_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, - unsigned long arg5) + unsigned long arg5, + long *rc_p) { /* The current prctl operations do not appear to require any SELinux controls since they merely observe or modify the state of the current process. */ - return 0; + return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p); } static int selinux_task_wait(struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 3d8d996e0ca5b4093203d3f050b0f70b5c949ae8 Mon Sep 17 00:00:00 2001 From: Srinivasa Ds Date: Mon, 28 Apr 2008 02:14:26 -0700 Subject: kprobes: prevent probing of preempt_schedule() Prohibit users from probing preempt_schedule(). One way of prohibiting the user from probing functions is by marking such functions with __kprobes. 
But this method doesn't work for those functions, which are already marked to different section like preempt_schedule() (belongs to __sched section). So we use blacklist approach to refuse user from probing these functions. In blacklist approach we populate the blacklisted function's starting address and its size in kprobe_blacklist structure. Then we verify the user specified address against start and end of the blacklisted function. So any attempt to register probe on blacklisted functions will be rejected. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Srinivasa DS Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Jim Keniston Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 7 +++++++ kernel/kprobes.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0f28486f636..cd507ab4fed 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -173,6 +173,13 @@ struct kretprobe_blackpoint { const char *name; void *addr; }; + +struct kprobe_blackpoint { + const char *name; + unsigned long start_addr; + unsigned long range; +}; + extern struct kretprobe_blackpoint kretprobe_blacklist[]; static inline void kretprobe_assert(struct kretprobe_instance *ri, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fcfb580c3af..f02a4311768 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +/* + * Normally, functions that we'd want to prohibit kprobes in, are marked + * __kprobes. But, there are cases where such functions already belong to + * a different section (__sched for preempt_schedule) + * + * For such cases, we now have a blacklist + */ +struct kprobe_blackpoint kprobe_blacklist[] = { + {"preempt_schedule",}, + {NULL} /* Terminator */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -492,9 +504,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { + struct kprobe_blackpoint *kb; + if (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) return -EINVAL; + /* + * If there exists a kprobe_blacklist, verify and + * fail any probe registration in the prohibited area + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + if (kb->start_addr) { + if (addr >= kb->start_addr && + addr < (kb->start_addr + kb->range)) + return -EINVAL; + } + } return 0; } @@ -811,6 +836,11 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) static int __init init_kprobes(void) { int i, err = 0; + unsigned long offset = 0, size = 0; + char *modname, namebuf[128]; + const char *symbol_name; + void *addr; + struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -819,6 +849,28 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + /* + * Lookup and populate the kprobe_blacklist. 
+ * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + kprobe_lookup_name(kb->name, addr); + if (!addr) + continue; + + kb->start_addr = (unsigned long)addr; + symbol_name = kallsyms_lookup(kb->start_addr, + &size, &offset, &modname, namebuf); + if (!symbol_name) + kb->range = 0; + else + kb->range = size; + } + if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { -- cgit v1.2.3-70-g09d2 From 9861668f747895608cea425f8457989d8dd2edf2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 28 Apr 2008 02:14:28 -0700 Subject: kprobes: add (un)register_kprobes for batch registration Introduce unregister_/register_kprobes() for kprobe batch registration. This can reduce waiting time for synchronized_sched() when a lot of probes have to be unregistered at once. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Prasanna S Panchamukhi Cc: Shaohua Li Cc: David Miller Cc: "Frank Ch. Eigler" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 9 ++++ kernel/kprobes.c | 124 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 96 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index cd507ab4fed..2ba7df645a8 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -234,6 +234,8 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void) int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); +int register_kprobes(struct kprobe **kps, int num); +void unregister_kprobes(struct kprobe **kps, int num); int setjmp_pre_handler(struct kprobe *, struct pt_regs *); int longjmp_break_handler(struct kprobe *, struct pt_regs *); int register_jprobe(struct jprobe *p); @@ -261,9 +263,16 @@ static inline int register_kprobe(struct kprobe *p) { return -ENOSYS; } +static inline int register_kprobes(struct kprobe **kps, int num) +{ + return -ENOSYS; +} static inline void unregister_kprobe(struct kprobe *p) { } +static inline void unregister_kprobes(struct kprobe **kps, int num) +{ +} static inline int register_jprobe(struct jprobe *p) { return -ENOSYS; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f02a4311768..76275fc025a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -580,6 +580,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, } p->nmissed = 0; + INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -606,35 +607,28 @@ out: return ret; } -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_kprobe(struct kprobe *p) +/* + * Unregister a kprobe without a scheduler synchronization. 
+ */ +static int __kprobes __unregister_kprobe_top(struct kprobe *p) { - struct module *mod; struct kprobe *old_p, *list_p; - int cleanup_p; - mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) { - mutex_unlock(&kprobe_mutex); - return; - } + if (unlikely(!old_p)) + return -EINVAL; + if (p != old_p) { list_for_each_entry_rcu(list_p, &old_p->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid_p; - mutex_unlock(&kprobe_mutex); - return; + return -EINVAL; } valid_p: if (old_p == p || (old_p->pre_handler == aggr_pre_handler && - p->list.next == &old_p->list && p->list.prev == &old_p->list)) { + list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are * enabled - otherwise, the breakpoint would already have @@ -643,43 +637,97 @@ valid_p: if (kprobe_enabled) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); - cleanup_p = 1; } else { + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler) { + list_for_each_entry_rcu(list_p, &old_p->list, list) { + if ((list_p != p) && (list_p->post_handler)) + goto noclean; + } + old_p->post_handler = NULL; + } +noclean: list_del_rcu(&p->list); - cleanup_p = 0; } + return 0; +} - mutex_unlock(&kprobe_mutex); +static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) +{ + struct module *mod; + struct kprobe *old_p; - synchronize_sched(); if (p->mod_refcounted) { mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); } - if (cleanup_p) { - if (p != old_p) { - list_del_rcu(&p->list); + if (list_empty(&p->list) || list_is_singular(&p->list)) { + if (!list_empty(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); kfree(old_p); } arch_remove_kprobe(p); - } else { - mutex_lock(&kprobe_mutex); - if (p->break_handler) - old_p->break_handler = NULL; - if (p->post_handler){ - list_for_each_entry_rcu(list_p, &old_p->list, list){ - if (list_p->post_handler){ - cleanup_p = 2; - break; - } - } - if (cleanup_p == 0) - old_p->post_handler = NULL; + } +} + +static int __register_kprobes(struct kprobe **kps, int num, + unsigned long called_from) +{ + int i, ret = 0; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kprobe(kps[i], called_from); + if (ret < 0 && i > 0) { + unregister_kprobes(kps, i); + break; } - mutex_unlock(&kprobe_mutex); } + return ret; +} + +/* + * Registration and unregistration functions for kprobe. 
+ */ +int __kprobes register_kprobe(struct kprobe *p) +{ + return __register_kprobes(&p, 1, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobe(struct kprobe *p) +{ + unregister_kprobes(&p, 1); +} + +int __kprobes register_kprobes(struct kprobe **kps, int num) +{ + return __register_kprobes(kps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobes(struct kprobe **kps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(kps[i]) < 0) + kps[i]->addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) + if (kps[i]->addr) + __unregister_kprobe_bottom(kps[i]); } static struct notifier_block kprobe_exceptions_nb = { @@ -1118,6 +1166,8 @@ module_init(init_kprobes); EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); +EXPORT_SYMBOL_GPL(register_kprobes); +EXPORT_SYMBOL_GPL(unregister_kprobes); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); #ifdef CONFIG_KPROBES -- cgit v1.2.3-70-g09d2 From 4a296e07c3a410c09b9155da4c2fa84a07964f38 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 28 Apr 2008 02:14:29 -0700 Subject: kprobes: add (un)register_kretprobes for batch registration Introduce unregister_/register_kretprobes() for kretprobe batch registration. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Prasanna S Panchamukhi Cc: Shaohua Li Cc: David Miller Cc: "Frank Ch. Eigler" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 9 ++++ kernel/kprobes.c | 108 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 97 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 2ba7df645a8..94c855a236a 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -245,6 +245,8 @@ unsigned long arch_deref_entry_point(void *); int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); +int register_kretprobes(struct kretprobe **rps, int num); +void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); @@ -287,9 +289,16 @@ static inline int register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } +static inline int register_kretprobes(struct kretprobe **rps, int num) +{ + return -ENOSYS; +} static inline void unregister_kretprobe(struct kretprobe *rp) { } +static inline void unregister_kretprobes(struct kretprobe **rps, int num) +{ +} static inline void kprobe_flush_task(struct task_struct *tk) { } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 76275fc025a..5e3144ad9b6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -429,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp) } } +static void __kprobes cleanup_rp_inst(struct kretprobe *rp) +{ + unsigned long flags; + struct kretprobe_instance *ri; + struct hlist_node *pos, *next; + /* No race here */ + spin_lock_irqsave(&kretprobe_lock, flags); + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { + ri->rp = NULL; + hlist_del(&ri->uflist); + } + spin_unlock_irqrestore(&kretprobe_lock, flags); + free_rp_inst(rp); +} + /* * Keep all fields in the kprobe consistent */ @@ -798,7 +813,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, 
return 0; } -int __kprobes register_kretprobe(struct kretprobe *rp) +static int __kprobes __register_kretprobe(struct kretprobe *rp, + unsigned long called_from) { int ret = 0; struct kretprobe_instance *inst; @@ -844,43 +860,93 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->nmissed = 0; /* Establish function entry probe point */ - if ((ret = __register_kprobe(&rp->kp, - (unsigned long)__builtin_return_address(0))) != 0) + ret = __register_kprobe(&rp->kp, called_from); + if (ret != 0) free_rp_inst(rp); return ret; } +static int __register_kretprobes(struct kretprobe **rps, int num, + unsigned long called_from) +{ + int ret = 0, i; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kretprobe(rps[i], called_from); + if (ret < 0 && i > 0) { + unregister_kretprobes(rps, i); + break; + } + } + return ret; +} + +int __kprobes register_kretprobe(struct kretprobe *rp) +{ + return __register_kretprobes(&rp, 1, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kretprobe(struct kretprobe *rp) +{ + unregister_kretprobes(&rp, 1); +} + +int __kprobes register_kretprobes(struct kretprobe **rps, int num) +{ + return __register_kretprobes(rps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&rps[i]->kp) < 0) + rps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) { + if (rps[i]->kp.addr) { + __unregister_kprobe_bottom(&rps[i]->kp); + cleanup_rp_inst(rps[i]); + } + } +} + #else /* CONFIG_KRETPROBES */ int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { - return 0; + return -ENOSYS; } -#endif /* CONFIG_KRETPROBES */ - void __kprobes unregister_kretprobe(struct kretprobe *rp) { - unsigned long flags; - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; +} - unregister_kprobe(&rp->kp); +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +{ +} - /* No race here */ - spin_lock_irqsave(&kretprobe_lock, flags); - hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { - ri->rp = NULL; - hlist_del(&ri->uflist); - } - spin_unlock_irqrestore(&kretprobe_lock, flags); - free_rp_inst(rp); +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; } +#endif /* CONFIG_KRETPROBES */ + static int __init init_kprobes(void) { int i, err = 0; @@ -1177,4 +1243,6 @@ EXPORT_SYMBOL_GPL(jprobe_return); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); +EXPORT_SYMBOL_GPL(register_kretprobes); +EXPORT_SYMBOL_GPL(unregister_kretprobes); #endif -- cgit v1.2.3-70-g09d2 From 26b31c1908e02a316edfba08080373342e662c14 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 28 Apr 2008 02:14:29 -0700 Subject: kprobes: add (un)register_jprobes for batch registration Introduce unregister_/register_jprobes() for jprobe batch registration. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Prasanna S Panchamukhi Cc: Shaohua Li Cc: David Miller Cc: "Frank Ch. 
Eigler" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 9 +++++++ kernel/kprobes.c | 65 ++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 94c855a236a..1036631ff4f 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -240,6 +240,8 @@ int setjmp_pre_handler(struct kprobe *, struct pt_regs *); int longjmp_break_handler(struct kprobe *, struct pt_regs *); int register_jprobe(struct jprobe *p); void unregister_jprobe(struct jprobe *p); +int register_jprobes(struct jprobe **jps, int num); +void unregister_jprobes(struct jprobe **jps, int num); void jprobe_return(void); unsigned long arch_deref_entry_point(void *); @@ -279,9 +281,16 @@ static inline int register_jprobe(struct jprobe *p) { return -ENOSYS; } +static inline int register_jprobes(struct jprobe **jps, int num) +{ + return -ENOSYS; +} static inline void unregister_jprobe(struct jprobe *p) { } +static inline void unregister_jprobes(struct jprobe **jps, int num) +{ +} static inline void jprobe_return(void) { } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5e3144ad9b6..1e0250cb948 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -755,24 +755,69 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -int __kprobes register_jprobe(struct jprobe *jp) +static int __register_jprobes(struct jprobe **jps, int num, + unsigned long called_from) { - unsigned long addr = arch_deref_entry_point(jp->entry); + struct jprobe *jp; + int ret = 0, i; - if (!kernel_text_address(addr)) + if (num <= 0) return -EINVAL; + for (i = 0; i < num; i++) { + unsigned long addr; + jp = jps[i]; + addr = arch_deref_entry_point(jp->entry); + + if (!kernel_text_address(addr)) + ret = -EINVAL; + else { + /* Todo: Verify probepoint is a function entry point */ + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + ret = __register_kprobe(&jp->kp, called_from); + } + if (ret < 0 && i > 0) { + unregister_jprobes(jps, i); + break; + } + } + return ret; +} - /* Todo: Verify probepoint is a function entry point */ - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - - return __register_kprobe(&jp->kp, +int __kprobes register_jprobe(struct jprobe *jp) +{ + return __register_jprobes(&jp, 1, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_jprobe(struct jprobe *jp) { - unregister_kprobe(&jp->kp); + unregister_jprobes(&jp, 1); +} + +int __kprobes register_jprobes(struct jprobe **jps, int num) +{ + return __register_jprobes(jps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_jprobes(struct jprobe **jps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&jps[i]->kp) < 0) + jps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) { + if (jps[i]->kp.addr) + __unregister_kprobe_bottom(&jps[i]->kp); + } } #ifdef CONFIG_KRETPROBES @@ -1236,6 +1281,8 @@ EXPORT_SYMBOL_GPL(register_kprobes); EXPORT_SYMBOL_GPL(unregister_kprobes); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); +EXPORT_SYMBOL_GPL(register_jprobes); +EXPORT_SYMBOL_GPL(unregister_jprobes); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); #endif -- cgit v1.2.3-70-g09d2 From 
b6f448e99ce7955b9707ed36a46cab2c6ddf7ddc Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 28 Apr 2008 02:15:03 -0700 Subject: PM/gxfb: add hook to PM console layer that allows disabling of suspend VT switch Prior to suspend, we allocate and switch to a new VT; after suspend, we switch back to the original VT. This can be slow, and is completely unnecessary if the framebuffer we're using can restore video properly. This adds a hook that allows drivers to select whether or not to do this vt switch, and changes the gxfb driver to call this hook. It also adds a module param to gxfb to allow controlling of the vt switch (defaulting to no switch). (Note: I'm not convinced that console_sem is the best way to protect this, but we should probably have some form of locking..) [akpm@linux-foundation.org: build fix] Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: "Antonino A. Daplas" Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/fb/gxfb.txt | 3 ++- drivers/video/geode/gxfb_core.c | 7 +++++++ include/linux/suspend.h | 15 +++++++++++++-- kernel/power/console.c | 27 ++++++++++++++++++++++++++- 4 files changed, 48 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/fb/gxfb.txt b/Documentation/fb/gxfb.txt index b5609614201..2f640903bbb 100644 --- a/Documentation/fb/gxfb.txt +++ b/Documentation/fb/gxfb.txt @@ -45,7 +45,8 @@ Accepted options: mode_option - specify the video mode. Of the form x[-][@] vram - size of video ram (normally auto-detected) - +vt_switch - enable vt switching during suspend/resume. The vt + switch is slow, but harmless. -- Andres Salomon diff --git a/drivers/video/geode/gxfb_core.c b/drivers/video/geode/gxfb_core.c index 151d964c025..f16c21d5bc2 100644 --- a/drivers/video/geode/gxfb_core.c +++ b/drivers/video/geode/gxfb_core.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ static char *mode_option; static int vram; +static int vt_switch; /* Modes relevant to the GX (taken from modedb.c) */ static const struct fb_videomode gx_modedb[] __initdata = { @@ -382,6 +384,8 @@ static int __init gxfb_probe(struct pci_dev *pdev, const struct pci_device_id *i gxfb_check_var(&info->var, info); gxfb_set_par(info); + pm_set_vt_switch(vt_switch); + if (register_framebuffer(info) < 0) { ret = -EINVAL; goto err; @@ -502,5 +506,8 @@ MODULE_PARM_DESC(mode_option, "video mode (x[-][@])"); module_param(vram, int, 0); MODULE_PARM_DESC(vram, "video memory size"); +module_param(vt_switch, int, 0); +MODULE_PARM_DESC(vt_switch, "enable VT switch during suspend/resume"); + MODULE_DESCRIPTION("Framebuffer driver for the AMD Geode GX"); MODULE_LICENSE("GPL"); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 1d7d4c5797e..a6977423baf 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -12,11 +12,22 @@ #include #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) +extern void pm_set_vt_switch(int); extern int pm_prepare_console(void); extern void pm_restore_console(void); #else -static inline int pm_prepare_console(void) { return 0; } -static inline void pm_restore_console(void) {} +static inline void pm_set_vt_switch(int do_switch) +{ +} + +static inline int pm_prepare_console(void) +{ + return 0; +} + +static inline void pm_restore_console(void) +{ +} #endif typedef int __bitwise suspend_state_t; diff --git a/kernel/power/console.c b/kernel/power/console.c index 89bcf4973ee..b8628be2a46 
100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -7,17 +7,39 @@ #include #include #include +#include #include "power.h" #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static int disable_vt_switch; + +/* + * Normally during a suspend, we allocate a new console and switch to it. + * When we resume, we switch back to the original console. This switch + * can be slow, so on systems where the framebuffer can handle restoration + * of video registers anyways, there's little point in doing the console + * switch. This function allows you to disable it by passing it '0'. + */ +void pm_set_vt_switch(int do_switch) +{ + acquire_console_sem(); + disable_vt_switch = !do_switch; + release_console_sem(); +} +EXPORT_SYMBOL(pm_set_vt_switch); int pm_prepare_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return 0; + } + orig_fgconsole = fg_console; if (vc_allocate(SUSPEND_CONSOLE)) { @@ -50,9 +72,12 @@ int pm_prepare_console(void) void pm_restore_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return; + } set_console(orig_fgconsole); release_console_sem(); kmsg_redirect = orig_kmsg; - return; } #endif -- cgit v1.2.3-70-g09d2 From 0c96c5979a522c3323c30a078a70120e29b5bdbc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Apr 2008 09:23:24 +0200 Subject: hrtimer: raise softirq unlocked to avoid circular lock dependency The scheduler hrtimer bits in 2.6.25 introduced a circular lock dependency in a rare code path: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.25-sched-devel.git-x86-latest.git #19 ------------------------------------------------------- X/2980 is trying to acquire lock: (&rq->rq_lock_key#2){++..}, at: [] task_rq_lock+0x56/0xa0 but task is already holding lock: (&cpu_base->lock){++..}, at: [] lock_hrtimer_base+0x31/0x60 which lock already depends on the new lock. The scenario which leads to this is: posix-timer signal is delivered -> posix-timer is rearmed timer is already expired in hrtimer_enqueue() -> softirq is raised To prevent this we need to move the raise of the softirq out of the base->lock protected code path. 
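The core of the fix below is a common locking pattern: decide under the lock whether deferred work is needed, then perform that work only after the lock has been dropped, so the deferred action can safely take other locks. As a stand-alone illustration (a userspace analogue using pthreads; every name here is invented for the sketch, none of these are the kernel's actual symbols):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static bool pending;

/* Stand-in for the deferred action; in the hrtimer case this is
 * raise_softirq(HRTIMER_SOFTIRQ). It may take other locks, so it must
 * not be called while demo_lock is held. */
static void raise_work(void)
{
}

static void rearm(void)
{
	bool raise;

	pthread_mutex_lock(&demo_lock);
	/* ... re-arm the object; it may become "pending" in here ... */
	raise = pending;		/* record the decision under the lock */
	pthread_mutex_unlock(&demo_lock);

	if (raise)			/* act only after the lock is released */
		raise_work();
}
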
Signed-off-by: Thomas Gleixner Cc: stable@kernel.org Acked-by: Peter Zijlstra --- kernel/hrtimer.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e379ef0e9c2..dea4c9124ac 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -590,7 +590,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); timer->state = HRTIMER_STATE_PENDING; - raise_softirq(HRTIMER_SOFTIRQ); return 1; default: BUG(); @@ -633,6 +632,11 @@ static int hrtimer_switch_to_hres(void) return 1; } +static inline void hrtimer_raise_softirq(void) +{ + raise_softirq(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -651,6 +655,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, { return 0; } +static inline void hrtimer_raise_softirq(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -850,7 +855,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, raise; base = lock_hrtimer_base(timer, &flags); @@ -884,8 +889,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) enqueue_hrtimer(timer, new_base, new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + /* + * The timer may be expired and moved to the cb_pending + * list. We can not raise the softirq with base lock held due + * to a possible deadlock with runqueue lock. + */ + raise = timer->state == HRTIMER_STATE_PENDING; + unlock_hrtimer_base(timer, &flags); + if (raise) + hrtimer_raise_softirq(); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_start); -- cgit v1.2.3-70-g09d2 From 9d04d9280c4bbf6950b70b705bc4ace41de65615 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 28 Apr 2008 13:57:19 -0700 Subject: ptrace: conditionalize compat_ptrace_request My recent additions to compat_ptrace_request made it mandatory for CONFIG_COMPAT arch's to define copy_siginfo_from_user32. This broke some builds, though they all really should get cleaned up in that way. Since all the arch's that actually call compat_ptrace_request have now been cleaned up to use the generic compat_sys_ptrace, we can avoid the build problems on the crufty arch's by changing the conditionals on the definition. Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 67e392ed549..dac4b4e5729 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) return (copied == sizeof(data)) ? 
0 : -EIO; } -#ifdef CONFIG_COMPAT +#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE #include int compat_ptrace_request(struct task_struct *child, compat_long_t request, @@ -667,7 +667,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data) { @@ -710,6 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, unlock_kernel(); return ret; } -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ - -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ -- cgit v1.2.3-70-g09d2 From b331d259b1147f82d692f3b866e036017cbde8fe Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 28 Apr 2008 14:13:19 -0700 Subject: kernel: fix integer as NULL pointer warnings kernel/cpuset.c:1268:52: warning: Using plain integer as NULL pointer kernel/pid_namespace.c:95:24: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Reviewed-by: Paul Jackson Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 3 ++- kernel/pid_namespace.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 024888bb981..48a976c52cf 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1265,7 +1265,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, return -E2BIG; /* +1 for nul-terminator */ - if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buffer) return -ENOMEM; if (copy_from_user(buffer, userbuf, nbytes)) { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d792b66d85..5ca37fa50be 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level) atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = 0; + ns->pidmap[i].page = NULL; atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); } -- cgit v1.2.3-70-g09d2
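
The sparse warning addressed in the last commit flags pointer assignments and tests written with a literal 0. A small stand-alone sketch of the before/after pattern (the struct and helper names are invented for illustration, not taken from the kernel sources):

#include <stddef.h>
#include <stdlib.h>

struct demo {
	char *page;
};

static void demo_reset(struct demo *d)
{
	/* sparse: "Using plain integer as NULL pointer"
	 *	d->page = 0;
	 * preferred: make the pointer assignment explicit */
	d->page = NULL;
}

static char *demo_alloc(size_t nbytes)
{
	char *buffer = malloc(nbytes + 1);	/* +1 for nul-terminator */

	/* testing the pointer directly reads better than "== 0" */
	if (!buffer)
		return NULL;
	return buffer;
}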