From 06f9d4f94a075285d25253edbf57f2cda07d4ff3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 22 Mar 2006 00:07:40 -0800
Subject: [PATCH] unshare: Error if passed unsupported flags

A bare-bones, trivial patch to ensure we always get -EINVAL on the
unsupported cases for sys_unshare.  If this goes in before 2.6.16, it allows
us to be forward compatible with future applications using sys_unshare.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: JANAK DESAI <janak@us.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index b373322ca49..9bd7b65ee41 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1534,6 +1534,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 
 	check_unshare_flags(&unshare_flags);
 
+	/* Return -EINVAL for all unsupported flags */
+	err = -EINVAL;
+	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+		goto bad_unshare_out;
+
 	if ((err = unshare_thread(unshare_flags)))
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
-- 
cgit v1.2.3-70-g09d2


From 0c9e63fd38a2fb2181668a0cdd622a3c23cfd567 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 23 Mar 2006 03:00:12 -0800
Subject: [PATCH] Shrinks sizeof(files_struct) and better layout

1) Reduce the size of (struct fdtable) to exactly 64 bytes on 32-bit
   platforms, lowering kmalloc() allocated space by 50%.

2) Reduce the size of (files_struct), using a special 32-bit (or
   64-bit) embedded_fd_set, instead of a 1024-bit fd_set for the
   close_on_exec_init and open_fds_init fields.  This saves some RAM (248
   bytes per task) as most tasks don't open more than 32 files.  The D-cache
   footprint for such tasks is also reduced to the minimum.

3) Reduce the size of the allocated fdsets.  Currently two full pages are
   allocated, each holding 32768 bits on x86 for example, which is way too
   much.  The minimum is now L1_CACHE_BYTES bytes (8 * L1_CACHE_BYTES bits).

UP and SMP should both benefit from this patch, because most tasks will
touch only one cache line when they open()/close() stdin/stdout/stderr
(0/1/2): next_fd, close_on_exec_init, open_fds_init and fd_array[0 .. 2]
are all in the same cache line.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fcntl.c                |  9 ++++-----
 fs/file.c                 | 34 ++++++++++++++--------------------
 fs/open.c                 |  8 ++++----
 include/linux/file.h      | 28 ++++++++++++++++++++++++----
 include/linux/init_task.h | 10 +++++-----
 kernel/fork.c             |  8 ++++----
 6 files changed, 55 insertions(+), 42 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index dc4a7007f4e..03c789560fb 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -73,8 +73,8 @@ repeat:
 	 * orig_start..fdt->next_fd
 	 */
 	start = orig_start;
-	if (start < fdt->next_fd)
-		start = fdt->next_fd;
+	if (start < files->next_fd)
+		start = files->next_fd;
 
 	newfd = start;
 	if (start < fdt->max_fdset) {
@@ -102,9 +102,8 @@ repeat:
 	 * we reacquire the fdtable pointer and use it while holding
 	 * the lock, no one can free it during that time.
 	 */
-	fdt = files_fdtable(files);
-	if (start <= fdt->next_fd)
-		fdt->next_fd = newfd + 1;
+	if (start <= files->next_fd)
+		files->next_fd = newfd + 1;
 
 	error = newfd;
 	
diff --git a/fs/file.c b/fs/file.c
index cea7cbea11d..bbc74331473 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -125,7 +125,8 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 		kmem_cache_free(files_cachep, fdt->free_files);
 		return;
 	}
-	if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
+	if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE &&
+		fdt->max_fds <= NR_OPEN_DEFAULT) {
 		/*
 		 * The fdtable was embedded
 		 */
@@ -155,8 +156,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 
 void free_fdtable(struct fdtable *fdt)
 {
-	if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
-					fdt->max_fds > NR_OPEN_DEFAULT)
+	if (fdt->free_files ||
+		fdt->max_fdset > EMBEDDED_FD_SET_SIZE ||
+		fdt->max_fds > NR_OPEN_DEFAULT)
 		call_rcu(&fdt->rcu, free_fdtable_rcu);
 }
 
@@ -199,7 +201,6 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
 		       (nfdt->max_fds - fdt->max_fds) *
 					sizeof(struct file *));
 	}
-	nfdt->next_fd = fdt->next_fd;
 }
 
 /*
@@ -220,11 +221,9 @@ fd_set * alloc_fdset(int num)
 
 void free_fdset(fd_set *array, int num)
 {
-	int size = num / 8;
-
-	if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */
+	if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */
 		return;
-	else if (size <= PAGE_SIZE)
+	else if (num <= 8 * PAGE_SIZE)
 		kfree(array);
 	else
 		vfree(array);
@@ -237,22 +236,17 @@ static struct fdtable *alloc_fdtable(int nr)
   	fd_set *new_openset = NULL, *new_execset = NULL;
 	struct file **new_fds;
 
-	fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
+	fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
 	if (!fdt)
   		goto out;
-	memset(fdt, 0, sizeof(*fdt));
 
-	nfds = __FD_SETSIZE;
+	nfds = 8 * L1_CACHE_BYTES;
   	/* Expand to the max in easy steps */
-  	do {
-		if (nfds < (PAGE_SIZE * 8))
-			nfds = PAGE_SIZE * 8;
-		else {
-			nfds = nfds * 2;
-			if (nfds > NR_OPEN)
-				nfds = NR_OPEN;
-		}
-	} while (nfds <= nr);
+  	while (nfds <= nr) {
+		nfds = nfds * 2;
+		if (nfds > NR_OPEN)
+			nfds = NR_OPEN;
+	}
 
   	new_openset = alloc_fdset(nfds);
   	new_execset = alloc_fdset(nfds);
diff --git a/fs/open.c b/fs/open.c
index 70e0230d8e7..1091dadd6c3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -973,7 +973,7 @@ repeat:
 	fdt = files_fdtable(files);
  	fd = find_next_zero_bit(fdt->open_fds->fds_bits,
 				fdt->max_fdset,
-				fdt->next_fd);
+				files->next_fd);
 
 	/*
 	 * N.B. For clone tasks sharing a files structure, this test
@@ -998,7 +998,7 @@ repeat:
 
 	FD_SET(fd, fdt->open_fds);
 	FD_CLR(fd, fdt->close_on_exec);
-	fdt->next_fd = fd + 1;
+	files->next_fd = fd + 1;
 #if 1
 	/* Sanity check */
 	if (fdt->fd[fd] != NULL) {
@@ -1019,8 +1019,8 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = files_fdtable(files);
 	__FD_CLR(fd, fdt->open_fds);
-	if (fd < fdt->next_fd)
-		fdt->next_fd = fd;
+	if (fd < files->next_fd)
+		files->next_fd = fd;
 }
 
 void fastcall put_unused_fd(unsigned int fd)
diff --git a/include/linux/file.h b/include/linux/file.h
index 9901b850f2e..9f7c2513866 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -10,6 +10,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/types.h>
 
 /*
  * The default fd array needs to be at least BITS_PER_LONG,
@@ -17,10 +18,22 @@
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
 
+/*
+ * The embedded_fd_set is a small fd_set,
+ * suitable for most tasks (which open <= BITS_PER_LONG files)
+ */
+struct embedded_fd_set {
+	unsigned long fds_bits[1];
+};
+
+/*
+ * More than this number of fds: we use a separately allocated fd_set
+ */
+#define EMBEDDED_FD_SET_SIZE (BITS_PER_BYTE * sizeof(struct embedded_fd_set))
+
 struct fdtable {
 	unsigned int max_fds;
 	int max_fdset;
-	int next_fd;
 	struct file ** fd;      /* current fd array */
 	fd_set *close_on_exec;
 	fd_set *open_fds;
@@ -33,13 +46,20 @@ struct fdtable {
  * Open file table structure
  */
 struct files_struct {
+  /*
+   * read mostly part
+   */
 	atomic_t count;
 	struct fdtable *fdt;
 	struct fdtable fdtab;
-	fd_set close_on_exec_init;
-	fd_set open_fds_init;
+  /*
+   * written part on a separate cache line in SMP
+   */
+	spinlock_t file_lock ____cacheline_aligned_in_smp;
+	int next_fd;
+	struct embedded_fd_set close_on_exec_init;
+	struct embedded_fd_set open_fds_init;
 	struct file * fd_array[NR_OPEN_DEFAULT];
-	spinlock_t file_lock;     /* Protects concurrent writers.  Nests inside tsk->alloc_lock */
 };
 
 #define files_fdtable(files) (rcu_dereference((files)->fdt))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index dcfd2ecccb5..92146f3b742 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -7,11 +7,10 @@
 #define INIT_FDTABLE \
 {							\
 	.max_fds	= NR_OPEN_DEFAULT, 		\
-	.max_fdset	= __FD_SETSIZE, 		\
-	.next_fd	= 0, 				\
+	.max_fdset	= EMBEDDED_FD_SET_SIZE,		\
 	.fd		= &init_files.fd_array[0], 	\
-	.close_on_exec	= &init_files.close_on_exec_init, \
-	.open_fds	= &init_files.open_fds_init, 	\
+	.close_on_exec	= (fd_set *)&init_files.close_on_exec_init, \
+	.open_fds	= (fd_set *)&init_files.open_fds_init, 	\
 	.rcu		= RCU_HEAD_INIT, 		\
 	.free_files	= NULL,		 		\
 	.next		= NULL,		 		\
@@ -20,9 +19,10 @@
 #define INIT_FILES \
 { 							\
 	.count		= ATOMIC_INIT(1), 		\
-	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
 	.fdt		= &init_files.fdtab, 		\
 	.fdtab		= INIT_FDTABLE,			\
+	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
+	.next_fd	= 0, 				\
 	.close_on_exec_init = { { 0, } }, 		\
 	.open_fds_init	= { { 0, } }, 			\
 	.fd_array	= { NULL, } 			\
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd7b65ee41..c79ae0b19a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->next_fd = 0;
 	fdt = &newf->fdtab;
-	fdt->next_fd = 0;
 	fdt->max_fds = NR_OPEN_DEFAULT;
-	fdt->max_fdset = __FD_SETSIZE;
-	fdt->close_on_exec = &newf->close_on_exec_init;
-	fdt->open_fds = &newf->open_fds_init;
+	fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
+	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+	fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
 	INIT_RCU_HEAD(&fdt->rcu);
 	fdt->free_files = NULL;
-- 
cgit v1.2.3-70-g09d2


From 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 20:00:26 +0100
Subject: [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/Kconfig                |  12 +
 block/Makefile               |   2 +
 block/blktrace.c             | 538 +++++++++++++++++++++++++++++++++++++++++++
 block/elevator.c             |   4 +
 block/ioctl.c                |   6 +
 block/ll_rw_blk.c            |  44 +++-
 drivers/block/cciss.c        |   2 +
 drivers/md/dm.c              |  13 +-
 fs/bio.c                     |   4 +
 fs/compat_ioctl.c            |   1 +
 include/linux/blkdev.h       |   3 +
 include/linux/blktrace_api.h | 277 ++++++++++++++++++++++
 include/linux/compat_ioctl.h |   4 +
 include/linux/fs.h           |   4 +
 include/linux/sched.h        |   1 +
 kernel/fork.c                |   1 +
 mm/highmem.c                 |   3 +
 17 files changed, 916 insertions(+), 3 deletions(-)
 create mode 100644 block/blktrace.c
 create mode 100644 include/linux/blktrace_api.h

(limited to 'kernel/fork.c')

diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd20e1..96783645092 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,16 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB.  Otherwise say N.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	select RELAY
+	select DEBUG_FS
+	help
+	  Say Y here, if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the user space
+	  support tools needed), fetch the blktrace app from:
+
+	  git://brick.kernel.dk/data/git/blktrace.git
+
 source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e2b44..c05de0e0037 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 00000000000..36f3a172275
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <asm/uaccess.h>
+
+static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
+static unsigned int blktrace_seq __read_mostly = 1;
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	struct blk_io_trace *t;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
+	if (t) {
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->device = bt->dev;
+		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
+		t->pid = tsk->pid;
+		t->cpu = smp_processor_id();
+		t->pdu_len = sizeof(tsk->comm);
+		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
+		tsk->btrace_seq = blktrace_seq;
+	}
+}
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
+
+/*
+ * Bio action bits of interest
+ */
+static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
+
+/*
+ * Each yields 0 or its bio_act[] index (barrier: 1, sync: 2). More could be
+ * added as needed, bumping the subtracted constant to keep indexing correct
+ */
+#define trace_barrier_bit(rw)	\
+	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
+#define trace_sync_bit(rw)	\
+	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct blk_io_trace *t;
+	unsigned long flags;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu;
+
+	if (unlikely(bt->trace_state != Blktrace_running))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= bio_act[trace_barrier_bit(rw)];
+	what |= bio_act[trace_sync_bit(rw)];
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes. Once reserved, it's
+	 * enough to get preemption disabled to prevent read of this data
+	 * before we are through filling it. get_cpu()/put_cpu() does this
+	 * for us
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		cpu = smp_processor_id();
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->pid = pid;
+		t->device = bt->dev;
+		t->cpu = cpu;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(__blk_add_trace);
+
+static struct dentry *blk_tree_root;
+static struct mutex blk_tree_mutex;
+static unsigned int root_users;
+
+static inline void blk_remove_root(void)
+{
+	if (blk_tree_root) {
+		debugfs_remove(blk_tree_root);
+		blk_tree_root = NULL;
+	}
+}
+
+static void blk_remove_tree(struct dentry *dir)
+{
+	mutex_lock(&blk_tree_mutex);
+	debugfs_remove(dir);
+	if (--root_users == 0)
+		blk_remove_root();
+	mutex_unlock(&blk_tree_mutex);
+}
+
+static struct dentry *blk_create_tree(const char *blk_name)
+{
+	struct dentry *dir = NULL;
+
+	mutex_lock(&blk_tree_mutex);
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			goto err;
+	}
+
+	dir = debugfs_create_dir(blk_name, blk_tree_root);
+	if (dir)
+		root_users++;
+	else
+		blk_remove_root();
+
+err:
+	mutex_unlock(&blk_tree_mutex);
+	return dir;
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	relay_close(bt->rchan);
+	debugfs_remove(bt->dropped_file);
+	blk_remove_tree(bt->dir);
+	free_percpu(bt->sequence);
+	kfree(bt);
+}
+
+static int blk_trace_remove(request_queue_t *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->u.generic_ip;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static struct file_operations blk_dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_dropped_open,
+	.read =		blk_dropped_read,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start		= blk_subbuf_start_callback,
+	.create_buf_file	= blk_create_buf_file_callback,
+	.remove_buf_file	= blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
+			   char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	char b[BDEVNAME_SIZE];
+	int ret, i;
+
+	if (copy_from_user(&buts, arg, sizeof(buts)))
+		return -EFAULT;
+
+	if (!buts.buf_size || !buts.buf_nr)
+		return -EINVAL;
+
+	strcpy(buts.name, bdevname(bdev, b));
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts.name); i++)
+		if (buts.name[i] == '/')
+			buts.name[i] = '_';
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	ret = -ENOENT;
+	dir = blk_create_tree(buts.name);
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = bdev->bd_dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
+	if (!bt->rchan)
+		goto err;
+	bt->rchan->private_data = bt;
+
+	bt->act_mask = buts.act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts.start_lba;
+	bt->end_lba = buts.end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts.pid;
+	bt->trace_state = Blktrace_setup;
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+err:
+	if (dir)
+		blk_remove_tree(dir);
+	if (bt) {
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		if (bt->sequence)
+			free_percpu(bt->sequence);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+static int blk_trace_startstop(request_queue_t *q, int start)
+{
+	struct blk_trace *bt;
+	int ret;
+
+	if ((bt = q->blk_trace) == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev:	the block device
+ * @cmd: 	the ioctl cmd
+ * @arg:	the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	request_queue_t *q;
+	int ret, start = 0;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		ret = blk_trace_setup(q, bdev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q:    the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(request_queue_t *q)
+{
+	blk_trace_startstop(q, 0);
+	blk_trace_remove(q);
+}
+
+/*
+ * Average offset over two calls to sched_clock() with a gettimeofday()
+ * in the middle
+ */
+static void blk_check_time(unsigned long long *t)
+{
+	unsigned long long a, b;
+	struct timeval tv;
+
+	a = sched_clock();
+	do_gettimeofday(&tv);
+	b = sched_clock();
+
+	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+	*t -= (a + b) / 2;
+}
+
+static void blk_trace_check_cpu_time(void *data)
+{
+	unsigned long long *t;
+	int cpu = get_cpu();
+
+	t = &per_cpu(blk_trace_cpu_offset, cpu);
+
+	/*
+	 * Just call it twice, hopefully the second call will be cache hot
+	 * and a little more precise
+	 */
+	blk_check_time(t);
+	blk_check_time(t);
+
+	put_cpu();
+}
+
+/*
+ * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
+ * timings
+ */
+static void blk_trace_calibrate_offsets(void)
+{
+	unsigned long flags;
+
+	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
+	local_irq_save(flags);
+	blk_trace_check_cpu_time(NULL);
+	local_irq_restore(flags);
+}
+
+static void blk_trace_set_ht_offsets(void)
+{
+#if defined(CONFIG_SCHED_SMT)
+	int cpu, i;
+
+	/*
+	 * now make sure HT siblings have the same time offset
+	 */
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		unsigned long long *cpu_off, *sibling_off;
+
+		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
+			if (i == cpu)
+				continue;
+
+			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
+			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
+			*sibling_off = *cpu_off;
+		}
+	}
+	preempt_enable();
+#endif
+}
+
+static __init int blk_trace_init(void)
+{
+	mutex_init(&blk_tree_mutex);
+	blk_trace_calibrate_offsets();
+	blk_trace_set_ht_offsets();
+
+	return 0;
+}
+
+module_init(blk_trace_init);
+
diff --git a/block/elevator.c b/block/elevator.c
index db3d0d8296a..5e558c4689a 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/blktrace_api.h>
 
 #include <asm/uaccess.h>
 
@@ -333,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
 	struct list_head *pos;
 	unsigned ordseq;
 
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+
 	rq->q = q;
 
 	switch (where) {
@@ -499,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->flags |= REQ_STARTED;
+			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
diff --git a/block/ioctl.c b/block/ioctl.c
index 35fdb7dc651..9cfa2e1ecb2 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	case BLKTRACESTART:
+	case BLKTRACESTOP:
+	case BLKTRACESETUP:
+	case BLKTRACETEARDOWN:
+		return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
 	}
 	return -ENOIOCTLCMD;
 }
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 6c793b196aa..062067fa7ea 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/blktrace_api.h>
 
 /*
  * for max sense size
@@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
 	/*
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
-	if (q->unplug_fn)
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+					q->rq.count[READ] + q->rq.count[WRITE]);
+
 		q->unplug_fn(q);
+	}
 }
 
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	q->unplug_fn(q);
 }
 
@@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
 {
 	request_queue_t *q = (request_queue_t *)data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	kblockd_schedule_work(&q->unplug_work);
 }
 
@@ -1753,6 +1766,9 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	if (q->blk_trace)
+		blk_trace_shutdown(q);
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2129,6 +2145,8 @@ rq_starved:
 	
 	rq_init(q, rq);
 	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
 	return rq;
 }
@@ -2157,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
 		if (!rq) {
 			struct io_context *ioc;
 
+			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
 			io_schedule();
@@ -2210,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
@@ -2844,6 +2866,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->back_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2859,6 +2883,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->front_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
 			bio->bi_next = req->bio;
 			req->bio = bio;
 
@@ -2976,6 +3002,7 @@ void generic_make_request(struct bio *bio)
 	request_queue_t *q;
 	sector_t maxsector;
 	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
 
 	might_sleep();
 	/* Test device or partition size, when known. */
@@ -3002,6 +3029,8 @@ void generic_make_request(struct bio *bio)
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
+	maxsector = -1;
+	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
 
@@ -3034,6 +3063,15 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
@@ -3153,6 +3191,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
 	 */
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e29b8926f80..1f2890989b5 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -38,6 +38,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2331,6 +2332,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 
 	cmd->rq->completion_data = cmd;
 	cmd->rq->errors = status;
+	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
 	blk_complete_request(cmd->rq);
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 26b08ee425c..8c82373f7ff 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -17,6 +17,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
 
@@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io, int error)
 			/* nudge anyone waiting on suspend queue */
 			wake_up(&io->md->wait);
 
+		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+
 		bio_endio(io->bio, io->bio->bi_size, io->error);
 		free_io(io->md, io);
 	}
@@ -392,6 +395,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		      struct target_io *tio)
 {
 	int r;
+	sector_t sector;
 
 	/*
 	 * Sanity checks.
@@ -407,10 +411,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	 * this io.
 	 */
 	atomic_inc(&tio->io->io_count);
+	sector = clone->bi_sector;
 	r = ti->type->map(ti, clone, &tio->info);
-	if (r > 0)
+	if (r > 0) {
 		/* the bio has been remapped so dispatch it */
+
+		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 
+				    tio->io->bio->bi_bdev->bd_dev, sector, 
+				    clone->bi_sector);
+
 		generic_make_request(clone);
+	}
 
 	else if (r < 0) {
 		/* error the io and bail out */
diff --git a/fs/bio.c b/fs/bio.c
index 8f1d2e815c9..0a8c59cb68f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
@@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	if (!bp)
 		return bp;
 
+	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+				bi->bi_sector + first_sectors);
+
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c666769a875..7c031f00fd7 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -72,6 +72,7 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
+#include <linux/blktrace_api.h>
 
 #include <net/sock.h>          /* siocdevprivate_ioctl */
 #include <net/bluetooth/bluetooth.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56bb6a4e15f..c179966f1a2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,6 +22,7 @@ typedef struct request_queue request_queue_t;
 struct elevator_queue;
 typedef struct elevator_queue elevator_t;
 struct request_pm_state;
+struct blk_trace;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -416,6 +417,8 @@ struct request_queue
 	unsigned int		sg_reserved_size;
 	int			node;
 
+	struct blk_trace	*blk_trace;
+
 	/*
 	 * reserved for flush operations
 	 */
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
new file mode 100644
index 00000000000..b34d3e73d5e
--- /dev/null
+++ b/include/linux/blktrace_api.h
@@ -0,0 +1,277 @@
+#ifndef BLKTRACE_H
+#define BLKTRACE_H
+
+#include <linux/config.h>
+#include <linux/blkdev.h>
+#include <linux/relay.h>
+
+/*
+ * Trace categories
+ */
+enum blktrace_cat {
+	BLK_TC_READ	= 1 << 0,	/* reads */
+	BLK_TC_WRITE	= 1 << 1,	/* writes */
+	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
+	BLK_TC_SYNC	= 1 << 3,	/* sync */
+	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
+	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
+	BLK_TC_ISSUE	= 1 << 6,	/* issue */
+	BLK_TC_COMPLETE	= 1 << 7,	/* completions */
+	BLK_TC_FS	= 1 << 8,	/* fs requests */
+	BLK_TC_PC	= 1 << 9,	/* pc requests */
+	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
+
+	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
+};
+
+#define BLK_TC_SHIFT		(16)
+#define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
+
+/*
+ * Basic trace actions
+ */
+enum blktrace_act {
+	__BLK_TA_QUEUE = 1,		/* queued */
+	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
+	__BLK_TA_FRONTMERGE,		/* front merge to existing rq */
+	__BLK_TA_GETRQ,			/* allocated new request */
+	__BLK_TA_SLEEPRQ,		/* sleeping on rq allocation */
+	__BLK_TA_REQUEUE,		/* request requeued */
+	__BLK_TA_ISSUE,			/* sent to driver */
+	__BLK_TA_COMPLETE,		/* completed by driver */
+	__BLK_TA_PLUG,			/* queue was plugged */
+	__BLK_TA_UNPLUG_IO,		/* queue was unplugged by io */
+	__BLK_TA_UNPLUG_TIMER,		/* queue was unplugged by timer */
+	__BLK_TA_INSERT,		/* insert request */
+	__BLK_TA_SPLIT,			/* bio was split */
+	__BLK_TA_BOUNCE,		/* bio was bounced */
+	__BLK_TA_REMAP,			/* bio was remapped */
+};
+
+/*
+ * Trace actions in full. Additionally, read or write is masked
+ */
+#define BLK_TA_QUEUE		(__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_BACKMERGE	(__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_FRONTMERGE	(__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_GETRQ		(__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_SLEEPRQ		(__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_REQUEUE		(__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
+#define BLK_TA_ISSUE		(__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
+#define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
+#define BLK_TA_PLUG		(__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_IO	(__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_TIMER	(__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_INSERT		(__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
+#define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
+#define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
+
+#define BLK_IO_TRACE_MAGIC	0x65617400
+#define BLK_IO_TRACE_VERSION	0x07
+
+/*
+ * The trace itself
+ */
+struct blk_io_trace {
+	u32 magic;		/* MAGIC << 8 | version */
+	u32 sequence;		/* event number */
+	u64 time;		/* in microseconds */
+	u64 sector;		/* disk offset */
+	u32 bytes;		/* transfer length */
+	u32 action;		/* what happened */
+	u32 pid;		/* who did it */
+	u32 device;		/* device number */
+	u32 cpu;		/* on what cpu did it happen */
+	u16 error;		/* completion error */
+	u16 pdu_len;		/* length of data after this trace */
+};
+
+/*
+ * The remap event
+ */
+struct blk_io_trace_remap {
+	u32 device;
+	u32 __pad;
+	u64 sector;
+};
+
+enum {
+	Blktrace_setup = 1,
+	Blktrace_running,
+	Blktrace_stopped,
+};
+
+struct blk_trace {
+	int trace_state;
+	struct rchan *rchan;
+	unsigned long *sequence;
+	u16 act_mask;
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+	u32 dev;
+	struct dentry *dir;
+	struct dentry *dropped_file;
+	atomic_t dropped;
+};
+
+/*
+ * User setup structure passed with BLKTRACESETUP
+ */
+struct blk_user_trace_setup {
+	char name[BDEVNAME_SIZE];	/* output */
+	u16 act_mask;			/* input */
+	u32 buf_size;			/* input */
+	u32 buf_nr;			/* input */
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+};
+
+#if defined(CONFIG_BLK_DEV_IO_TRACE)
+extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
+extern void blk_trace_shutdown(request_queue_t *);
+extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the request offset + size.
+ *
+ **/
+static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->flags & 0x07;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * @q:		queue the io is for
+ * @bio:	the source bio (may be NULL)
+ * @rw:		the data direction (only used when @bio is NULL)
+ * @what:	the action
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static inline void blk_add_trace_generic(struct request_queue *q,
+					 struct bio *bio, int rw, u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(q, bio, what);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * @q:		queue the io is for
+ * @what:	the action
+ * @bio:	the source bio (may be NULL)
+ * @pdu:	the integer payload (passed on as a big-endian 64-bit value)
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
+					 struct bio *bio, unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+	u64 rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid targets sometimes need to remap a bio to
+ *     another device or sector. Add a BLK_TA_REMAP trace for that action.
+ *
+ **/
+static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r = { .__pad = 0 };	/* all of r is passed on as payload; don't leave __pad as stack garbage */
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#else /* !CONFIG_BLK_DEV_IO_TRACE */
+#define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
+#define blk_trace_shutdown(q)			do { } while (0)
+#define blk_add_trace_rq(q, rq, what)		do { } while (0)
+#define blk_add_trace_bio(q, rq, what)		do { } while (0)
+#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
+#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
+#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#endif
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index ae7dfb790df..efb518f16bb 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART)
 COMPATIBLE_IOCTL(BLKFLSBUF)
 COMPATIBLE_IOCTL(BLKSECTSET)
 COMPATIBLE_IOCTL(BLKSSZGET)
+COMPATIBLE_IOCTL(BLKTRACESTART)
+COMPATIBLE_IOCTL(BLKTRACESTOP)
+COMPATIBLE_IOCTL(BLKTRACESETUP)
+COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
 ULONG_IOCTL(BLKRASET)
 ULONG_IOCTL(BLKFRASET)
 /* RAID */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f9c9dea636d..9b34a1b0345 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -197,6 +197,10 @@ extern int dir_notify_enable;
 #define BLKBSZGET  _IOR(0x12,112,size_t)
 #define BLKBSZSET  _IOW(0x12,113,size_t)
 #define BLKGETSIZE64 _IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
+#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
+#define BLKTRACESTART _IO(0x12,116)
+#define BLKTRACESTOP _IO(0x12,117)
+#define BLKTRACETEARDOWN _IO(0x12,118)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62e6314382f..e60a91d5b36 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -706,6 +706,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned short ioprio;
+	unsigned int btrace_seq;
 
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
diff --git a/kernel/fork.c b/kernel/fork.c
index c79ae0b19a4..c21bae8c93b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+	tsk->btrace_seq = 0;
 	return tsk;
 }
 
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa..d0ea1eec6a9 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 static mempool_t *page_pool, *isa_page_pool;
@@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * slow path
 	 */
-- 
cgit v1.2.3-70-g09d2


From c61afb181c649754ea221f104e268cbacfc993e3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:08 -0800
Subject: [PATCH] cpuset memory spread slab cache optimizations

The hooks in the slab cache allocator code path for support of NUMA
mempolicies and cpuset memory spreading are in an important code path.  Many
systems will use neither feature.

This patch optimizes those hooks down to a single check of some bits in the
current task's task_struct flags.  For non-NUMA systems, this hook and related
code is already ifdef'd out.

The optimization is done by using another task flag, set if the task is using
a non-default NUMA mempolicy.  Taking this flag bit along with the
PF_SPREAD_PAGE and PF_SPREAD_SLAB flag bits added earlier in this 'cpuset
memory spreading' patch set, one can check for the combination of any of these
special case memory placement mechanisms with a single test of the current
task's task_struct flags.

This patch also tightens up the code, to save a few bytes of kernel text
space, and moves some of it out of line.  Due to the nested inlines called
from multiple places, we were ending up with three copies of this code, which
once we get off the main code path (for local node allocation) seems a bit
wasteful of instruction memory.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  5 +++++
 include/linux/sched.h     |  1 +
 kernel/fork.c             |  1 +
 mm/mempolicy.c            | 32 ++++++++++++++++++++++++++++++++
 mm/slab.c                 | 41 ++++++++++++++++++++++++++++-------------
 5 files changed, 67 insertions(+), 13 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index bbd2221923c..6a7621b2b12 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -147,6 +147,7 @@ extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_fix_fork_child_flag(struct task_struct *p);
 #define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
 
 #ifdef CONFIG_CPUSET
@@ -248,6 +249,10 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
+static inline void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+}
+
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0e37cfa09f..2cda439ece4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -932,6 +932,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/fork.c b/kernel/fork.c
index c21bae8c93b..a02063903aa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,6 +1021,7 @@ static task_t *copy_process(unsigned long clone_flags,
  		p->mempolicy = NULL;
  		goto bad_fork_cleanup_cpuset;
  	}
+	mpol_fix_fork_child_flag(p);
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22..4f71cfd29c6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+	if (p->mempolicy)
+		p->flags |= PF_MEMPOLICY;
+	else
+		p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+	mpol_fix_fork_child_flag(current);
+}
+
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
+	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
diff --git a/mm/slab.c b/mm/slab.c
index de516658d3d..f80b52388a1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -899,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -2808,19 +2809,11 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(current->mempolicy && !in_interrupt())) {
-		int nid = slab_node(current->mempolicy);
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
-	}
-	if (unlikely(cpuset_do_slab_mem_spread() &&
-					(cachep->flags & SLAB_MEM_SPREAD) &&
-					!in_interrupt())) {
-		int nid = cpuset_mem_spread_node();
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
+	if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
+							PF_MEMPOLICY))) {
+		objp = alternate_node_alloc(cachep, flags);
+		if (objp != NULL)
+			return objp;
 	}
 #endif
 
@@ -2855,6 +2848,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 }
 
 #ifdef CONFIG_NUMA
+/*
+ * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	int nid_alloc, nid_here;
+
+	if (in_interrupt())
+		return NULL;
+	nid_alloc = nid_here = numa_node_id();
+	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+		nid_alloc = cpuset_mem_spread_node();
+	else if (current->mempolicy)
+		nid_alloc = slab_node(current->mempolicy);
+	if (nid_alloc != nid_here)
+		return __cache_alloc_node(cachep, flags, nid_alloc);
+	return NULL;
+}
+
 /*
  * A interface to enable slab creation on nodeid
  */
-- 
cgit v1.2.3-70-g09d2


From 910dea7fdda22f0ee83d26d459e460c79ed94557 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Sun, 26 Mar 2006 18:29:26 +0200
Subject: BUG_ON() Conversion in kernel/fork.c

This changes if() BUG(); constructs to BUG_ON(), which is
cleaner, contains unlikely() and can be better optimized away.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 kernel/fork.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index a02063903aa..d93ab2ba729 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -769,8 +769,7 @@ int unshare_files(void)
 	struct files_struct *files  = current->files;
 	int rc;
 
-	if(!files)
-		BUG();
+	BUG_ON(!files);
 
 	/* This can race but the race causes us to copy when we don't
 	   need to and drop the copy */
-- 
cgit v1.2.3-70-g09d2


From 05cfb614ddbf3181540ce09d44d96486f8ba8d6a Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:12 -0800
Subject: [PATCH] hrtimers: remove data field

The nanosleep cleanup allows removing the data field of hrtimer.  The
callback function can use container_of() to get its own data.  Since the
hrtimer structure is anyway embedded in other structures, this adds no
overhead.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c               |  2 +-
 include/linux/hrtimer.h |  5 +----
 include/linux/sched.h   |  1 +
 include/linux/timer.h   |  3 ++-
 kernel/fork.c           |  2 +-
 kernel/hrtimer.c        | 12 +++++-------
 kernel/itimer.c         | 15 +++++++--------
 kernel/posix-timers.c   |  9 ++++-----
 8 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/fs/exec.c b/fs/exec.c
index 995cba3c62b..c7397c46ad6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -632,7 +632,7 @@ static int de_thread(struct task_struct *tsk)
 		 * synchronize with any firing (by calling del_timer_sync)
 		 * before we can safely let the old group leader die.
 		 */
-		sig->real_timer.data = current;
+		sig->tsk = current;
 		spin_unlock_irq(lock);
 		if (hrtimer_cancel(&sig->real_timer))
 			hrtimer_restart(&sig->real_timer);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f57cc7bd700..93830158348 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -45,9 +45,7 @@ struct hrtimer_base;
  * @expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
  *		which the timer is based.
- * @state:	state of the timer
  * @function:	timer expiry callback function
- * @data:	argument for the callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  *
  * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE()
@@ -55,8 +53,7 @@ struct hrtimer_base;
 struct hrtimer {
 	struct rb_node		node;
 	ktime_t			expires;
-	int			(*function)(void *);
-	void			*data;
+	int			(*function)(struct hrtimer *);
 	struct hrtimer_base	*base;
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e0054c1b9a0..036d14d2bf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,6 +402,7 @@ struct signal_struct {
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
+	struct task_struct *tsk;
 	ktime_t it_real_incr;
 
 	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ee5a09e806e..b5caabca553 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -96,6 +96,7 @@ static inline void add_timer(struct timer_list *timer)
 
 extern void init_timers(void);
 extern void run_local_timers(void);
-extern int it_real_fn(void *);
+struct hrtimer;
+extern int it_real_fn(struct hrtimer *);
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a02063903aa..4bd6486aa67 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -848,7 +848,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
-	sig->real_timer.data = tsk;
+	sig->tsk = tsk;
 
 	sig->it_virt_expires = cputime_zero;
 	sig->it_virt_incr = cputime_zero;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 44108de4f02..0237a556eb1 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -613,21 +613,19 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		int (*fn)(void *);
+		int (*fn)(struct hrtimer *);
 		int restart;
-		void *data;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
 		fn = timer->function;
-		data = timer->data;
 		set_curr_timer(base, timer);
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
-		restart = fn(data);
+		restart = fn(timer);
 
 		spin_lock_irq(&base->lock);
 
@@ -664,9 +662,10 @@ struct sleep_hrtimer {
 	int expired;
 };
 
-static int nanosleep_wakeup(void *data)
+static int nanosleep_wakeup(struct hrtimer *timer)
 {
-	struct sleep_hrtimer *t = data;
+	struct sleep_hrtimer *t =
+		container_of(timer, struct sleep_hrtimer, timer);
 
 	t->expired = 1;
 	wake_up_process(t->task);
@@ -677,7 +676,6 @@ static int nanosleep_wakeup(void *data)
 static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 {
 	t->timer.function = nanosleep_wakeup;
-	t->timer.data = t;
 	t->task = current;
 	t->expired = 0;
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index af2ec6b4392..204ed7939e7 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,17 +128,16 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
 /*
  * The timer is automagically restarted, when interval != 0
  */
-int it_real_fn(void *data)
+int it_real_fn(struct hrtimer *timer)
 {
-	struct task_struct *tsk = (struct task_struct *) data;
+	struct signal_struct *sig =
+	    container_of(timer, struct signal_struct, real_timer);
 
-	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
-
-	if (tsk->signal->it_real_incr.tv64 != 0) {
-		hrtimer_forward(&tsk->signal->real_timer,
-				tsk->signal->real_timer.base->softirq_time,
-				tsk->signal->it_real_incr);
+	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
 
+	if (sig->it_real_incr.tv64 != 0) {
+		hrtimer_forward(timer, timer->base->softirq_time,
+				sig->it_real_incr);
 		return HRTIMER_RESTART;
 	}
 	return HRTIMER_NORESTART;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7c5f44787c8..ac6dc874442 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
 			    struct itimerspec *, struct itimerspec *);
 static int common_timer_del(struct k_itimer *timer);
 
-static int posix_timer_fn(void *data);
+static int posix_timer_fn(struct hrtimer *data);
 
 static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
 
@@ -334,14 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 
  * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
  */
-static int posix_timer_fn(void *data)
+static int posix_timer_fn(struct hrtimer *timer)
 {
-	struct k_itimer *timr = data;
-	struct hrtimer *timer = &timr->it.real.timer;
+	struct k_itimer *timr;
 	unsigned long flags;
 	int si_private = 0;
 	int ret = HRTIMER_NORESTART;
 
+	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
 
 	if (timr->it.real.interval.tv64 != 0)
@@ -725,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
 
 	mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
 	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
-	timr->it.real.timer.data = timr;
 	timr->it.real.timer.function = posix_timer_fn;
 
 	timer->expires = timespec_to_ktime(new_setting->it_value);
-- 
cgit v1.2.3-70-g09d2


From 8f17d3a5049d32392b79925c73a0cf99ce6d5af0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:27 -0800
Subject: [PATCH] lightweight robust futexes updates

- fix: initialize the robust list(s) to NULL in copy_process.

- doc update

- cleanup: rename _inuser to _inatomic

- __user cleanups and other small cleanups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/robust-futex-ABI.txt |  2 --
 Documentation/robust-futexes.txt   |  2 +-
 include/asm-frv/futex.h            |  2 +-
 include/asm-generic/futex.h        |  2 +-
 include/asm-i386/futex.h           |  2 +-
 include/asm-mips/futex.h           |  2 +-
 include/asm-powerpc/futex.h        |  2 +-
 include/asm-x86_64/futex.h         |  2 +-
 include/linux/futex.h              |  2 +-
 kernel/fork.c                      |  5 ++++-
 kernel/futex.c                     | 20 +++++++++-----------
 kernel/futex_compat.c              |  7 +++----
 12 files changed, 24 insertions(+), 26 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt
index def5d873528..8529a17ffaa 100644
--- a/Documentation/robust-futex-ABI.txt
+++ b/Documentation/robust-futex-ABI.txt
@@ -142,8 +142,6 @@ On insertion:
     of the 'lock word', to the linked list starting at 'head', and
  4) clear the 'list_op_pending' word.
 
-	XXX I am particularly unsure of the following -pj XXX
-
 On removal:
  1) set the 'list_op_pending' word to the address of the 'lock word'
     to be removed,
diff --git a/Documentation/robust-futexes.txt b/Documentation/robust-futexes.txt
index 7aecc67b136..df82d75245a 100644
--- a/Documentation/robust-futexes.txt
+++ b/Documentation/robust-futexes.txt
@@ -213,6 +213,6 @@ robust-mutex testcases.
 All other architectures should build just fine too - but they wont have
 the new syscalls yet.
 
-Architectures need to implement the new futex_atomic_cmpxchg_inuser()
+Architectures need to implement the new futex_atomic_cmpxchg_inatomic()
 inline function before writing up the syscalls (that function returns
 -ENOSYS right now).
diff --git a/include/asm-frv/futex.h b/include/asm-frv/futex.h
index 9a0e9026ba5..08b3d1da358 100644
--- a/include/asm-frv/futex.h
+++ b/include/asm-frv/futex.h
@@ -10,7 +10,7 @@
 extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr);
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index 514bd401cd7..df893c16031 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -50,7 +50,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-i386/futex.h b/include/asm-i386/futex.h
index 41184a31885..7b8ceefd010 100644
--- a/include/asm-i386/futex.h
+++ b/include/asm-i386/futex.h
@@ -105,7 +105,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
diff --git a/include/asm-mips/futex.h b/include/asm-mips/futex.h
index c5fb2d6d918..a554089991f 100644
--- a/include/asm-mips/futex.h
+++ b/include/asm-mips/futex.h
@@ -100,7 +100,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-powerpc/futex.h b/include/asm-powerpc/futex.h
index 80ed9854e42..f1b3c00bc1c 100644
--- a/include/asm-powerpc/futex.h
+++ b/include/asm-powerpc/futex.h
@@ -82,7 +82,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-x86_64/futex.h b/include/asm-x86_64/futex.h
index 7d9eb1a8454..9804bf07b09 100644
--- a/include/asm-x86_64/futex.h
+++ b/include/asm-x86_64/futex.h
@@ -95,7 +95,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 20face6b798..55fff96ae85 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -100,7 +100,7 @@ long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
-extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
+extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
diff --git a/kernel/fork.c b/kernel/fork.c
index e0a2b449dea..c49bd193b05 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,7 +1061,10 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
-
+	p->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+	p->compat_robust_list = NULL;
+#endif
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/futex.c b/kernel/futex.c
index feb724b2554..9c9b2b6b22d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -913,15 +913,15 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	unsigned int futex_val;
+	u32 uval;
 
-repeat:
-	if (get_user(futex_val, uaddr))
+retry:
+	if (get_user(uval, uaddr))
 		return -1;
 
-	if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
+	if ((uval & FUTEX_TID_MASK) == curr->pid) {
 		/*
 		 * Ok, this dying thread is truly holding a futex
 		 * of interest. Set the OWNER_DIED bit atomically
@@ -932,12 +932,11 @@ repeat:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
-					 futex_val | FUTEX_OWNER_DIED) !=
-								   futex_val)
-			goto repeat;
+		if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
+					 uval | FUTEX_OWNER_DIED) != uval)
+			goto retry;
 
-		if (futex_val & FUTEX_WAITERS)
+		if (uval & FUTEX_WAITERS)
 			futex_wake((unsigned long)uaddr, 1);
 	}
 	return 0;
@@ -985,7 +984,6 @@ void exit_robust_list(struct task_struct *curr)
 			if (handle_futex_death((void *)entry + futex_offset,
 						curr))
 				return;
-
 		/*
 		 * Fetch the next entry in the list:
 		 */
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index c153559ef28..9c077cf9aa8 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -121,9 +121,9 @@ err_unlock:
 	return ret;
 }
 
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 		struct compat_timespec __user *utime, u32 __user *uaddr2,
-		int val3)
+		u32 val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -137,6 +137,5 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
 	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
-- 
cgit v1.2.3-70-g09d2


From c97d98931ac52ef110b62d9b75c6a6f2bfbc1898 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:06 -0800
Subject: [PATCH] kill SET_LINKS/REMOVE_LINKS

SET_LINKS() and REMOVE_LINKS() each have exactly one caller, and
these callers already check thread_group_leader().

This patch kills these macros: they mix two different things, setting a
process's parent and registering it in the init_task.tasks list.  Callers are
updated to do these actions by hand.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 12 ------------
 kernel/exit.c         |  4 +++-
 kernel/fork.c         |  4 +++-
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4b14c32b28..1f16fb1fea2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1186,18 +1186,6 @@ extern void wait_task_inactive(task_t * p);
 #define remove_parent(p)	list_del_init(&(p)->sibling)
 #define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
-#define REMOVE_LINKS(p) do {					\
-	if (thread_group_leader(p))				\
-		list_del_init(&(p)->tasks);			\
-	remove_parent(p);					\
-	} while (0)
-
-#define SET_LINKS(p) do {					\
-	if (thread_group_leader(p))				\
-		list_add_tail(&(p)->tasks,&init_task.tasks);	\
-	add_parent(p);						\
-	} while (0)
-
 #define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
 #define prev_task(p)	list_entry((p)->tasks.prev, struct task_struct, tasks)
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b5e8b67680..f436a6bd3fb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,11 +54,13 @@ static void __unhash_process(struct task_struct *p)
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
+
+		list_del_init(&p->tasks);
 		if (p->pid)
 			__get_cpu_var(process_counts)--;
 	}
 
-	REMOVE_LINKS(p);
+	remove_parent(p);
 }
 
 void release_task(struct task_struct * p)
diff --git a/kernel/fork.c b/kernel/fork.c
index c49bd193b05..74c67629ee6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1181,7 +1181,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->ioprio = current->ioprio;
 
-	SET_LINKS(p);
+	add_parent(p);
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
 
@@ -1191,6 +1191,8 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->signal->session = current->signal->session;
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
 		attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+		list_add_tail(&p->tasks, &init_task.tasks);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	}
-- 
cgit v1.2.3-70-g09d2


From 73b9ebfe126a4a886ee46cbab637374d7024668a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:07 -0800
Subject: [PATCH] pidhash: don't count idle threads

fork_idle() does unhash_process() just after copy_process().  In contrast,
boot_cpu's idle thread explicitly registers itself for each pid_type with nr
= 0.

copy_process() already checks p->pid != 0 before process_counts++, I think we
can just skip attach_pid() calls and job control inits for idle threads and
kill unhash_process().  We don't need to cleanup ->proc_dentry in fork_idle()
because with this patch idle threads are never hashed in
kernel/pid.c:pid_hash[].

We don't need to hash pid == 0 in pidmap_init().  free_pidmap() is never
called with pid == 0 arg, so it will never be reused.  So it is still possible
to use pid == 0 in any PIDTYPE_xxx namespace from kernel/pid.c's POV.

However, with this patch we don't hash pid == 0 for the PIDTYPE_PID case.  We
still have PIDTYPE_PGID/PIDTYPE_SID entries with pid == 0: /sbin/init and
kernel threads which don't call daemonize().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/smp.c  |  1 -
 include/linux/sched.h |  2 --
 kernel/exit.c         | 18 +-----------------
 kernel/fork.c         | 35 ++++++++++++++++++-----------------
 kernel/pid.c          | 10 +---------
 5 files changed, 20 insertions(+), 46 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index c8d8d0ac1a7..511116aebaf 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -143,7 +143,6 @@ void smp_prepare_cpus(unsigned int maxcpus)
 		idle = idle_thread(cpu);
 
 		init_idle(idle, cpu);
-		unhash_process(idle);
 
 		waittime = 200000000;
 		while (waittime-- && !cpu_isset(cpu, cpu_callin_map))
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f16fb1fea2..ddc0df7f8bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1214,8 +1214,6 @@ static inline int thread_group_empty(task_t *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-extern void unhash_process(struct task_struct *p);
-
 /*
  * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
diff --git a/kernel/exit.c b/kernel/exit.c
index f436a6bd3fb..a94e1c31131 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,8 +56,7 @@ static void __unhash_process(struct task_struct *p)
 		detach_pid(p, PIDTYPE_SID);
 
 		list_del_init(&p->tasks);
-		if (p->pid)
-			__get_cpu_var(process_counts)--;
+		__get_cpu_var(process_counts)--;
 	}
 
 	remove_parent(p);
@@ -118,21 +117,6 @@ repeat:
 		goto repeat;
 }
 
-/* we are using it only for SMP init */
-
-void unhash_process(struct task_struct *p)
-{
-	struct dentry *proc_dentry;
-
-	spin_lock(&p->proc_lock);
-	proc_dentry = proc_pid_unhash(p);
-	write_lock_irq(&tasklist_lock);
-	__unhash_process(p);
-	write_unlock_irq(&tasklist_lock);
-	spin_unlock(&p->proc_lock);
-	proc_pid_flush(proc_dentry);
-}
-
 /*
  * This checks not only the pgrp, but falls back on the pid if no
  * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
diff --git a/kernel/fork.c b/kernel/fork.c
index 74c67629ee6..0c32e28cdc5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1181,25 +1181,26 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->ioprio = current->ioprio;
 
-	add_parent(p);
-	if (unlikely(p->ptrace & PT_PTRACED))
-		__ptrace_link(p, current->parent);
-
-	if (thread_group_leader(p)) {
-		p->signal->tty = current->signal->tty;
-		p->signal->pgrp = process_group(current);
-		p->signal->session = current->signal->session;
-		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->signal->session);
-
-		list_add_tail(&p->tasks, &init_task.tasks);
-		if (p->pid)
+	if (likely(p->pid)) {
+		add_parent(p);
+		if (unlikely(p->ptrace & PT_PTRACED))
+			__ptrace_link(p, current->parent);
+
+		if (thread_group_leader(p)) {
+			p->signal->tty = current->signal->tty;
+			p->signal->pgrp = process_group(current);
+			p->signal->session = current->signal->session;
+			attach_pid(p, PIDTYPE_PGID, process_group(p));
+			attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+			list_add_tail(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
+		}
+		attach_pid(p, PIDTYPE_TGID, p->tgid);
+		attach_pid(p, PIDTYPE_PID, p->pid);
+		nr_threads++;
 	}
-	attach_pid(p, PIDTYPE_TGID, p->tgid);
-	attach_pid(p, PIDTYPE_PID, p->pid);
 
-	nr_threads++;
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
@@ -1263,7 +1264,7 @@ task_t * __devinit fork_idle(int cpu)
 	if (!task)
 		return ERR_PTR(-ENOMEM);
 	init_idle(task, cpu);
-	unhash_process(task);
+
 	return task;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 7781d999905..a9f2dfd006d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -247,16 +247,8 @@ void __init pidhash_init(void)
 
 void __init pidmap_init(void)
 {
-	int i;
-
 	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+	/* Reserve PID 0. We never call free_pidmap(0) */
 	set_bit(0, pidmap_array->page);
 	atomic_dec(&pidmap_array->nr_free);
-
-	/*
-	 * Allocate PID 0, and hash it via all PID types:
-	 */
-
-	for (i = 0; i < PIDTYPE_MAX; i++)
-		attach_pid(current, i, 0);
 }
-- 
cgit v1.2.3-70-g09d2


From aa1757f90bea3f598b6e5d04d922a6a60200f1da Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:12 -0800
Subject: [PATCH] convert sighand_cache to use SLAB_DESTROY_BY_RCU

This patch borrows Hugh's clever 'struct anon_vma' trick.

Without tasklist_lock held we can't trust task->sighand until we locked it
and re-checked that it is still the same.

But this means we don't need to defer 'kmem_cache_free(sighand)'.  We can
return the memory to slab immediately; all we need is to be sure that
sighand->siglock can't disappear inside an RCU-protected section.

To do so we need to initialize ->siglock inside ctor function,
SLAB_DESTROY_BY_RCU does the rest.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |  3 +--
 include/linux/sched.h |  8 --------
 kernel/fork.c         | 21 +++++++++++----------
 kernel/signal.c       |  2 +-
 4 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/fs/exec.c b/fs/exec.c
index 9046ad2b061..950ebd43cdc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ no_thread_group:
 		/*
 		 * Move our state over to newsighand and switch it in.
 		 */
-		spin_lock_init(&newsighand->siglock);
 		atomic_set(&newsighand->count, 1);
 		memcpy(newsighand->action, oldsighand->action,
 		       sizeof(newsighand->action));
@@ -785,7 +784,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			sighand_free(oldsighand);
+			kmem_cache_free(sighand_cachep, oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ddc0df7f8bf..bbcfc873bd9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -355,16 +355,8 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
-	struct rcu_head		rcu;
 };
 
-extern void sighand_free_cb(struct rcu_head *rhp);
-
-static inline void sighand_free(struct sighand_struct *sp)
-{
-	call_rcu(&sp->rcu, sighand_free_cb);
-}
-
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c32e28cdc5..33ffb5bf0db 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -786,14 +786,6 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
-void sighand_free_cb(struct rcu_head *rhp)
-{
-	struct sighand_struct *sp;
-
-	sp = container_of(rhp, struct sighand_struct, rcu);
-	kmem_cache_free(sighand_cachep, sp);
-}
-
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
@@ -806,7 +798,6 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
-	spin_lock_init(&sig->siglock);
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
@@ -1356,11 +1347,21 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
+static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct sighand_struct *sighand = data;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+					SLAB_CTOR_CONSTRUCTOR)
+		spin_lock_init(&sighand->siglock);
+}
+
 void __init proc_caches_init(void)
 {
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+			sighand_ctor, NULL);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index dc8f91bf9f8..b0b1ca9daa3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -330,7 +330,7 @@ void __exit_sighand(struct task_struct *tsk)
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		sighand_free(sighand);
+		kmem_cache_free(sighand_cachep, sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
-- 
cgit v1.2.3-70-g09d2


From 7001510d0cbf51ad202dd2d0744f54104285cbb9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:14 -0800
Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_sighand

The only caller of exit_sighand(tsk) is copy_process's error path.  We can
call __exit_sighand() directly and kill exit_sighand().

This 'tsk' was not yet registered in pid_hash[] or init_task.tasks, it has no
external references, nobody can see it, and

	IF (clone_flags & CLONE_SIGHAND)
		At least 'current' has a reference to ->sighand, this
		means atomic_dec_and_test(sighand->count) can't be true.

	ELSE
		Nobody can see this ->sighand, this means we can free it
		without any locking.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 -
 kernel/fork.c         |  3 ++-
 kernel/signal.c       | 14 --------------
 3 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ca1fd31aae9..69c2a1e1529 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,6 @@ extern void exit_thread(void);
 extern void exit_files(struct task_struct *);
 extern void exit_signal(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
-extern void exit_sighand(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 33ffb5bf0db..8a46ad52be8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1208,7 +1208,8 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	exit_signal(p);
 bad_fork_cleanup_sighand:
-	exit_sighand(p);
+	if (p->sighand)
+		__exit_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index c5b65aa4c2b..1d7f4463c32 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -336,20 +336,6 @@ void __exit_sighand(struct task_struct *tsk)
 		kmem_cache_free(sighand_cachep, sighand);
 }
 
-void exit_sighand(struct task_struct *tsk)
-{
-	write_lock_irq(&tasklist_lock);
-	rcu_read_lock();
-	if (tsk->sighand != NULL) {
-		struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
-		spin_lock(&sighand->siglock);
-		__exit_sighand(tsk);
-		spin_unlock(&sighand->siglock);
-	}
-	rcu_read_unlock();
-	write_unlock_irq(&tasklist_lock);
-}
-
 /*
  * This function expects the tasklist_lock write-locked.
  */
-- 
cgit v1.2.3-70-g09d2


From 6b3934ef52712ece50605dfc72e55d00c580831a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:16 -0800
Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_signal

__exit_signal() does important cleanups atomically under ->siglock.  It is
also called from copy_process's error path.  This is not good, for example we
can't move __unhash_process() under ->siglock for that reason.

We should not mix these 2 paths, just look at ugly 'if (p->sighand)' under
'bad_fork_cleanup_sighand:' label.  For copy_process() case it is sufficient
to just backout copy_signal(), nothing more.

Again, nobody can see this task yet.  For CLONE_THREAD case we just decrement
signal->count, otherwise nobody can see this ->signal and we can free it
lockless.

This patch assumes it is safe to do exit_thread_group_keys() without
tasklist_lock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 +-
 include/linux/slab.h  |  1 -
 kernel/fork.c         | 23 +++++++++++++++++++----
 kernel/signal.c       | 15 +--------------
 4 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 69c2a1e1529..7dd430b697a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1149,7 +1149,7 @@ extern void flush_thread(void);
 extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
-extern void exit_signal(struct task_struct *);
+extern void __cleanup_signal(struct signal_struct *);
 extern void __exit_signal(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 15e1d9736b1..3af03b19c98 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -210,7 +210,6 @@ extern kmem_cache_t	*names_cachep;
 extern kmem_cache_t	*files_cachep;
 extern kmem_cache_t	*filp_cachep;
 extern kmem_cache_t	*fs_cachep;
-extern kmem_cache_t	*signal_cachep;
 extern kmem_cache_t	*sighand_cachep;
 extern kmem_cache_t	*bio_cachep;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a46ad52be8..0aff28cdbad 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-kmem_cache_t *signal_cachep;
+static kmem_cache_t *signal_cachep;
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
 kmem_cache_t *sighand_cachep;
@@ -872,6 +872,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	return 0;
 }
 
+void __cleanup_signal(struct signal_struct *sig)
+{
+	exit_thread_group_keys(sig);
+	kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void cleanup_signal(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	atomic_dec(&sig->live);
+
+	if (atomic_dec_and_test(&sig->count))
+		__cleanup_signal(sig);
+}
+
 static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
@@ -1206,10 +1222,9 @@ bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
 bad_fork_cleanup_signal:
-	exit_signal(p);
+	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	if (p->sighand)
-		__exit_sighand(p);
+	__exit_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index 1d7f4463c32..54e9ef673e6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -395,23 +395,10 @@ void __exit_signal(struct task_struct *tsk)
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
-		/*
-		 * We are cleaning up the signal_struct here.
-		 */
-		exit_thread_group_keys(sig);
-		kmem_cache_free(signal_cachep, sig);
+		__cleanup_signal(sig);
 	}
 }
 
-void exit_signal(struct task_struct *tsk)
-{
-	atomic_dec(&tsk->signal->live);
-
-	write_lock_irq(&tasklist_lock);
-	__exit_signal(tsk);
-	write_unlock_irq(&tasklist_lock);
-}
-
 /*
  * Flush all handlers for a task.
  */
-- 
cgit v1.2.3-70-g09d2


From c81addc9d3a0ebff2155e0cd86f90820ab97147e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:17 -0800
Subject: [PATCH] rename __exit_sighand to cleanup_sighand

Cosmetic, rename __exit_sighand to cleanup_sighand and move it close to
copy_sighand().

This matches copy_signal/cleanup_signal naming, and I think it is easier to
follow.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 +-
 kernel/fork.c         | 12 +++++++++++-
 kernel/signal.c       | 19 ++-----------------
 3 files changed, 14 insertions(+), 19 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7dd430b697a..921148277da 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1150,8 +1150,8 @@ extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
+extern void cleanup_sighand(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
-extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0aff28cdbad..12cdd9fc9d0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,6 +803,16 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	return 0;
 }
 
+void cleanup_sighand(struct task_struct *tsk)
+{
+	struct sighand_struct * sighand = tsk->sighand;
+
+	/* Ok, we're done with the signal handlers */
+	tsk->sighand = NULL;
+	if (atomic_dec_and_test(&sighand->count))
+		kmem_cache_free(sighand_cachep, sighand);
+}
+
 static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct signal_struct *sig;
@@ -1224,7 +1234,7 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	__exit_sighand(p);
+	cleanup_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index ca1fa854e46..b29c868bd5e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -310,9 +310,7 @@ static void flush_sigqueue(struct sigpending *queue)
 /*
  * Flush all pending signals for a task.
  */
-
-void
-flush_signals(struct task_struct *t)
+void flush_signals(struct task_struct *t)
 {
 	unsigned long flags;
 
@@ -323,19 +321,6 @@ flush_signals(struct task_struct *t)
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
-/*
- * This function expects the tasklist_lock write-locked.
- */
-void __exit_sighand(struct task_struct *tsk)
-{
-	struct sighand_struct * sighand = tsk->sighand;
-
-	/* Ok, we're done with the signal handlers */
-	tsk->sighand = NULL;
-	if (atomic_dec_and_test(&sighand->count))
-		kmem_cache_free(sighand_cachep, sighand);
-}
-
 /*
  * This function expects the tasklist_lock write-locked.
  */
@@ -386,7 +371,7 @@ void __exit_signal(struct task_struct *tsk)
 	}
 
 	tsk->signal = NULL;
-	__exit_sighand(tsk);
+	cleanup_sighand(tsk);
 	spin_unlock(&sighand->siglock);
 	rcu_read_unlock();
 
-- 
cgit v1.2.3-70-g09d2


From 47e65328a7b1cdfc4e3102e50d60faf94ebba7d3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:25 -0800
Subject: [PATCH] pids: kill PIDTYPE_TGID

This patch kills PIDTYPE_TGID pid_type thus saving one hash table in
kernel/pid.c and speeding up subthreads create/destroy a bit.  It is also a
preparation for the further tref/pids rework.

This patch adds 'struct list_head thread_group' to 'struct task_struct'
instead.

We don't detach the group leader from the PIDTYPE_PID namespace until another
thread inherits its ->pid == ->tgid, so we are safe wrt a premature
free_pidmap(->tgid) call.

Currently there are no users of find_task_by_pid_type(PIDTYPE_TGID).
Should the need arise, we can use find_task_by_pid()->group_leader.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-By: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pid.h   |  1 -
 include/linux/sched.h | 11 ++++++++---
 kernel/exit.c         | 10 +---------
 kernel/fork.c         |  4 +++-
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 099e70ecf7c..5b9082cc600 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,7 +4,6 @@
 enum pid_type
 {
 	PIDTYPE_PID,
-	PIDTYPE_TGID,
 	PIDTYPE_PGID,
 	PIDTYPE_SID,
 	PIDTYPE_MAX
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a913fca9e70..99855f694eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -752,6 +752,7 @@ struct task_struct {
 
 	/* PID/PID hash table linkage. */
 	struct pid pids[PIDTYPE_MAX];
+	struct list_head thread_group;
 
 	struct completion *vfork_done;		/* for vfork() */
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
@@ -1192,13 +1193,17 @@ extern void wait_task_inactive(task_t * p);
 #define while_each_thread(g, t) \
 	while ((t = next_thread(t)) != g)
 
-extern task_t * FASTCALL(next_thread(const task_t *p));
-
 #define thread_group_leader(p)	(p->pid == p->tgid)
 
+static inline task_t *next_thread(task_t *p)
+{
+	return list_entry(rcu_dereference(p->thread_group.next),
+				task_t, thread_group);
+}
+
 static inline int thread_group_empty(task_t *p)
 {
-	return list_empty(&p->pids[PIDTYPE_TGID].pid_list);
+	return list_empty(&p->thread_group);
 }
 
 #define delay_group_leader(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index aea23e713cf..22399caf757 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@ static void __unhash_process(struct task_struct *p)
 {
 	nr_threads--;
 	detach_pid(p, PIDTYPE_PID);
-	detach_pid(p, PIDTYPE_TGID);
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
@@ -59,7 +58,7 @@ static void __unhash_process(struct task_struct *p)
 		list_del_init(&p->tasks);
 		__get_cpu_var(process_counts)--;
 	}
-
+	list_del_rcu(&p->thread_group);
 	remove_parent(p);
 }
 
@@ -964,13 +963,6 @@ asmlinkage long sys_exit(int error_code)
 	do_exit((error_code&0xff)<<8);
 }
 
-task_t fastcall *next_thread(const task_t *p)
-{
-	return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
-}
-
-EXPORT_SYMBOL(next_thread);
-
 /*
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
diff --git a/kernel/fork.c b/kernel/fork.c
index 12cdd9fc9d0..bc551efb5fd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1112,6 +1112,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * We dont wake it up yet.
 	 */
 	p->group_leader = p;
+	INIT_LIST_HEAD(&p->thread_group);
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
@@ -1165,7 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags,
 			retval = -EAGAIN;
 			goto bad_fork_cleanup_namespace;
 		}
+
 		p->group_leader = current->group_leader;
+		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 
 		if (current->signal->group_stop_count > 0) {
 			/*
@@ -1213,7 +1216,6 @@ static task_t *copy_process(unsigned long clone_flags,
 			list_add_tail(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
 		}
-		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PID, p->pid);
 		nr_threads++;
 	}
-- 
cgit v1.2.3-70-g09d2


From 4a2c7a7837da1b91468e50426066d988050e4d56 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:26 -0800
Subject: [PATCH] make fork() atomic wrt pgrp/session signals

Eric W. Biederman wrote:
>
> Ok. SUSV3/Posix is clear, fork is atomic with respect
> to signals.  Either a signal comes before or after a
> fork but not during. (See the rationale section).
> http://www.opengroup.org/onlinepubs/000095399/functions/fork.html
>
> The tasklist_lock does not stop forks from adding to a process
> group. The forks stall while the tasklist_lock is held, but a fork
> that began before we grabbed the tasklist_lock simply completes
> afterwards, and the child does not receive the signal.

This also means that a SIGSTOP or a sig_kernel_coredump() signal can't
be delivered to a pgrp/session reliably.

With this patch copy_process() returns -ERESTARTNOINTR when it
detects a pending signal, so fork() is restarted transparently
after the signals have been handled.

This patch also deletes the now-unneeded "group_stop_count > 0" check:
copy_process() can no longer succeed while a group stop is in progress.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-By: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index bc551efb5fd..aa50c848fae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1136,16 +1136,6 @@ static task_t *copy_process(unsigned long clone_flags,
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
 
-	/*
-	 * Check for pending SIGKILL! The new thread should not be allowed
-	 * to slip out of an OOM kill. (or normal SIGKILL.)
-	 */
-	if (sigismember(&current->pending.signal, SIGKILL)) {
-		write_unlock_irq(&tasklist_lock);
-		retval = -EINTR;
-		goto bad_fork_cleanup_namespace;
-	}
-
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
 		p->real_parent = current->real_parent;
@@ -1154,6 +1144,23 @@ static task_t *copy_process(unsigned long clone_flags,
 	p->parent = p->real_parent;
 
 	spin_lock(&current->sighand->siglock);
+
+	/*
+	 * Process group and session signals need to be delivered to just the
+	 * parent before the fork or both the parent and the child after the
+	 * fork. Restart if a signal comes in before we add the new process to
+	 * it's process group.
+	 * A fatal signal pending means that current will exit, so the new
+	 * thread can't slip out of an OOM kill (or normal SIGKILL).
+ 	 */
+ 	recalc_sigpending();
+	if (signal_pending(current)) {
+		spin_unlock(&current->sighand->siglock);
+		write_unlock_irq(&tasklist_lock);
+		retval = -ERESTARTNOINTR;
+		goto bad_fork_cleanup_namespace;
+	}
+
 	if (clone_flags & CLONE_THREAD) {
 		/*
 		 * Important: if an exit-all has been started then
@@ -1170,16 +1177,6 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 
-		if (current->signal->group_stop_count > 0) {
-			/*
-			 * There is an all-stop in progress for the group.
-			 * We ourselves will stop as soon as we check signals.
-			 * Make the new thread part of that group stop too.
-			 */
-			current->signal->group_stop_count++;
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
-		}
-
 		if (!cputime_eq(current->signal->it_virt_expires,
 				cputime_zero) ||
 		    !cputime_eq(current->signal->it_prof_expires,
-- 
cgit v1.2.3-70-g09d2


From a7e5328a06a2beee3a2bbfaf87ce2a7bbe937de1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:27 -0800
Subject: [PATCH] cleanup __exit_signal->cleanup_sighand path

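Move 'tsk->sighand = NULL' from cleanup_sighand() to __exit_signal(), and
rename cleanup_sighand() to __cleanup_sighand(), which now takes the
sighand_struct directly.  This makes the exit path more understandable and
allows us to call __cleanup_sighand() outside of the ->siglock protected
section.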

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 2 +-
 kernel/exit.c         | 3 ++-
 kernel/fork.c         | 8 ++------
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 99855f694eb..d04186d8cc6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,7 @@ extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
-extern void cleanup_sighand(struct task_struct *);
+extern void __cleanup_sighand(struct sighand_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/kernel/exit.c b/kernel/exit.c
index 22399caf757..bc0ec674d3f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -114,10 +114,11 @@ static void __exit_signal(struct task_struct *tsk)
 	__unhash_process(tsk);
 
 	tsk->signal = NULL;
-	cleanup_sighand(tsk);
+	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 	rcu_read_unlock();
 
+	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
diff --git a/kernel/fork.c b/kernel/fork.c
index aa50c848fae..b3f7a1bb5e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,12 +803,8 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	return 0;
 }
 
-void cleanup_sighand(struct task_struct *tsk)
+void __cleanup_sighand(struct sighand_struct *sighand)
 {
-	struct sighand_struct * sighand = tsk->sighand;
-
-	/* Ok, we're done with the signal handlers */
-	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
 		kmem_cache_free(sighand_cachep, sighand);
 }
@@ -1233,7 +1229,7 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	cleanup_sighand(p);
+	__cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
-- 
cgit v1.2.3-70-g09d2