From 260ea1013283d8acbb451459ed1ca560c1445c20 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Fri, 23 Jun 2006 02:05:18 -0700
Subject: [PATCH] ptrace: document the locking rules

After a lot of reading the code and thinking about how it behaves I have
managed to figure out what the current ptrace locking rules are.  The
current code is in much better shape than it appears at first glance.  The
troublesome code paths are actually the code paths that violate the
current rules.

ptrace uses simple exclusive access as its locking scheme.  You can only
touch task->ptrace if the task is stopped and you are the ptracer, or if
the task is running and you are the task itself.  Very simple, very easy
to maintain.  It just needs to be documented so people know not to touch
ptrace from elsewhere.

Currently we do have a few pieces of code that are in violation of this
rule, particularly the core dump code and ptrace_attach.  But so far the
code looks fixable.

Signed-off-by: Eric W. Biederman
Cc: Oleg Nesterov
Cc: Roland McGrath
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 267f1525704..a9d23c7d1b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1225,7 +1225,7 @@ static inline int thread_group_empty(task_t *p)
 		(thread_group_leader(p) && !thread_group_empty(p))
 
 /*
- * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
+ * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
  * pins the final release of task.io_context.  Also protects ->cpuset.
  *
--
cgit v1.2.3-70-g09d2
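The rule this patch documents is plain exclusive access: task->ptrace is
touched either by the task itself while it is running, or by its ptracer
while the task is stopped.  A minimal sketch of that rule as a predicate,
assuming 2.6.17-era task_struct fields; the helper name is hypothetical
and not part of the patch:

	#include <linux/sched.h>
	#include <linux/ptrace.h>

	/*
	 * Hypothetical helper, for illustration only: expresses the
	 * exclusive-access rule described in the commit message above.
	 */
	static int may_touch_ptrace(struct task_struct *task)
	{
		/* The task itself may touch its own ->ptrace while running. */
		if (task == current)
			return 1;

		/*
		 * Otherwise only the ptracer may touch it, and only while the
		 * task is stopped under the tracer's control (TASK_TRACED is
		 * an assumption about what "stopped" means for this rule).
		 */
		if ((task->ptrace & PT_PTRACED) && task->parent == current &&
		    task->state == TASK_TRACED)
			return 1;

		return 0;
	}

Anything that modifies task->ptrace from outside this pattern -- as the
commit message notes for the core dump code and ptrace_attach -- is what
needs fixing, not the rule itself.
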
From b31dc66a54ad986b6b73bdc49c8efc17cbad1833 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 13 Jun 2006 08:26:10 +0200
Subject: [PATCH] Kill PF_SYNCWRITE flag

A process flag to indicate whether we are doing sync io is incredibly
ugly.  It also causes performance problems when one does a lot of async
io and then proceeds to sync it.  Part of the io will go out as async,
and the other part as sync.  This causes a disconnect between the
previously submitted io and the synced io.  For io schedulers such as
CFQ, this leads to lost merges and suboptimal scheduling behaviour.

Remove PF_SYNCWRITE completely from the fsync/msync paths, and let the
O_DIRECT path just directly indicate that the writes are sync by using
WRITE_SYNC instead.

Signed-off-by: Jens Axboe
---
 block/as-iosched.c                |  2 +-
 block/cfq-iosched.c               |  4 +---
 block/ll_rw_blk.c                 |  3 +++
 drivers/usb/gadget/file_storage.c |  2 --
 fs/buffer.c                       |  2 --
 fs/direct-io.c                    | 18 ++++++++----------
 fs/fs-writeback.c                 |  2 --
 include/linux/blkdev.h            |  2 ++
 include/linux/sched.h             | 11 +++++------
 mm/msync.c                        |  3 ---
 10 files changed, 20 insertions(+), 29 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 9b13d72ffef..56c99fa037d 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1339,7 +1339,7 @@ static void as_add_request(request_queue_t *q, struct request *rq)
 	arq->state = AS_RQ_NEW;
 
 	if (rq_data_dir(arq->request) == READ
-			|| current->flags&PF_SYNCWRITE)
+			|| (arq->request->flags & REQ_RW_SYNC))
 		arq->is_sync = 1;
 	else
 		arq->is_sync = 0;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c88f161d3fb..4c4e9cc3ae2 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -277,8 +277,6 @@ static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsi
 static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 
-#define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
-
 /*
  * lots of deadline iosched dupes, can be abstracted later...
  */
@@ -334,7 +332,7 @@ static int cfq_queue_empty(request_queue_t *q)
 
 static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
 {
-	if (rw == READ || process_sync(task))
+	if (rw == READ || rw == WRITE_SYNC)
 		return task->pid;
 
 	return CFQ_KEY_ASYNC;
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 17c42ddd31d..2270bb45138 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2827,6 +2827,9 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
 	if (unlikely(bio_barrier(bio)))
 		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
+	if (bio_sync(bio))
+		req->flags |= REQ_RW_SYNC;
+
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
 	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 6f887478b14..a43dc908ac5 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -1906,7 +1906,6 @@ static int fsync_sub(struct lun *curlun)
 
 	inode = filp->f_dentry->d_inode;
 	mutex_lock(&inode->i_mutex);
-	current->flags |= PF_SYNCWRITE;
 	rc = filemap_fdatawrite(inode->i_mapping);
 	err = filp->f_op->fsync(filp, filp->f_dentry, 1);
 	if (!rc)
@@ -1914,7 +1913,6 @@ static int fsync_sub(struct lun *curlun)
 	err = filemap_fdatawait(inode->i_mapping);
 	if (!rc)
 		rc = err;
-	current->flags &= ~PF_SYNCWRITE;
 	mutex_unlock(&inode->i_mutex);
 	VLDBG(curlun, "fdatasync -> %d\n", rc);
 	return rc;
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f1f3a6807..373bb6292bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync)
 		goto out;
 	}
 
-	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(mapping);
 
 	/*
@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync)
 	err = filemap_fdatawait(mapping);
 	if (!ret)
 		ret = err;
-	current->flags &= ~PF_SYNCWRITE;
 out:
 	return ret;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b05d1b21877..538fb0418fb 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 		NULL);				/* vmas */
 	up_read(&current->mm->mmap_sem);
 
-	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
-		create = dio->rw == WRITE;
+		create = dio->rw & WRITE;
 		if (dio->lock_type == DIO_LOCKING) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
@@ -867,7 +867,7 @@ do_holes:
 			loff_t i_size_aligned;
 
 			/* AKPM: eargh, -ENOTBLK is a hack */
-			if (dio->rw == WRITE) {
+			if (dio->rw & WRITE) {
 				page_cache_release(page);
 				return -ENOTBLK;
 			}
@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && rw == WRITE) {
+	if (ret == -ENOTBLK && (rw & WRITE)) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->is_async) {
 		int should_wait = 0;
 
-		if (dio->result < dio->size && rw == WRITE) {
+		if (dio->result < dio->size && (rw & WRITE)) {
 			dio->waiter = current;
 			should_wait = 1;
 		}
@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 			ret = transferred;
 
 		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && rw == WRITE &&
+		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
 		    ret >= 0 && dio->result == dio->size)
 			/*
 			 * For AIO writes where we have completed the
@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		current->flags |= PF_SYNCWRITE;
+		rw = WRITE_SYNC;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * even for AIO, we need to wait for i/o to complete before
 	 * returning in this case.
 	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
@@ -1284,8 +1284,6 @@ out:
 		mutex_unlock(&inode->i_mutex);
 	else if (acquire_i_mutex)
 		mutex_lock(&inode->i_mutex);
-	if (rw & WRITE)
-		current->flags &= ~PF_SYNCWRITE;
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6db95cf3aaa..031b27a4bc9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 	int need_write_inode_now = 0;
 	int err2;
 
-	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 		if (!err)
 			err = err2;
 	}
-	current->flags &= ~PF_SYNCWRITE;
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3457e7b9736..482a21d6762 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -241,6 +241,7 @@ enum rq_flag_bits {
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
+	__REQ_RW_SYNC,		/* request is sync (O_DIRECT) */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -270,6 +271,7 @@ enum rq_flag_bits {
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
 #define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
+#define REQ_RW_SYNC	(1 << __REQ_RW_SYNC)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a9d23c7d1b2..38b4791e6a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -941,12 +941,11 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_KSWAPD	0x00040000	/* I am kswapd */
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
-#define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
-#define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
-#define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
-#define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
-#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
-#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_BORROWED_MM	0x00200000	/* I am a kthread doing use_mm */
+#define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
+#define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
diff --git a/mm/msync.c b/mm/msync.c
index bc6c9537636..d083544df21 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
 	down_read(&current->mm->mmap_sem);
-	if (flags & MS_SYNC)
-		current->flags |= PF_SYNCWRITE;
	vma = find_vma(current->mm, start);
 	if (!vma) {
 		error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 		}
 	} while (vma && !done);
 out_unlock:
-	current->flags &= ~PF_SYNCWRITE;
 	up_read(&current->mm->mmap_sem);
 out:
 	return error;
--
cgit v1.2.3-70-g09d2
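
Taken together, the hunks above move the "this write is synchronous" hint
off the submitting process and onto the I/O itself: the submitter tags the
bio with WRITE_SYNC, init_request_from_bio() copies that into REQ_RW_SYNC
on the request, and the I/O schedulers inspect the request (AS) or the rw
value (CFQ) instead of current->flags.  A minimal sketch of the resulting
submission path, assuming the 2.6.17-era submit_bio()/bio_sync()
interfaces; the caller name is hypothetical and not part of the patch:

	#include <linux/fs.h>
	#include <linux/bio.h>

	/*
	 * Hypothetical caller, for illustration only: the sync hint now
	 * rides on the bio instead of on current->flags.
	 */
	static void submit_sync_write(struct bio *bio)
	{
		/* WRITE_SYNC marks this particular bio as a sync write. */
		submit_bio(WRITE_SYNC, bio);

		/*
		 * In the block layer (see the ll_rw_blk.c hunk above),
		 * init_request_from_bio() then does:
		 *
		 *	if (bio_sync(bio))
		 *		req->flags |= REQ_RW_SYNC;
		 *
		 * so as-iosched tests the request flag and CFQ tests
		 * rw == WRITE_SYNC, with no PF_SYNCWRITE bookkeeping.
		 */
	}

Keeping the hint with each bio means a process that mixes async and sync
writes no longer toggles a single per-process flag back and forth, which
is exactly the disconnect the commit message describes.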