From 1af60fbd759d31f565552fea315c2033947cfbe6 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 2 Oct 2009 18:56:53 -0400 Subject: block: get rid of the WRITE_ODIRECT flag Hi, The WRITE_ODIRECT flag is only used in one place, and that code path happens to also call blk_run_address_space. The introduction of this flag, then, could result in the device being unplugged twice for every I/O. Further, with the batching changes in the next patch, we don't want an O_DIRECT write to imply a queue unplug. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2620a8c6357..2f5fca4147c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -129,7 +129,6 @@ struct inodes_stat_t { * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device * immediately after submission. The write equivalent * of READ_SYNC. - * WRITE_ODIRECT Special case write for O_DIRECT only. * SWRITE_SYNC * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. * See SWRITE. @@ -151,7 +150,6 @@ struct inodes_stat_t { #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -- cgit v1.2.3-70-g09d2 From d9449ce35a1e8fb58dd2d419f9215562a14ecca0 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 26 Nov 2009 09:45:40 +0100 Subject: Fix regression in direct writes performance due to WRITE_ODIRECT flag removal There seems to be a regression in direct write path due to following commit in for-2.6.33 branch of block tree. commit 1af60fbd759d31f565552fea315c2033947cfbe6 Author: Jeff Moyer Date: Fri Oct 2 18:56:53 2009 -0400 block: get rid of the WRITE_ODIRECT flag Marking direct writes as WRITE_SYNC_PLUG instead of WRITE_ODIRECT, sets the NOIDLE flag in bio and hence in request. This tells CFQ to not expect more request from the queue and not idle on it (despite the fact that queue's think time is less and it is not seeky). So direct writers lose big time when competing with sequential readers. Using fio, I have run one direct writer and two sequential readers and following are the results with 2.6.32-rc7 kernel and with for-2.6.33 branch. Test ==== 1 direct writer and 2 sequential reader running simultaneously. [global] directory=/mnt/sdc/fio/ runtime=10 [seqwrite] rw=write size=4G direct=1 [seqread] rw=read size=2G numjobs=2 2.6.32-rc7 ========== direct writes: aggrb=2,968KB/s readers : aggrb=101MB/s for-2.6.33 branch ================= direct write: aggrb=19KB/s readers aggrb=137MB/s This patch brings back the WRITE_ODIRECT flag, with the difference that we don't set the BIO_RW_UNPLUG flag so that device is not unplugged after submission of request and an explicit unplug from submitter is required. That way we fix the jeff's issue of not enough merging taking place in aio path as well as make sure direct writes get their fair share. After the fix ============= for-2.6.33 + fix ---------------- direct writes: aggrb=2,728KB/s reads: aggrb=103MB/s Thanks Vivek Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- fs/direct-io.c | 2 +- include/linux/fs.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/fs/direct-io.c b/fs/direct-io.c index 3af761c8c5c..b912270942f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC_PLUG; + rw = WRITE_ODIRECT_PLUG; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2f5fca4147c..79cea805173 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -129,6 +129,7 @@ struct inodes_stat_t { * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device * immediately after submission. The write equivalent * of READ_SYNC. + * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. * SWRITE_SYNC * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. * See SWRITE. @@ -150,6 +151,7 @@ struct inodes_stat_t { #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) +#define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -- cgit v1.2.3-70-g09d2 From 98262f2762f0067375f83824d81ea929e37e6bfe Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 3 Dec 2009 09:24:48 +0100 Subject: block: Allow devices to indicate whether discarded blocks are zeroed The discard ioctl is used by mkfs utilities to clear a block device prior to putting metadata down. However, not all devices return zeroed blocks after a discard. Some drives return stale data, potentially containing old superblocks. It is therefore important to know whether discarded blocks are properly zeroed. Both ATA and SCSI drives have configuration bits that indicate whether zeroes are returned after a discard operation. Implement a block level interface that allows this information to be bubbled up the stack and queried via a new block device ioctl. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ block/blk-sysfs.c | 11 +++++++++++ block/compat_ioctl.c | 2 ++ block/ioctl.c | 2 ++ include/linux/blkdev.h | 14 ++++++++++++++ include/linux/fs.h | 1 + 6 files changed, 32 insertions(+) (limited to 'include/linux/fs.h') diff --git a/block/blk-settings.c b/block/blk-settings.c index 1ebc1fdb914..dd1f1e0e196 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -101,6 +101,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; + lim->discard_zeroes_data = -1; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -544,6 +545,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->no_cluster |= b->no_cluster; + t->discard_zeroes_data &= b->discard_zeroes_data; /* Bottom device offset aligned? */ if (offset && diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3147145edc1..8606c9543fd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -136,6 +136,11 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page) return queue_var_show(q->limits.max_discard_sectors << 9, page); } +static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_discard_zeroes_data(q), page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -313,6 +318,11 @@ static struct queue_sysfs_entry queue_discard_max_entry = { .show = queue_discard_max_show, }; +static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { + .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO }, + .show = queue_discard_zeroes_data_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -350,6 +360,7 @@ static struct attribute *default_attrs[] = { &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, + &queue_discard_zeroes_data_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 9bd086c1a4d..4eb8e9ea4af 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return compat_put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKFLSBUF: case BLKROSET: case BLKDISCARD: diff --git a/block/ioctl.c b/block/ioctl.c index 1f4d1de12b0..be48ea51fae 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKSECTGET: return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e727f6c44c4..784a919aa0d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -322,6 +322,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char no_cluster; + signed char discard_zeroes_data; }; struct request_queue @@ -1150,6 +1151,19 @@ static inline int queue_sector_discard_alignment(struct request_queue *q, & (q->limits.discard_granularity - 1); } +static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) +{ + if (q->limits.discard_zeroes_data == 1) + return 1; + + return 0; +} + +static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) +{ + return queue_discard_zeroes_data(bdev_get_queue(bdev)); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/fs.h b/include/linux/fs.h index 79cea805173..891f7d642e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -304,6 +304,7 @@ struct inodes_stat_t { #define BLKIOOPT _IO(0x12,121) #define BLKALIGNOFF _IO(0x12,122) #define BLKPBSZGET _IO(0x12,123) +#define BLKDISCARDZEROES _IO(0x12,124) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3-70-g09d2