diff options
-rw-r--r-- | Documentation/block/00-INDEX | 10 | ||||
-rw-r--r-- | Documentation/block/cfq-iosched.txt | 77 | ||||
-rw-r--r-- | Documentation/block/queue-sysfs.txt | 64 | ||||
-rw-r--r-- | block/blk-lib.c | 41 | ||||
-rw-r--r-- | block/blk-merge.c | 117 | ||||
-rw-r--r-- | block/genhd.c | 2 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 15 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 28 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 4 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 36 | ||||
-rw-r--r-- | fs/bio.c | 11 | ||||
-rw-r--r-- | fs/block_dev.c | 3 | ||||
-rw-r--r-- | fs/buffer.c | 66 | ||||
-rw-r--r-- | fs/direct-io.c | 5 | ||||
-rw-r--r-- | include/linux/blkdev.h | 14 | ||||
-rw-r--r-- | mm/filemap.c | 7 |
17 files changed, 378 insertions, 123 deletions
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index d111e3b23db..d18ecd827c4 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -3,15 +3,21 @@ biodoc.txt - Notes on the Generic Block Layer Rewrite in Linux 2.5 capability.txt - - Generic Block Device Capability (/sys/block/<disk>/capability) + - Generic Block Device Capability (/sys/block/<device>/capability) +cfq-iosched.txt + - CFQ IO scheduler tunables +data-integrity.txt + - Block data integrity deadline-iosched.txt - Deadline IO scheduler tunables ioprio.txt - Block io priorities (in CFQ scheduler) +queue-sysfs.txt + - Queue's sysfs entries request.txt - The members of struct request (in include/linux/blkdev.h) stat.txt - - Block layer statistics in /sys/block/<dev>/stat + - Block layer statistics in /sys/block/<device>/stat switching-sched.txt - Switching I/O schedulers at runtime writeback_cache_control.txt diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt index 6d670f57045..d89b4fe724d 100644 --- a/Documentation/block/cfq-iosched.txt +++ b/Documentation/block/cfq-iosched.txt @@ -1,3 +1,14 @@ +CFQ (Complete Fairness Queueing) +=============================== + +The main aim of CFQ scheduler is to provide a fair allocation of the disk +I/O bandwidth for all the processes which requests an I/O operation. + +CFQ maintains the per process queue for the processes which request I/O +operation(syncronous requests). In case of asynchronous requests, all the +requests from all the processes are batched together according to their +process's I/O priority. + CFQ ioscheduler tunables ======================== @@ -25,6 +36,72 @@ there are multiple spindles behind single LUN (Host based hardware RAID controller or for storage arrays), setting slice_idle=0 might end up in better throughput and acceptable latencies. +back_seek_max +------------- +This specifies, given in Kbytes, the maximum "distance" for backward seeking. +The distance is the amount of space from the current head location to the +sectors that are backward in terms of distance. + +This parameter allows the scheduler to anticipate requests in the "backward" +direction and consider them as being the "next" if they are within this +distance from the current head location. + +back_seek_penalty +----------------- +This parameter is used to compute the cost of backward seeking. If the +backward distance of request is just 1/back_seek_penalty from a "front" +request, then the seeking cost of two requests is considered equivalent. + +So scheduler will not bias toward one or the other request (otherwise scheduler +will bias toward front request). Default value of back_seek_penalty is 2. + +fifo_expire_async +----------------- +This parameter is used to set the timeout of asynchronous requests. Default +value of this is 248ms. + +fifo_expire_sync +---------------- +This parameter is used to set the timeout of synchronous requests. Default +value of this is 124ms. In case to favor synchronous requests over asynchronous +one, this value should be decreased relative to fifo_expire_async. + +slice_async +----------- +This parameter is same as of slice_sync but for asynchronous queue. The +default value is 40ms. + +slice_async_rq +-------------- +This parameter is used to limit the dispatching of asynchronous request to +device request queue in queue's slice time. The maximum number of request that +are allowed to be dispatched also depends upon the io priority. Default value +for this is 2. + +slice_sync +---------- +When a queue is selected for execution, the queues IO requests are only +executed for a certain amount of time(time_slice) before switching to another +queue. This parameter is used to calculate the time slice of synchronous +queue. + +time_slice is computed using the below equation:- +time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the +time_slice of synchronous queue, increase the value of slice_sync. Default +value is 100ms. + +quantum +------- +This specifies the number of request dispatched to the device queue. In a +queue's time slice, a request will not be dispatched if the number of request +in the device exceeds this parameter. This parameter is used for synchronous +request. + +In case of storage with several disk, this setting can limit the parallel +processing of request. Therefore, increasing the value can imporve the +performace although this can cause the latency of some I/O to increase due +to more number of requests. + CFQ IOPS Mode for group scheduling =================================== Basic CFQ design is to provide priority based time slices. Higher priority diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index 6518a55273e..e54ac1d5340 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -9,20 +9,71 @@ These files are the ones found in the /sys/block/xxx/queue/ directory. Files denoted with a RO postfix are readonly and the RW postfix means read-write. +add_random (RW) +---------------- +This file allows to trun off the disk entropy contribution. Default +value of this file is '1'(on). + +discard_granularity (RO) +----------------------- +This shows the size of internal allocation of the device in bytes, if +reported by the device. A value of '0' means device does not support +the discard functionality. + +discard_max_bytes (RO) +---------------------- +Devices that support discard functionality may have internal limits on +the number of bytes that can be trimmed or unmapped in a single operation. +The discard_max_bytes parameter is set by the device driver to the maximum +number of bytes that can be discarded in a single operation. Discard +requests issued to the device must not exceed this limit. A discard_max_bytes +value of 0 means that the device does not support discard functionality. + +discard_zeroes_data (RO) +------------------------ +When read, this file will show if the discarded block are zeroed by the +device or not. If its value is '1' the blocks are zeroed otherwise not. + hw_sector_size (RO) ------------------- This is the hardware sector size of the device, in bytes. +iostats (RW) +------------- +This file is used to control (on/off) the iostats accounting of the +disk. + +logical_block_size (RO) +----------------------- +This is the logcal block size of the device, in bytes. + max_hw_sectors_kb (RO) ---------------------- This is the maximum number of kilobytes supported in a single data transfer. +max_integrity_segments (RO) +--------------------------- +When read, this file shows the max limit of integrity segments as +set by block layer which a hardware controller can handle. + max_sectors_kb (RW) ------------------- This is the maximum number of kilobytes that the block layer will allow for a filesystem request. Must be smaller than or equal to the maximum size allowed by the hardware. +max_segments (RO) +----------------- +Maximum number of segments of the device. + +max_segment_size (RO) +--------------------- +Maximum segment size of the device. + +minimum_io_size (RO) +-------------------- +This is the smallest preferred io size reported by the device. + nomerges (RW) ------------- This enables the user to disable the lookup logic involved with IO @@ -45,11 +96,24 @@ per-block-cgroup request pool. IOW, if there are N block cgroups, each request queue may have upto N request pools, each independently regulated by nr_requests. +optimal_io_size (RO) +-------------------- +This is the optimal io size reported by the device. + +physical_block_size (RO) +------------------------ +This is the physical block size of device, in bytes. + read_ahead_kb (RW) ------------------ Maximum number of kilobytes to read-ahead for filesystems on this block device. +rotational (RW) +--------------- +This file is used to stat if the device is of rotational type or +non-rotational type. + rq_affinity (RW) ---------------- If this option is '1', the block layer will migrate request completions to the diff --git a/block/blk-lib.c b/block/blk-lib.c index 2b461b496a7..19cc761cacb 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -44,6 +44,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; unsigned int max_discard_sectors; + unsigned int granularity, alignment, mask; struct bio_batch bb; struct bio *bio; int ret = 0; @@ -54,18 +55,20 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!blk_queue_discard(q)) return -EOPNOTSUPP; + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + mask = granularity - 1; + alignment = (bdev_discard_alignment(bdev) >> 9) & mask; + /* * Ensure that max_discard_sectors is of the proper - * granularity + * granularity, so that requests stay aligned after a split. */ max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + max_discard_sectors = round_down(max_discard_sectors, granularity); if (unlikely(!max_discard_sectors)) { /* Avoid infinite loop below. Being cautious never hurts. */ return -EOPNOTSUPP; - } else if (q->limits.discard_granularity) { - unsigned int disc_sects = q->limits.discard_granularity >> 9; - - max_discard_sectors &= ~(disc_sects - 1); } if (flags & BLKDEV_DISCARD_SECURE) { @@ -79,25 +82,37 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bb.wait = &wait; while (nr_sects) { + unsigned int req_sects; + sector_t end_sect; + bio = bio_alloc(gfp_mask, 1); if (!bio) { ret = -ENOMEM; break; } + req_sects = min_t(sector_t, nr_sects, max_discard_sectors); + + /* + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. + */ + end_sect = sector + req_sects; + if (req_sects < nr_sects && (end_sect & mask) != alignment) { + end_sect = + round_down(end_sect - alignment, granularity) + + alignment; + req_sects = end_sect - sector; + } + bio->bi_sector = sector; bio->bi_end_io = bio_batch_end_io; bio->bi_bdev = bdev; bio->bi_private = &bb; - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; - } + bio->bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; atomic_inc(&bb.done); submit_bio(type, bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index 160035f5488..e76279e4116 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, return 0; } +static void +__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, + struct scatterlist *sglist, struct bio_vec **bvprv, + struct scatterlist **sg, int *nsegs, int *cluster) +{ + + int nbytes = bvec->bv_len; + + if (*bvprv && *cluster) { + if ((*sg)->length + nbytes > queue_max_segment_size(q)) + goto new_segment; + + if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec)) + goto new_segment; + + (*sg)->length += nbytes; + } else { +new_segment: + if (!*sg) + *sg = sglist; + else { + /* + * If the driver previously mapped a shorter + * list, we could see a termination bit + * prematurely unless it fully inits the sg + * table on each mapping. We KNOW that there + * must be more entries here or the driver + * would be buggy, so force clear the + * termination bit to avoid doing a full + * sg_init_table() in drivers for each command. + */ + (*sg)->page_link &= ~0x02; + *sg = sg_next(*sg); + } + + sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); + (*nsegs)++; + } + *bvprv = bvec; +} + /* * map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries @@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, bvprv = NULL; sg = NULL; rq_for_each_segment(bvec, rq, iter) { - int nbytes = bvec->bv_len; - - if (bvprv && cluster) { - if (sg->length + nbytes > queue_max_segment_size(q)) - goto new_segment; - - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) - goto new_segment; - - sg->length += nbytes; - } else { -new_segment: - if (!sg) - sg = sglist; - else { - /* - * If the driver previously mapped a shorter - * list, we could see a termination bit - * prematurely unless it fully inits the sg - * table on each mapping. We KNOW that there - * must be more entries here or the driver - * would be buggy, so force clear the - * termination bit to avoid doing a full - * sg_init_table() in drivers for each command. - */ - sg->page_link &= ~0x02; - sg = sg_next(sg); - } - - sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset); - nsegs++; - } - bvprv = bvec; + __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, + &nsegs, &cluster); } /* segments in rq */ @@ -199,6 +209,43 @@ new_segment: } EXPORT_SYMBOL(blk_rq_map_sg); +/** + * blk_bio_map_sg - map a bio to a scatterlist + * @q: request_queue in question + * @bio: bio being mapped + * @sglist: scatterlist being mapped + * + * Note: + * Caller must make sure sg can hold bio->bi_phys_segments entries + * + * Will return the number of sg entries setup + */ +int blk_bio_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) +{ + struct bio_vec *bvec, *bvprv; + struct scatterlist *sg; + int nsegs, cluster; + unsigned long i; + + nsegs = 0; + cluster = blk_queue_cluster(q); + + bvprv = NULL; + sg = NULL; + bio_for_each_segment(bvec, bio, i) { + __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, + &nsegs, &cluster); + } /* segments in bio */ + + if (sg) + sg_mark_end(sg); + + BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); + return nsegs; +} +EXPORT_SYMBOL(blk_bio_map_sg); + static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) diff --git a/block/genhd.c b/block/genhd.c index cac7366957c..d839723303c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -835,7 +835,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { - static void *p; + void *p; p = disk_seqf_start(seqf, pos); if (!IS_ERR_OR_NULL(p) && !*pos) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index ba91b408aba..d8456649674 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -889,6 +889,7 @@ struct bm_aio_ctx { unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 +#define BM_WRITE_ALL_PAGES 2 int error; struct kref kref; }; @@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) break; if (rw & WRITE) { - if (bm_test_page_unchanged(b->bm_pages[i])) { + if (!(flags & BM_WRITE_ALL_PAGES) && + bm_test_page_unchanged(b->bm_pages[i])) { dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); continue; } @@ -1141,6 +1143,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) } /** + * drbd_bm_write_all() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will write all pages. + */ +int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0); +} + +/** * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed. * @mdev: DRBD device. * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b2ca143d005..b953cc7c9c0 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1469,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index dbe6135a2ab..f93a0320e95 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void _tl_clear(struct drbd_conf *mdev); MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " "Lars Ellenberg <lars@linbit.com>"); @@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) /* Actions operating on the disk state, also want to work on requests that got barrier acked. */ - switch (what) { - case fail_frozen_disk_io: - case restart_frozen_disk_io: - list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { - req = list_entry(le, struct drbd_request, tl_requests); - _req_mod(req, what); - } - case connection_lost_while_pending: - case resend: - break; - default: - dev_err(DEV, "what = %d in _tl_restart()\n", what); + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); } } @@ -459,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) */ void tl_clear(struct drbd_conf *mdev) { + spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); + spin_unlock_irq(&mdev->req_lock); +} + +static void _tl_clear(struct drbd_conf *mdev) +{ struct list_head *le, *tle; struct drbd_request *r; - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, connection_lost_while_pending); /* we expect this list to be empty. */ @@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev) memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); - spin_unlock_irq(&mdev->req_lock); } void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) @@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (ns.susp_fen) { /* case1: The outdate peer handler is successful: */ if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { - tl_clear(mdev); if (test_bit(NEW_CUR_UUID, &mdev->flags)) { drbd_uuid_new_current(mdev); clear_bit(NEW_CUR_UUID, &mdev->flags); } spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index fb9dce8daa2..edb490aad8b 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ - err = drbd_bitmap_io(mdev, &drbd_bm_write, - "size changed", BM_LOCKED_MASK); + err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, + "size changed", BM_LOCKED_MASK); if (err) { rv = dev_size_error; goto out; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 910335c3092..01b2ac641c7 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case resend: + /* Simply complete (local only) READs. */ + if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { + _req_may_be_done(req, m); + break; + } + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK before the connection loss (B&C only); only P_BARRIER_ACK was missing. Trowing them out of the TL here by pretending we got a BARRIER_ACK @@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns req->private_bio = NULL; } if (rw == WRITE) { - remote = 1; + /* Need to replicate writes. Unless it is an empty flush, + * which is better mapped to a DRBD P_BARRIER packet, + * also for drbd wire protocol compatibility reasons. */ + if (unlikely(size == 0)) { + /* The only size==0 bios we expect are empty flushes. */ + D_ASSERT(bio->bi_rw & REQ_FLUSH); + remote = 0; + } else + remote = 1; } else { /* READ || READA */ if (local) { @@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns * extent. This waits for any resync activity in the corresponding * resync extent to finish, and, if necessary, pulls in the target * extent into the activity log, which involves further disk io because - * of transactional on-disk meta data updates. */ - if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { + * of transactional on-disk meta data updates. + * Empty flushes don't need to go into the activity log, they can only + * flush data for pending writes which are already in there. */ + if (rw == WRITE && local && size + && !test_bit(AL_SUSPENDED, &mdev->flags)) { req->rq_state |= RQ_IN_ACT_LOG; drbd_al_begin_io(mdev, sector); } @@ -994,7 +1011,10 @@ allocate_barrier: if (rw == WRITE && _req_conflicts(req)) goto fail_conflicting; - list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); + /* no point in adding empty flushes to the transfer log, + * they are mapped to drbd barriers already. */ + if (likely(size!=0)) + list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); /* NOTE remote first: to get the concurrent write detection right, * we must register the request before start of local IO. */ @@ -1014,6 +1034,14 @@ allocate_barrier: mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) maybe_pull_ahead(mdev); + /* If this was a flush, queue a drbd barrier/start a new epoch. + * Unless the current epoch was empty anyways, or we are not currently + * replicating, in which case there is no point. */ + if (unlikely(bio->bi_rw & REQ_FLUSH) + && mdev->newest_tle->n_writes + && drbd_should_do_remote(mdev->state)) + queue_barrier(mdev); + spin_unlock_irq(&mdev->req_lock); kfree(b); /* if someone else has beaten us to it... */ @@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) { unsigned int sz = sizeof(struct bio) + extra_size; struct kmem_cache *slab = NULL; - struct bio_slab *bslab; + struct bio_slab *bslab, *new_bio_slabs; unsigned int i, entry = -1; mutex_lock(&bio_slab_lock); @@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) if (bio_slab_nr == bio_slab_max && entry == -1) { bio_slab_max <<= 1; - bio_slabs = krealloc(bio_slabs, - bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!bio_slabs) + new_bio_slabs = krealloc(bio_slabs, + bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!new_bio_slabs) goto out_unlock; + bio_slabs = new_bio_slabs; } if (entry == -1) entry = bio_slab_nr++; diff --git a/fs/block_dev.c b/fs/block_dev.c index 1e519195d45..38e721b35d4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); + blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; @@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL_GPL(blkdev_aio_write); diff --git a/fs/buffer.c b/fs/buffer.c index 9f6d2e41281..58e2e7b7737 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) /* * Initialise the state of a blockdev page's buffers. */ -static void +static sector_t init_page_buffers(struct page *page, struct block_device *bdev, sector_t block, int size) { @@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev, block++; bh = bh->b_this_page; } while (bh != head); + + /* + * Caller needs to validate requested block against end of device. + */ + return end_block; } /* * Create the page-cache page that contains the requested block. * - * This is user purely for blockdev mappings. + * This is used purely for blockdev mappings. */ -static struct page * +static int grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size) + pgoff_t index, int size, int sizebits) { struct inode *inode = bdev->bd_inode; struct page *page; struct buffer_head *bh; + sector_t end_block; + int ret = 0; /* Will call free_more_memory() */ page = find_or_create_page(inode->i_mapping, index, (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); if (!page) - return NULL; + return ret; BUG_ON(!PageLocked(page)); if (page_has_buffers(page)) { bh = page_buffers(page); if (bh->b_size == size) { - init_page_buffers(page, bdev, block, size); - return page; + end_block = init_page_buffers(page, bdev, + index << sizebits, size); + goto done; } if (!try_to_free_buffers(page)) goto failed; @@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(page, bh); - init_page_buffers(page, bdev, block, size); + end_block = init_page_buffers(page, bdev, index << sizebits, size); spin_unlock(&inode->i_mapping->private_lock); - return page; - +done: + ret = (block < end_block) ? 1 : -ENXIO; failed: unlock_page(page); page_cache_release(page); - return NULL; + return ret; } /* @@ -999,7 +1007,6 @@ failed: static int grow_buffers(struct block_device *bdev, sector_t block, int size) { - struct page *page; pgoff_t index; int sizebits; @@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) bdevname(bdev, b)); return -EIO; } - block = index << sizebits; + /* Create a page with the proper size buffers.. */ - page = grow_dev_page(bdev, block, index, size); - if (!page) - return 0; - unlock_page(page); - page_cache_release(page); - return 1; + return grow_dev_page(bdev, block, index, size, sizebits); } static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, int size) { - int ret; - struct buffer_head *bh; - /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || (size < 512 || size > PAGE_SIZE))) { @@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) return NULL; } -retry: - bh = __find_get_block(bdev, block, size); - if (bh) - return bh; + for (;;) { + struct buffer_head *bh; + int ret; - ret = grow_buffers(bdev, block, size); - if (ret == 0) { - free_more_memory(); - goto retry; - } else if (ret > 0) { bh = __find_get_block(bdev, block, size); if (bh) return bh; + + ret = grow_buffers(bdev, block, size); + if (ret < 0) + return NULL; + if (ret == 0) + free_more_memory(); } - return NULL; } /* @@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block); * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * - * __getblk() cannot fail - it just keeps trying. If you pass it an - * illegal block number, __getblk() will happily return a buffer_head - * which represents the non-existent block. Very weird. - * * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() * attempt is failing. FIXME, perhaps? */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 1faf4cb56f3..f86c720dba0 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, unsigned long user_addr; size_t bytes; struct buffer_head map_bh = { 0, }; + struct blk_plug plug; if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, PAGE_SIZE - user_addr / PAGE_SIZE); } + blk_start_plug(&plug); + for (seg = 0; seg < nr_segs; seg++) { user_addr = (unsigned long)iov[seg].iov_base; sdio.size += bytes = iov[seg].iov_len; @@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (sdio.bio) dio_bio_submit(dio, &sdio); + blk_finish_plug(&plug); + /* * It is possible that, we return short IO due to end of file. * In that case, we need to release all the pages we got hold on. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e72a9d4823..4a2ab7c8539 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) * it already be started by driver. */ #define RQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD) #define rq_mergeable(rq) \ (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ (((rq)->cmd_flags & REQ_DISCARD) || \ @@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); +extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); @@ -1139,6 +1141,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector & (lim->discard_granularity - 1); } +static inline int bdev_discard_alignment(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (bdev != bdev->bd_contains) + return bdev->bd_part->discard_alignment; + + return q->limits.discard_alignment; +} + static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) { if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) diff --git a/mm/filemap.c b/mm/filemap.c index fa5ca304148..384344575c3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = filemap_write_and_wait_range(mapping, pos, pos + iov_length(iov, nr_segs) - 1); if (!retval) { - struct blk_plug plug; - - blk_start_plug(&plug); retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); - blk_finish_plug(&plug); } if (retval > 0) { *ppos = pos + retval; @@ -2527,14 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); - blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2545,7 +2539,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } - blk_finish_plug(&plug); sb_end_write(inode->i_sb); return ret; } |