summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r-- block/blk-cgroup.c 16
-rw-r--r-- block/blk-cgroup.h 14
-rw-r--r-- block/blk-core.c 673
-rw-r--r-- block/blk-exec.c 4
-rw-r--r-- block/blk-flush.c 445
-rw-r--r-- block/blk-lib.c 23
-rw-r--r-- block/blk-merge.c 6
-rw-r--r-- block/blk-settings.c 15
-rw-r--r-- block/blk-sysfs.c 2
-rw-r--r-- block/blk-throttle.c 170
-rw-r--r-- block/blk.h 16
-rw-r--r-- block/cfq-iosched.c 179
-rw-r--r-- block/cfq.h 6
-rw-r--r-- block/deadline-iosched.c 9
-rw-r--r-- block/elevator.c 112
-rw-r--r-- block/genhd.c 20
-rw-r--r-- block/ioctl.c 8
-rw-r--r-- block/noop-iosched.c 8
18 files changed, 1027 insertions, 699 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 455768a3eb9..2bef5705ce2 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -371,12 +371,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
}
EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
-void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
+ unsigned long unaccounted_time)
{
unsigned long flags;
spin_lock_irqsave(&blkg->stats_lock, flags);
blkg->stats.time += time;
+ blkg->stats.unaccounted_time += unaccounted_time;
spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
@@ -604,6 +606,9 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
blkg->stats.sectors, cb, dev);
#ifdef CONFIG_DEBUG_BLK_CGROUP
+ if (type == BLKIO_STAT_UNACCOUNTED_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.unaccounted_time, cb, dev);
if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
uint64_t sum = blkg->stats.avg_queue_size_sum;
uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -1125,6 +1130,9 @@ static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
return blkio_read_blkg_stats(blkcg, cft, cb,
BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
+ case BLKIO_PROP_unaccounted_time:
+ return blkio_read_blkg_stats(blkcg, cft, cb,
+ BLKIO_STAT_UNACCOUNTED_TIME, 0);
case BLKIO_PROP_dequeue:
return blkio_read_blkg_stats(blkcg, cft, cb,
BLKIO_STAT_DEQUEUE, 0);
@@ -1382,6 +1390,12 @@ struct cftype blkio_files[] = {
BLKIO_PROP_dequeue),
.read_map = blkiocg_file_read_map,
},
+ {
+ .name = "unaccounted_time",
+ .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
+ BLKIO_PROP_unaccounted_time),
+ .read_map = blkiocg_file_read_map,
+ },
#endif
};
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index ea4861bdd54..10919fae2d3 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -49,6 +49,8 @@ enum stat_type {
/* All the single valued stats go below this */
BLKIO_STAT_TIME,
BLKIO_STAT_SECTORS,
+ /* Time not charged to this cgroup */
+ BLKIO_STAT_UNACCOUNTED_TIME,
#ifdef CONFIG_DEBUG_BLK_CGROUP
BLKIO_STAT_AVG_QUEUE_SIZE,
BLKIO_STAT_IDLE_TIME,
@@ -81,6 +83,7 @@ enum blkcg_file_name_prop {
BLKIO_PROP_io_serviced,
BLKIO_PROP_time,
BLKIO_PROP_sectors,
+ BLKIO_PROP_unaccounted_time,
BLKIO_PROP_io_service_time,
BLKIO_PROP_io_wait_time,
BLKIO_PROP_io_merged,
@@ -114,6 +117,8 @@ struct blkio_group_stats {
/* total disk time and nr sectors dispatched by this group */
uint64_t time;
uint64_t sectors;
+ /* Time not charged to this cgroup */
+ uint64_t unaccounted_time;
uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Sum of number of IOs queued across all samples */
@@ -240,7 +245,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
#endif
-#define BLKIO_WEIGHT_MIN 100
+#define BLKIO_WEIGHT_MIN 10
#define BLKIO_WEIGHT_MAX 1000
#define BLKIO_WEIGHT_DEFAULT 500
@@ -293,7 +298,8 @@ extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
void *key);
void blkiocg_update_timeslice_used(struct blkio_group *blkg,
- unsigned long time);
+ unsigned long time,
+ unsigned long unaccounted_time);
void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
bool direction, bool sync);
void blkiocg_update_completion_stats(struct blkio_group *blkg,
@@ -319,7 +325,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
static inline struct blkio_group *
blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
- unsigned long time) {}
+ unsigned long time,
+ unsigned long unaccounted_time)
+{}
static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
uint64_t bytes, bool direction, bool sync) {}
static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
diff --git a/block/blk-core.c b/block/blk-core.c
index 2f4002f79a2..59b5c00c012 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
+#include <linux/list_sort.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -149,39 +150,29 @@ EXPORT_SYMBOL(blk_rq_init);
static void req_bio_endio(struct request *rq, struct bio *bio,
unsigned int nbytes, int error)
{
- struct request_queue *q = rq->q;
-
- if (&q->flush_rq != rq) {
- if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
+ if (error)
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ error = -EIO;
- if (unlikely(nbytes > bio->bi_size)) {
- printk(KERN_ERR "%s: want %u bytes done, %u left\n",
- __func__, nbytes, bio->bi_size);
- nbytes = bio->bi_size;
- }
+ if (unlikely(nbytes > bio->bi_size)) {
+ printk(KERN_ERR "%s: want %u bytes done, %u left\n",
+ __func__, nbytes, bio->bi_size);
+ nbytes = bio->bi_size;
+ }
- if (unlikely(rq->cmd_flags & REQ_QUIET))
- set_bit(BIO_QUIET, &bio->bi_flags);
+ if (unlikely(rq->cmd_flags & REQ_QUIET))
+ set_bit(BIO_QUIET, &bio->bi_flags);
- bio->bi_size -= nbytes;
- bio->bi_sector += (nbytes >> 9);
+ bio->bi_size -= nbytes;
+ bio->bi_sector += (nbytes >> 9);
- if (bio_integrity(bio))
- bio_integrity_advance(bio, nbytes);
+ if (bio_integrity(bio))
+ bio_integrity_advance(bio, nbytes);
- if (bio->bi_size == 0)
- bio_endio(bio, error);
- } else {
- /*
- * Okay, this is the sequenced flush request in
- * progress, just record the error;
- */
- if (error && !q->flush_err)
- q->flush_err = error;
- }
+ /* don't actually finish bio if it's part of flush sequence */
+ if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+ bio_endio(bio, error);
}
void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -208,135 +199,43 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
EXPORT_SYMBOL(blk_dump_rq_flags);
/*
- * "plug" the device if there are no outstanding requests: this will
- * force the transfer to start only after we have put all the requests
- * on the list.
- *
- * This is called with interrupts off and no requests on the queue and
- * with the queue lock held.
- */
-void blk_plug_device(struct request_queue *q)
+ * Make sure that plugs that were pending when this function was entered,
+ * are now complete and requests pushed to the queue.
+*/
+static inline void queue_sync_plugs(struct request_queue *q)
{
- WARN_ON(!irqs_disabled());
-
/*
- * don't plug a stopped queue, it must be paired with blk_start_queue()
- * which will restart the queueing
+ * If the current process is plugged and has barriers submitted,
+ * we will livelock if we don't unplug first.
*/
- if (blk_queue_stopped(q))
- return;
-
- if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
- mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
- trace_block_plug(q);
- }
-}
-EXPORT_SYMBOL(blk_plug_device);
-
-/**
- * blk_plug_device_unlocked - plug a device without queue lock held
- * @q: The &struct request_queue to plug
- *
- * Description:
- * Like @blk_plug_device(), but grabs the queue lock and disables
- * interrupts.
- **/
-void blk_plug_device_unlocked(struct request_queue *q)
-{
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- blk_plug_device(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_plug_device_unlocked);
-
-/*
- * remove the queue from the plugged list, if present. called with
- * queue lock held and interrupts disabled.
- */
-int blk_remove_plug(struct request_queue *q)
-{
- WARN_ON(!irqs_disabled());
-
- if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
- return 0;
-
- del_timer(&q->unplug_timer);
- return 1;
+ blk_flush_plug(current);
}
-EXPORT_SYMBOL(blk_remove_plug);
-/*
- * remove the plug and let it rip..
- */
-void __generic_unplug_device(struct request_queue *q)
+static void blk_delay_work(struct work_struct *work)
{
- if (unlikely(blk_queue_stopped(q)))
- return;
- if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
- return;
+ struct request_queue *q;
- q->request_fn(q);
+ q = container_of(work, struct request_queue, delay_work.work);
+ spin_lock_irq(q->queue_lock);
+ __blk_run_queue(q, false);
+ spin_unlock_irq(q->queue_lock);
}
/**
- * generic_unplug_device - fire a request queue
- * @q: The &struct request_queue in question
+ * blk_delay_queue - restart queueing after defined interval
+ * @q: The &struct request_queue in question
+ * @msecs: Delay in msecs
*
* Description:
- * Linux uses plugging to build bigger requests queues before letting
- * the device have at them. If a queue is plugged, the I/O scheduler
- * is still adding and merging requests on the queue. Once the queue
- * gets unplugged, the request_fn defined for the queue is invoked and
- * transfers started.
- **/
-void generic_unplug_device(struct request_queue *q)
-{
- if (blk_queue_plugged(q)) {
- spin_lock_irq(q->queue_lock);
- __generic_unplug_device(q);
- spin_unlock_irq(q->queue_lock);
- }
-}
-EXPORT_SYMBOL(generic_unplug_device);
-
-static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
- struct page *page)
-{
- struct request_queue *q = bdi->unplug_io_data;
-
- blk_unplug(q);
-}
-
-void blk_unplug_work(struct work_struct *work)
-{
- struct request_queue *q =
- container_of(work, struct request_queue, unplug_work);
-
- trace_block_unplug_io(q);
- q->unplug_fn(q);
-}
-
-void blk_unplug_timeout(unsigned long data)
-{
- struct request_queue *q = (struct request_queue *)data;
-
- trace_block_unplug_timer(q);
- kblockd_schedule_work(q, &q->unplug_work);
-}
-
-void blk_unplug(struct request_queue *q)
+ * Sometimes queueing needs to be postponed for a little while, to allow
+ * resources to come back. This function will make sure that queueing is
+ * restarted around the specified time.
+ */
+void blk_delay_queue(struct request_queue *q, unsigned long msecs)
{
- /*
- * devices don't necessarily have an ->unplug_fn defined
- */
- if (q->unplug_fn) {
- trace_block_unplug_io(q);
- q->unplug_fn(q);
- }
+ schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs));
}
-EXPORT_SYMBOL(blk_unplug);
+EXPORT_SYMBOL(blk_delay_queue);
/**
* blk_start_queue - restart a previously stopped queue
@@ -352,7 +251,7 @@ void blk_start_queue(struct request_queue *q)
WARN_ON(!irqs_disabled());
queue_flag_clear(QUEUE_FLAG_STOPPED, q);
- __blk_run_queue(q);
+ __blk_run_queue(q, false);
}
EXPORT_SYMBOL(blk_start_queue);
@@ -372,7 +271,7 @@ EXPORT_SYMBOL(blk_start_queue);
**/
void blk_stop_queue(struct request_queue *q)
{
- blk_remove_plug(q);
+ cancel_delayed_work(&q->delay_work);
queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
@@ -390,46 +289,43 @@ EXPORT_SYMBOL(blk_stop_queue);
* that its ->make_request_fn will not re-add plugging prior to calling
* this function.
*
+ * This function does not cancel any asynchronous activity arising
+ * out of elevator or throttling code. That would require elevator_exit()
+ * and blk_throtl_exit() to be called with queue lock initialized.
+ *
*/
void blk_sync_queue(struct request_queue *q)
{
- del_timer_sync(&q->unplug_timer);
del_timer_sync(&q->timeout);
- cancel_work_sync(&q->unplug_work);
- throtl_shutdown_timer_wq(q);
+ cancel_delayed_work_sync(&q->delay_work);
+ queue_sync_plugs(q);
}
EXPORT_SYMBOL(blk_sync_queue);
/**
* __blk_run_queue - run a single device queue
* @q: The queue to run
+ * @force_kblockd: Don't run @q->request_fn directly. Use kblockd.
*
* Description:
* See @blk_run_queue. This variant must be called with the queue lock
* held and interrupts disabled.
*
*/
-void __blk_run_queue(struct request_queue *q)
+void __blk_run_queue(struct request_queue *q, bool force_kblockd)
{
- blk_remove_plug(q);
-
if (unlikely(blk_queue_stopped(q)))
return;
- if (elv_queue_empty(q))
- return;
-
/*
* Only recurse once to avoid overrunning the stack, let the unplug
* handling reinvoke the handler shortly if we already got there.
*/
- if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+ if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
q->request_fn(q);
queue_flag_clear(QUEUE_FLAG_REENTER, q);
- } else {
- queue_flag_set(QUEUE_FLAG_PLUGGED, q);
- kblockd_schedule_work(q, &q->unplug_work);
- }
+ } else
+ queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
}
EXPORT_SYMBOL(__blk_run_queue);
@@ -446,7 +342,7 @@ void blk_run_queue(struct request_queue *q)
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
- __blk_run_queue(q);
+ __blk_run_queue(q, false);
spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);
@@ -456,6 +352,11 @@ void blk_put_queue(struct request_queue *q)
kobject_put(&q->kobj);
}
+/*
+ * Note: If a driver supplied the queue lock, it should not zap that lock
+ * unexpectedly as some queue cleanup components like elevator_exit() and
+ * blk_throtl_exit() need queue lock.
+ */
void blk_cleanup_queue(struct request_queue *q)
{
/*
@@ -474,6 +375,8 @@ void blk_cleanup_queue(struct request_queue *q)
if (q->elevator)
elevator_exit(q->elevator);
+ blk_throtl_exit(q);
+
blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);
@@ -516,8 +419,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
- q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
- q->backing_dev_info.unplug_io_data = q;
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
@@ -537,17 +438,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
- init_timer(&q->unplug_timer);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
- INIT_LIST_HEAD(&q->pending_flushes);
- INIT_WORK(&q->unplug_work, blk_unplug_work);
+ INIT_LIST_HEAD(&q->flush_queue[0]);
+ INIT_LIST_HEAD(&q->flush_queue[1]);
+ INIT_LIST_HEAD(&q->flush_data_in_flight);
+ INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
kobject_init(&q->kobj, &blk_queue_ktype);
mutex_init(&q->sysfs_lock);
spin_lock_init(&q->__queue_lock);
+ /*
+ * By default initialize queue_lock to internal lock and driver can
+ * override it later if need be.
+ */
+ q->queue_lock = &q->__queue_lock;
+
return q;
}
EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -630,9 +538,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unprep_rq_fn = NULL;
- q->unplug_fn = generic_unplug_device;
q->queue_flags = QUEUE_FLAG_DEFAULT;
- q->queue_lock = lock;
+
+ /* Override internal queue lock with supplied lock pointer */
+ if (lock)
+ q->queue_lock = lock;
/*
* This also sets hw/phys segments, boundary and size
@@ -665,6 +575,8 @@ int blk_get_queue(struct request_queue *q)
static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
+ BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
if (rq->cmd_flags & REQ_ELVPRIV)
elv_put_request(q, rq);
mempool_free(rq, q->rq.rq_pool);
@@ -761,6 +673,25 @@ static void freed_request(struct request_queue *q, int sync, int priv)
}
/*
+ * Determine if elevator data should be initialized when allocating the
+ * request associated with @bio.
+ */
+static bool blk_rq_should_init_elevator(struct bio *bio)
+{
+ if (!bio)
+ return true;
+
+ /*
+ * Flush requests do not use the elevator so skip initialization.
+ * This allows a request to share the flush and elevator data.
+ */
+ if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
+ return false;
+
+ return true;
+}
+
+/*
* Get a free request, queue_lock must be held.
* Returns NULL on failure, with queue_lock held.
* Returns !NULL on success, with queue_lock *not held*.
@@ -772,7 +703,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
struct request_list *rl = &q->rq;
struct io_context *ioc = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
- int may_queue, priv;
+ int may_queue, priv = 0;
may_queue = elv_may_queue(q, rw_flags);
if (may_queue == ELV_MQUEUE_NO)
@@ -816,9 +747,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
- priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
- rl->elvpriv++;
+ if (blk_rq_should_init_elevator(bio)) {
+ priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ if (priv)
+ rl->elvpriv++;
+ }
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
@@ -865,8 +798,8 @@ out:
}
/*
- * No available requests for this queue, unplug the device and wait for some
- * requests to become available.
+ * No available requests for this queue, wait for some requests to become
+ * available.
*
* Called with q->queue_lock held, and returns with it unlocked.
*/
@@ -887,7 +820,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
trace_block_sleeprq(q, bio, rw_flags & 1);
- __generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
io_schedule();
@@ -1009,6 +941,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
}
EXPORT_SYMBOL(blk_requeue_request);
+static void add_acct_request(struct request_queue *q, struct request *rq,
+ int where)
+{
+ drive_stat_acct(rq, 1);
+ __elv_add_request(q, rq, where);
+}
+
/**
* blk_insert_request - insert a special request into a request queue
* @q: request queue where request should be inserted
@@ -1051,9 +990,8 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
if (blk_rq_tagged(rq))
blk_queue_end_tag(q, rq);
- drive_stat_acct(rq, 1);
- __elv_add_request(q, rq, where, 0);
- __blk_run_queue(q);
+ add_acct_request(q, rq, where);
+ __blk_run_queue(q, false);
spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_insert_request);
@@ -1173,6 +1111,113 @@ void blk_add_request_payload(struct request *rq, struct page *page,
}
EXPORT_SYMBOL_GPL(blk_add_request_payload);
+static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+ struct bio *bio)
+{
+ const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+
+ /*
+ * Debug stuff, kill later
+ */
+ if (!rq_mergeable(req)) {
+ blk_dump_rq_flags(req, "back");
+ return false;
+ }
+
+ if (!ll_back_merge_fn(q, req, bio))
+ return false;
+
+ trace_block_bio_backmerge(q, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->__data_len += bio->bi_size;
+ req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+ drive_stat_acct(req, 0);
+ return true;
+}
+
+static bool bio_attempt_front_merge(struct request_queue *q,
+ struct request *req, struct bio *bio)
+{
+ const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+ sector_t sector;
+
+ /*
+ * Debug stuff, kill later
+ */
+ if (!rq_mergeable(req)) {
+ blk_dump_rq_flags(req, "front");
+ return false;
+ }
+
+ if (!ll_front_merge_fn(q, req, bio))
+ return false;
+
+ trace_block_bio_frontmerge(q, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ sector = bio->bi_sector;
+
+ bio->bi_next = req->bio;
+ req->bio = bio;
+
+ /*
+ * may not be valid. if the low level driver said
+ * it didn't need a bounce buffer then it better
+ * not touch req->buffer either...
+ */
+ req->buffer = bio_data(bio);
+ req->__sector = bio->bi_sector;
+ req->__data_len += bio->bi_size;
+ req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+ drive_stat_acct(req, 0);
+ return true;
+}
+
+/*
+ * Attempts to merge with the plugged list in the current process. Returns
+ * true if merge was successful, otherwise false.
+ */
+static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
+ struct bio *bio)
+{
+ struct blk_plug *plug;
+ struct request *rq;
+ bool ret = false;
+
+ plug = tsk->plug;
+ if (!plug)
+ goto out;
+
+ list_for_each_entry_reverse(rq, &plug->list, queuelist) {
+ int el_ret;
+
+ if (rq->q != q)
+ continue;
+
+ el_ret = elv_try_merge(rq, bio);
+ if (el_ret == ELEVATOR_BACK_MERGE) {
+ ret = bio_attempt_back_merge(q, rq, bio);
+ if (ret)
+ break;
+ } else if (el_ret == ELEVATOR_FRONT_MERGE) {
+ ret = bio_attempt_front_merge(q, rq, bio);
+ if (ret)
+ break;
+ }
+ }
+out:
+ return ret;
+}
+
void init_request_from_bio(struct request *req, struct bio *bio)
{
req->cpu = bio->bi_comp_cpu;
@@ -1188,26 +1233,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
blk_rq_bio_prep(req->q, req, bio);
}
-/*
- * Only disabling plugging for non-rotational devices if it does tagging
- * as well, otherwise we do need the proper merging
- */
-static inline bool queue_should_plug(struct request_queue *q)
-{
- return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
-}
-
static int __make_request(struct request_queue *q, struct bio *bio)
{
- struct request *req;
- int el_ret;
- unsigned int bytes = bio->bi_size;
- const unsigned short prio = bio_prio(bio);
const bool sync = !!(bio->bi_rw & REQ_SYNC);
- const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
- const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
- int where = ELEVATOR_INSERT_SORT;
- int rw_flags;
+ struct blk_plug *plug;
+ int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
+ struct request *req;
/*
* low level driver can indicate that it wants pages above a
@@ -1216,78 +1247,36 @@ static int __make_request(struct request_queue *q, struct bio *bio)
*/
blk_queue_bounce(q, &bio);
- spin_lock_irq(q->queue_lock);
-
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
- where = ELEVATOR_INSERT_FRONT;
+ spin_lock_irq(q->queue_lock);
+ where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
- if (elv_queue_empty(q))
- goto get_rq;
-
- el_ret = elv_merge(q, &req, bio);
- switch (el_ret) {
- case ELEVATOR_BACK_MERGE:
- BUG_ON(!rq_mergeable(req));
-
- if (!ll_back_merge_fn(q, req, bio))
- break;
-
- trace_block_bio_backmerge(q, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bytes;
- req->ioprio = ioprio_best(req->ioprio, prio);
- if (!blk_rq_cpu_valid(req))
- req->cpu = bio->bi_comp_cpu;
- drive_stat_acct(req, 0);
- elv_bio_merged(q, req, bio);
- if (!attempt_back_merge(q, req))
- elv_merged_request(q, req, el_ret);
+ /*
+ * Check if we can merge with the plugged list before grabbing
+ * any locks.
+ */
+ if (attempt_plug_merge(current, q, bio))
goto out;
- case ELEVATOR_FRONT_MERGE:
- BUG_ON(!rq_mergeable(req));
-
- if (!ll_front_merge_fn(q, req, bio))
- break;
-
- trace_block_bio_frontmerge(q, bio);
+ spin_lock_irq(q->queue_lock);
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
- blk_rq_set_mixed_merge(req);
- req->cmd_flags &= ~REQ_FAILFAST_MASK;
- req->cmd_flags |= ff;
+ el_ret = elv_merge(q, &req, bio);
+ if (el_ret == ELEVATOR_BACK_MERGE) {
+ BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+ if (bio_attempt_back_merge(q, req, bio)) {
+ if (!attempt_back_merge(q, req))
+ elv_merged_request(q, req, el_ret);
+ goto out_unlock;
+ }
+ } else if (el_ret == ELEVATOR_FRONT_MERGE) {
+ BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+ if (bio_attempt_front_merge(q, req, bio)) {
+ if (!attempt_front_merge(q, req))
+ elv_merged_request(q, req, el_ret);
+ goto out_unlock;
}
-
- bio->bi_next = req->bio;
- req->bio = bio;
-
- /*
- * may not be valid. if the low level driver said
- * it didn't need a bounce buffer then it better
- * not touch req->buffer either...
- */
- req->buffer = bio_data(bio);
- req->__sector = bio->bi_sector;
- req->__data_len += bytes;
- req->ioprio = ioprio_best(req->ioprio, prio);
- if (!blk_rq_cpu_valid(req))
- req->cpu = bio->bi_comp_cpu;
- drive_stat_acct(req, 0);
- elv_bio_merged(q, req, bio);
- if (!attempt_front_merge(q, req))
- elv_merged_request(q, req, el_ret);
- goto out;
-
- /* ELV_NO_MERGE: elevator says don't/can't merge. */
- default:
- ;
}
get_rq:
@@ -1314,20 +1303,35 @@ get_rq:
*/
init_request_from_bio(req, bio);
- spin_lock_irq(q->queue_lock);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
- bio_flagged(bio, BIO_CPU_AFFINE))
- req->cpu = blk_cpu_to_group(smp_processor_id());
- if (queue_should_plug(q) && elv_queue_empty(q))
- blk_plug_device(q);
-
- /* insert the request into the elevator */
- drive_stat_acct(req, 1);
- __elv_add_request(q, req, where, 0);
+ bio_flagged(bio, BIO_CPU_AFFINE)) {
+ req->cpu = blk_cpu_to_group(get_cpu());
+ put_cpu();
+ }
+
+ plug = current->plug;
+ if (plug) {
+ if (!plug->should_sort && !list_empty(&plug->list)) {
+ struct request *__rq;
+
+ __rq = list_entry_rq(plug->list.prev);
+ if (__rq->q != q)
+ plug->should_sort = 1;
+ }
+ /*
+ * Debug flag, kill later
+ */
+ req->cmd_flags |= REQ_ON_PLUG;
+ list_add_tail(&req->queuelist, &plug->list);
+ drive_stat_acct(req, 1);
+ } else {
+ spin_lock_irq(q->queue_lock);
+ add_acct_request(q, req, where);
+ __blk_run_queue(q, false);
+out_unlock:
+ spin_unlock_irq(q->queue_lock);
+ }
out:
- if (unplug || !queue_should_plug(q))
- __generic_unplug_device(q);
- spin_unlock_irq(q->queue_lock);
return 0;
}
@@ -1730,9 +1734,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
*/
BUG_ON(blk_queued_rq(rq));
- drive_stat_acct(rq, 1);
- __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
-
+ add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
spin_unlock_irqrestore(q->queue_lock, flags);
return 0;
@@ -1804,7 +1806,7 @@ static void blk_account_io_done(struct request *req)
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
- if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
+ if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
unsigned long duration = jiffies - req->start_time;
const int rw = rq_data_dir(req);
struct hd_struct *part;
@@ -2044,9 +2046,26 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
if (error && req->cmd_type == REQ_TYPE_FS &&
!(req->cmd_flags & REQ_QUIET)) {
- printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
- req->rq_disk ? req->rq_disk->disk_name : "?",
- (unsigned long long)blk_rq_pos(req));
+ char *error_type;
+
+ switch (error) {
+ case -ENOLINK:
+ error_type = "recoverable transport";
+ break;
+ case -EREMOTEIO:
+ error_type = "critical target";
+ break;
+ case -EBADE:
+ error_type = "critical nexus";
+ break;
+ case -EIO:
+ default:
+ error_type = "I/O";
+ break;
+ }
+ printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
+ error_type, req->rq_disk ? req->rq_disk->disk_name : "?",
+ (unsigned long long)blk_rq_pos(req));
}
blk_account_io_completion(req, nr_bytes);
@@ -2617,6 +2636,106 @@ int kblockd_schedule_delayed_work(struct request_queue *q,
}
EXPORT_SYMBOL(kblockd_schedule_delayed_work);
+#define PLUG_MAGIC 0x91827364
+
+void blk_start_plug(struct blk_plug *plug)
+{
+ struct task_struct *tsk = current;
+
+ plug->magic = PLUG_MAGIC;
+ INIT_LIST_HEAD(&plug->list);
+ plug->should_sort = 0;
+
+ /*
+ * If this is a nested plug, don't actually assign it. It will be
+ * flushed on its own.
+ */
+ if (!tsk->plug) {
+ /*
+ * Store ordering should not be needed here, since a potential
+ * preempt will imply a full memory barrier
+ */
+ tsk->plug = plug;
+ }
+}
+EXPORT_SYMBOL(blk_start_plug);
+
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct request *rqa = container_of(a, struct request, queuelist);
+ struct request *rqb = container_of(b, struct request, queuelist);
+
+ return !(rqa->q == rqb->q);
+}
+
+static void flush_plug_list(struct blk_plug *plug)
+{
+ struct request_queue *q;
+ unsigned long flags;
+ struct request *rq;
+
+ BUG_ON(plug->magic != PLUG_MAGIC);
+
+ if (list_empty(&plug->list))
+ return;
+
+ if (plug->should_sort)
+ list_sort(NULL, &plug->list, plug_rq_cmp);
+
+ q = NULL;
+ local_irq_save(flags);
+ while (!list_empty(&plug->list)) {
+ rq = list_entry_rq(plug->list.next);
+ list_del_init(&rq->queuelist);
+ BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
+ BUG_ON(!rq->q);
+ if (rq->q != q) {
+ if (q) {
+ __blk_run_queue(q, false);
+ spin_unlock(q->queue_lock);
+ }
+ q = rq->q;
+ spin_lock(q->queue_lock);
+ }
+ rq->cmd_flags &= ~REQ_ON_PLUG;
+
+ /*
+ * rq is already accounted, so use raw insert
+ */
+ __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
+ }
+
+ if (q) {
+ __blk_run_queue(q, false);
+ spin_unlock(q->queue_lock);
+ }
+
+ BUG_ON(!list_empty(&plug->list));
+ local_irq_restore(flags);
+}
+
+static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+ flush_plug_list(plug);
+
+ if (plug == tsk->plug)
+ tsk->plug = NULL;
+}
+
+void blk_finish_plug(struct blk_plug *plug)
+{
+ if (plug)
+ __blk_finish_plug(current, plug);
+}
+EXPORT_SYMBOL(blk_finish_plug);
+
+void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+ __blk_finish_plug(tsk, plug);
+ tsk->plug = plug;
+}
+EXPORT_SYMBOL(__blk_flush_plug);
+
int __init blk_dev_init(void)
{
BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-exec.c b/block/blk-exec.c
index cf1456a02ac..7482b7fa863 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -54,8 +54,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->end_io = done;
WARN_ON(irqs_disabled());
spin_lock_irq(q->queue_lock);
- __elv_add_request(q, rq, where, 1);
- __generic_unplug_device(q);
+ __elv_add_request(q, rq, where);
+ __blk_run_queue(q, false);
/* the queue is stopped so it won't be plugged+unplugged */
if (rq->cmd_type == REQ_TYPE_PM_RESUME)
q->request_fn(q);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 54b123d6563..93d5fd8e51e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,6 +1,69 @@
/*
* Functions to sequence FLUSH and FUA writes.
+ *
+ * Copyright (C) 2011 Max Planck Institute for Gravitational Physics
+ * Copyright (C) 2011 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * REQ_{FLUSH|FUA} requests are decomposed into sequences consisting of three
+ * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
+ * properties and hardware capability.
+ *
+ * If a request doesn't have data, only REQ_FLUSH makes sense, which
+ * indicates a simple flush request. If there is data, REQ_FLUSH indicates
+ * that the device cache should be flushed before the data is executed, and
+ * REQ_FUA means that the data must be on non-volatile media on request
+ * completion.
+ *
+ * If the device doesn't have writeback cache, FLUSH and FUA don't make any
+ * difference. The requests are either completed immediately if there's no
+ * data or executed as normal requests otherwise.
+ *
+ * If the device has writeback cache and supports FUA, REQ_FLUSH is
+ * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
+ *
+ * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
+ * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
+ *
+ * The actual execution of flush is double buffered. Whenever a request
+ * needs to execute PRE or POSTFLUSH, it queues at
+ * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a
+ * flush is issued and the pending_idx is toggled. When the flush
+ * completes, all the requests which were pending are advanced to the next
+ * step. This allows arbitrary merging of different types of FLUSH/FUA
+ * requests.
+ *
+ * Currently, the following conditions are used to determine when to issue
+ * flush.
+ *
+ * C1. At any given time, only one flush shall be in progress. This makes
+ * double buffering sufficient.
+ *
+ * C2. Flush is deferred if any request is executing DATA of its sequence.
+ * This avoids issuing separate POSTFLUSHes for requests which shared
+ * PREFLUSH.
+ *
+ * C3. The second condition is ignored if there is a request which has
+ * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
+ * starvation in the unlikely case where there is a continuous stream of
+ * FUA (without FLUSH) requests.
+ *
+ * For devices which support FUA, it isn't clear whether C2 (and thus C3)
+ * is beneficial.
+ *
+ * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
+ * Once while executing DATA and again after the whole sequence is
+ * complete. The first completion updates the contained bio but doesn't
+ * finish it so that the bio submitter is notified only after the whole
+ * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
+ * req_bio_endio().
+ *
+ * The above peculiarity requires that each FLUSH/FUA request has only one
+ * bio attached to it, which is guaranteed as they aren't allowed to be
+ * merged in the usual way.
*/
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
@@ -11,184 +74,296 @@
/* FLUSH/FUA sequences */
enum {
- QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */
- QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */
- QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */
- QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */
- QUEUE_FSEQ_DONE = (1 << 4),
+ REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
+ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
+ REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
+ REQ_FSEQ_DONE = (1 << 3),
+
+ REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
+ REQ_FSEQ_POSTFLUSH,
+
+ /*
+ * If flush has been pending longer than the following timeout,
+ * it's issued even if flush_data requests are still in flight.
+ */
+ FLUSH_PENDING_TIMEOUT = 5 * HZ,
};
-static struct request *queue_next_fseq(struct request_queue *q);
+static bool blk_kick_flush(struct request_queue *q);
-unsigned blk_flush_cur_seq(struct request_queue *q)
+static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
{
- if (!q->flush_seq)
- return 0;
- return 1 << ffz(q->flush_seq);
+ unsigned int policy = 0;
+
+ if (fflags & REQ_FLUSH) {
+ if (rq->cmd_flags & REQ_FLUSH)
+ policy |= REQ_FSEQ_PREFLUSH;
+ if (blk_rq_sectors(rq))
+ policy |= REQ_FSEQ_DATA;
+ if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
+ policy |= REQ_FSEQ_POSTFLUSH;
+ }
+ return policy;
}
-static struct request *blk_flush_complete_seq(struct request_queue *q,
- unsigned seq, int error)
+static unsigned int blk_flush_cur_seq(struct request *rq)
{
- struct request *next_rq = NULL;
-
- if (error && !q->flush_err)
- q->flush_err = error;
-
- BUG_ON(q->flush_seq & seq);
- q->flush_seq |= seq;
-
- if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
- /* not complete yet, queue the next flush sequence */
- next_rq = queue_next_fseq(q);
- } else {
- /* complete this flush request */
- __blk_end_request_all(q->orig_flush_rq, q->flush_err);
- q->orig_flush_rq = NULL;
- q->flush_seq = 0;
-
- /* dispatch the next flush if there's one */
- if (!list_empty(&q->pending_flushes)) {
- next_rq = list_entry_rq(q->pending_flushes.next);
- list_move(&next_rq->queuelist, &q->queue_head);
- }
+ return 1 << ffz(rq->flush.seq);
+}
+
+static void blk_flush_restore_request(struct request *rq)
+{
+ /*
+ * After flush data completion, @rq->bio is %NULL but we need to
+ * complete the bio again. @rq->biotail is guaranteed to equal the
+ * original @rq->bio. Restore it.
+ */
+ rq->bio = rq->biotail;
+
+ /* make @rq a normal request */
+ rq->cmd_flags &= ~REQ_FLUSH_SEQ;
+ rq->end_io = NULL;
+}
+
+/**
+ * blk_flush_complete_seq - complete flush sequence
+ * @rq: FLUSH/FUA request being sequenced
+ * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
+ * @error: whether an error occurred
+ *
+ * @rq just completed @seq part of its flush sequence, record the
+ * completion and trigger the next step.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if requests were added to the dispatch queue, %false otherwise.
+ */
+static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
+ int error)
+{
+ struct request_queue *q = rq->q;
+ struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+ bool queued = false;
+
+ BUG_ON(rq->flush.seq & seq);
+ rq->flush.seq |= seq;
+
+ if (likely(!error))
+ seq = blk_flush_cur_seq(rq);
+ else
+ seq = REQ_FSEQ_DONE;
+
+ switch (seq) {
+ case REQ_FSEQ_PREFLUSH:
+ case REQ_FSEQ_POSTFLUSH:
+ /* queue for flush */
+ if (list_empty(pending))
+ q->flush_pending_since = jiffies;
+ list_move_tail(&rq->flush.list, pending);
+ break;
+
+ case REQ_FSEQ_DATA:
+ list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
+ list_add(&rq->queuelist, &q->queue_head);
+ queued = true;
+ break;
+
+ case REQ_FSEQ_DONE:
+ /*
+ * @rq was previously adjusted by blk_insert_flush() for
+ * flush sequencing and may already have gone through the
+ * flush data request completion path. Restore @rq for
+ * normal completion and end it.
+ */
+ BUG_ON(!list_empty(&rq->queuelist));
+ list_del_init(&rq->flush.list);
+ blk_flush_restore_request(rq);
+ __blk_end_request_all(rq, error);
+ break;
+
+ default:
+ BUG();
}
- return next_rq;
+
+ return blk_kick_flush(q) | queued;
}
-static void blk_flush_complete_seq_end_io(struct request_queue *q,
- unsigned seq, int error)
+static void flush_end_io(struct request *flush_rq, int error)
{
- bool was_empty = elv_queue_empty(q);
- struct request *next_rq;
+ struct request_queue *q = flush_rq->q;
+ struct list_head *running = &q->flush_queue[q->flush_running_idx];
+ bool queued = false;
+ struct request *rq, *n;
+
+ BUG_ON(q->flush_pending_idx == q->flush_running_idx);
+
+ /* account completion of the flush request */
+ q->flush_running_idx ^= 1;
+ elv_completed_request(q, flush_rq);
- next_rq = blk_flush_complete_seq(q, seq, error);
+ /* and push the waiting requests to the next stage */
+ list_for_each_entry_safe(rq, n, running, flush.list) {
+ unsigned int seq = blk_flush_cur_seq(rq);
+
+ BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
+ queued |= blk_flush_complete_seq(rq, seq, error);
+ }
/*
* Moving a request silently to empty queue_head may stall the
- * queue. Kick the queue in those cases.
+ * queue. Kick the queue in those cases. This function is called
+ * from request completion path and calling directly into
+ * request_fn may confuse the driver. Always use kblockd.
*/
- if (was_empty && next_rq)
- __blk_run_queue(q);
+ if (queued)
+ __blk_run_queue(q, true);
}
-static void pre_flush_end_io(struct request *rq, int error)
+/**
+ * blk_kick_flush - consider issuing flush request
+ * @q: request_queue being kicked
+ *
+ * Flush related states of @q have changed, consider issuing flush request.
+ * Please read the comment at the top of this file for more info.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ *
+ * RETURNS:
+ * %true if flush was issued, %false otherwise.
+ */
+static bool blk_kick_flush(struct request_queue *q)
{
- elv_completed_request(rq->q, rq);
- blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error);
+ struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
+ struct request *first_rq =
+ list_first_entry(pending, struct request, flush.list);
+
+ /* C1 described at the top of this file */
+ if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
+ return false;
+
+ /* C2 and C3 */
+ if (!list_empty(&q->flush_data_in_flight) &&
+ time_before(jiffies,
+ q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
+ return false;
+
+ /*
+ * Issue flush and toggle pending_idx. This makes pending_idx
+ * different from running_idx, which means flush is in flight.
+ */
+ blk_rq_init(q, &q->flush_rq);
+ q->flush_rq.cmd_type = REQ_TYPE_FS;
+ q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+ q->flush_rq.rq_disk = first_rq->rq_disk;
+ q->flush_rq.end_io = flush_end_io;
+
+ q->flush_pending_idx ^= 1;
+ elv_insert(q, &q->flush_rq, ELEVATOR_INSERT_REQUEUE);
+ return true;
}
static void flush_data_end_io(struct request *rq, int error)
{
- elv_completed_request(rq->q, rq);
- blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
-}
+ struct request_queue *q = rq->q;
-static void post_flush_end_io(struct request *rq, int error)
-{
- elv_completed_request(rq->q, rq);
- blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
+ /*
+ * After populating an empty queue, kick it to avoid stall. Read
+ * the comment in flush_end_io().
+ */
+ if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
+ __blk_run_queue(q, true);
}
-static void init_flush_request(struct request *rq, struct gendisk *disk)
+/**
+ * blk_insert_flush - insert a new FLUSH/FUA request
+ * @rq: request to insert
+ *
+ * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions.
+ * @rq is being submitted. Analyze what needs to be done and put it on the
+ * right queue.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_insert_flush(struct request *rq)
{
- rq->cmd_type = REQ_TYPE_FS;
- rq->cmd_flags = WRITE_FLUSH;
- rq->rq_disk = disk;
-}
+ struct request_queue *q = rq->q;
+ unsigned int fflags = q->flush_flags; /* may change, cache */
+ unsigned int policy = blk_flush_policy(fflags, rq);
-static struct request *queue_next_fseq(struct request_queue *q)
-{
- struct request *orig_rq = q->orig_flush_rq;
- struct request *rq = &q->flush_rq;
+ BUG_ON(rq->end_io);
+ BUG_ON(!rq->bio || rq->bio != rq->biotail);
- blk_rq_init(q, rq);
+ /*
+ * @policy now records what operations need to be done. Adjust
+ * REQ_FLUSH and FUA for the driver.
+ */
+ rq->cmd_flags &= ~REQ_FLUSH;
+ if (!(fflags & REQ_FUA))
+ rq->cmd_flags &= ~REQ_FUA;
- switch (blk_flush_cur_seq(q)) {
- case QUEUE_FSEQ_PREFLUSH:
- init_flush_request(rq, orig_rq->rq_disk);
- rq->end_io = pre_flush_end_io;
- break;
- case QUEUE_FSEQ_DATA:
- init_request_from_bio(rq, orig_rq->bio);
- /*
- * orig_rq->rq_disk may be different from
- * bio->bi_bdev->bd_disk if orig_rq got here through
- * remapping drivers. Make sure rq->rq_disk points
- * to the same one as orig_rq.
- */
- rq->rq_disk = orig_rq->rq_disk;
- rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
- rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
- rq->end_io = flush_data_end_io;
- break;
- case QUEUE_FSEQ_POSTFLUSH:
- init_flush_request(rq, orig_rq->rq_disk);
- rq->end_io = post_flush_end_io;
- break;
- default:
- BUG();
+ /*
+ * If there's data but flush is not necessary, the request can be
+ * processed directly without going through flush machinery. Queue
+ * for normal execution.
+ */
+ if ((policy & REQ_FSEQ_DATA) &&
+ !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+ list_add(&rq->queuelist, &q->queue_head);
+ return;
}
- elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
- return rq;
+ /*
+ * @rq should go through flush machinery. Mark it part of flush
+ * sequence and submit for further processing.
+ */
+ memset(&rq->flush, 0, sizeof(rq->flush));
+ INIT_LIST_HEAD(&rq->flush.list);
+ rq->cmd_flags |= REQ_FLUSH_SEQ;
+ rq->end_io = flush_data_end_io;
+
+ blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
}
-struct request *blk_do_flush(struct request_queue *q, struct request *rq)
+/**
+ * blk_abort_flushes - @q is being aborted, abort flush requests
+ * @q: request_queue being aborted
+ *
+ * To be called from elv_abort_queue(). @q is being aborted. Prepare all
+ * FLUSH/FUA requests for abortion.
+ *
+ * CONTEXT:
+ * spin_lock_irq(q->queue_lock)
+ */
+void blk_abort_flushes(struct request_queue *q)
{
- unsigned int fflags = q->flush_flags; /* may change, cache it */
- bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
- bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
- bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
- unsigned skip = 0;
+ struct request *rq, *n;
+ int i;
/*
- * Special case. If there's data but flush is not necessary,
- * the request can be issued directly.
- *
- * Flush w/o data should be able to be issued directly too but
- * currently some drivers assume that rq->bio contains
- * non-zero data if it isn't NULL and empty FLUSH requests
- * getting here usually have bio's without data.
+ * Requests in flight for data are already owned by the dispatch
+ * queue or the device driver. Just restore for normal completion.
*/
- if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
- rq->cmd_flags &= ~REQ_FLUSH;
- if (!has_fua)
- rq->cmd_flags &= ~REQ_FUA;
- return rq;
+ list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
+ list_del_init(&rq->flush.list);
+ blk_flush_restore_request(rq);
}
/*
- * Sequenced flushes can't be processed in parallel. If
- * another one is already in progress, queue for later
- * processing.
+ * We need to give away requests on flush queues. Restore for
+ * normal completion and put them on the dispatch queue.
*/
- if (q->flush_seq) {
- list_move_tail(&rq->queuelist, &q->pending_flushes);
- return NULL;
+ for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
+ list_for_each_entry_safe(rq, n, &q->flush_queue[i],
+ flush.list) {
+ list_del_init(&rq->flush.list);
+ blk_flush_restore_request(rq);
+ list_add_tail(&rq->queuelist, &q->queue_head);
+ }
}
-
- /*
- * Start a new flush sequence
- */
- q->flush_err = 0;
- q->flush_seq |= QUEUE_FSEQ_STARTED;
-
- /* adjust FLUSH/FUA of the original request and stash it away */
- rq->cmd_flags &= ~REQ_FLUSH;
- if (!has_fua)
- rq->cmd_flags &= ~REQ_FUA;
- blk_dequeue_request(rq);
- q->orig_flush_rq = rq;
-
- /* skip unneded sequences and return the first one */
- if (!do_preflush)
- skip |= QUEUE_FSEQ_PREFLUSH;
- if (!blk_rq_sectors(rq))
- skip |= QUEUE_FSEQ_DATA;
- if (!do_postflush)
- skip |= QUEUE_FSEQ_POSTFLUSH;
- return blk_flush_complete_seq(q, skip, 0);
}
static void bio_end_flush(struct bio *bio, int err)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 1a320d2406b..25de73e4759 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -109,7 +109,6 @@ struct bio_batch
atomic_t done;
unsigned long flags;
struct completion *wait;
- bio_end_io_t *end_io;
};
static void bio_batch_end_io(struct bio *bio, int err)
@@ -122,17 +121,14 @@ static void bio_batch_end_io(struct bio *bio, int err)
else
clear_bit(BIO_UPTODATE, &bb->flags);
}
- if (bb) {
- if (bb->end_io)
- bb->end_io(bio, err);
- atomic_inc(&bb->done);
- complete(bb->wait);
- }
+ if (bb)
+ if (atomic_dec_and_test(&bb->done))
+ complete(bb->wait);
bio_put(bio);
}
/**
- * blkdev_issue_zeroout generate number of zero filed write bios
+ * blkdev_issue_zeroout - generate a number of zero-filled write bios
* @bdev: blockdev to issue
* @sector: start sector
* @nr_sects: number of sectors to write
@@ -140,8 +136,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
*
* Description:
* Generate and issue number of bios with zerofiled pages.
- * Send barrier at the beginning and at the end if requested. This guarantie
- * correct request ordering. Empty barrier allow us to avoid post queue flush.
*/
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
@@ -150,13 +144,12 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
int ret;
struct bio *bio;
struct bio_batch bb;
- unsigned int sz, issued = 0;
+ unsigned int sz;
DECLARE_COMPLETION_ONSTACK(wait);
- atomic_set(&bb.done, 0);
+ atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
- bb.end_io = NULL;
submit:
ret = 0;
@@ -185,12 +178,12 @@ submit:
break;
}
ret = 0;
- issued++;
+ atomic_inc(&bb.done);
submit_bio(WRITE, bio);
}
/* Wait for bios in-flight */
- while (issued != atomic_read(&bb.done))
+ if (!atomic_dec_and_test(&bb.done))
wait_for_completion(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ea85e20d5e9..cfcc37cb222 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)
return 0;
}
+
+int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+ struct request *next)
+{
+ return attempt_merge(q, rq, next);
+}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 36c8c1f2af1..1fa76929359 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -164,25 +164,10 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
blk_queue_congestion_threshold(q);
q->nr_batching = BLK_BATCH_REQ;
- q->unplug_thresh = 4; /* hmm */
- q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
- if (q->unplug_delay == 0)
- q->unplug_delay = 1;
-
- q->unplug_timer.function = blk_unplug_timeout;
- q->unplug_timer.data = (unsigned long)q;
-
blk_set_default_limits(&q->limits);
blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
/*
- * If the caller didn't supply a lock, fall back to our embedded
- * per-queue locks
- */
- if (!q->queue_lock)
- q->queue_lock = &q->__queue_lock;
-
- /*
* by default assume old behaviour and bounce for any highmem page
*/
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 41fb69150b4..261c75c665a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -471,8 +471,6 @@ static void blk_release_queue(struct kobject *kobj)
blk_sync_queue(q);
- blk_throtl_exit(q);
-
if (rl->rq_pool)
mempool_destroy(rl->rq_pool);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 381b09bb562..5352bdafbcf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -20,6 +20,11 @@ static int throtl_quantum = 32;
/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10; /* 100 ms */
+/* A workqueue to queue throttle related work */
+static struct workqueue_struct *kthrotld_workqueue;
+static void throtl_schedule_delayed_work(struct throtl_data *td,
+ unsigned long delay);
+
struct throtl_rb_root {
struct rb_root rb;
struct rb_node *left;
@@ -97,7 +102,7 @@ struct throtl_data
/* Work for dispatching throttled bios */
struct delayed_work throtl_work;
- atomic_t limits_changed;
+ bool limits_changed;
};
enum tg_state_flags {
@@ -168,7 +173,15 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
* tree of blkg (instead of traversing through hash list all
* the time.
*/
- tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+ /*
+ * This is the common case when there are no blkio cgroups.
+ * Avoid lookup in this case
+ */
+ if (blkcg == &blkio_root_cgroup)
+ tg = &td->root_tg;
+ else
+ tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
/* Fill in device details for root group */
if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
@@ -188,6 +201,7 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
RB_CLEAR_NODE(&tg->rb_node);
bio_list_init(&tg->bio_lists[0]);
bio_list_init(&tg->bio_lists[1]);
+ td->limits_changed = false;
/*
* Take the initial reference that will be released on destroy
@@ -337,10 +351,9 @@ static void throtl_schedule_next_dispatch(struct throtl_data *td)
update_min_dispatch_time(st);
if (time_before_eq(st->min_disptime, jiffies))
- throtl_schedule_delayed_work(td->queue, 0);
+ throtl_schedule_delayed_work(td, 0);
else
- throtl_schedule_delayed_work(td->queue,
- (st->min_disptime - jiffies));
+ throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}
static inline void
@@ -725,34 +738,36 @@ static void throtl_process_limit_change(struct throtl_data *td)
struct throtl_grp *tg;
struct hlist_node *pos, *n;
- if (!atomic_read(&td->limits_changed))
+ if (!td->limits_changed)
return;
- throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
+ xchg(&td->limits_changed, false);
- /*
- * Make sure updates from throtl_update_blkio_group_read_bps() group
- * of functions to tg->limits_changed are visible. We do not
- * want update td->limits_changed to be visible but update to
- * tg->limits_changed not being visible yet on this cpu. Hence
- * the read barrier.
- */
- smp_rmb();
+ throtl_log(td, "limits changed");
hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
- if (throtl_tg_on_rr(tg) && tg->limits_changed) {
- throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
- " riops=%u wiops=%u", tg->bps[READ],
- tg->bps[WRITE], tg->iops[READ],
- tg->iops[WRITE]);
+ if (!tg->limits_changed)
+ continue;
+
+ if (!xchg(&tg->limits_changed, false))
+ continue;
+
+ throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
+ " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
+ tg->iops[READ], tg->iops[WRITE]);
+
+ /*
+ * Restart the slices for both READ and WRITE. It
+ * might happen that a group's limits are dropped
+ * suddenly and we don't want to account recently
+ * dispatched IO with new low rate
+ */
+ throtl_start_new_slice(td, tg, 0);
+ throtl_start_new_slice(td, tg, 1);
+
+ if (throtl_tg_on_rr(tg))
tg_update_disptime(td, tg);
- tg->limits_changed = false;
- }
}
-
- smp_mb__before_atomic_dec();
- atomic_dec(&td->limits_changed);
- smp_mb__after_atomic_dec();
}
/* Dispatch throttled bios. Should be called without queue lock held. */
@@ -762,6 +777,7 @@ static int throtl_dispatch(struct request_queue *q)
unsigned int nr_disp = 0;
struct bio_list bio_list_on_stack;
struct bio *bio;
+ struct blk_plug plug;
spin_lock_irq(q->queue_lock);
@@ -790,9 +806,10 @@ out:
* immediate dispatch
*/
if (nr_disp) {
+ blk_start_plug(&plug);
while((bio = bio_list_pop(&bio_list_on_stack)))
generic_make_request(bio);
- blk_unplug(q);
+ blk_finish_plug(&plug);
}
return nr_disp;
}
@@ -807,24 +824,24 @@ void blk_throtl_work(struct work_struct *work)
}
/* Call with queue lock held */
-void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
+static void
+throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
- struct throtl_data *td = q->td;
struct delayed_work *dwork = &td->throtl_work;
- if (total_nr_queued(td) > 0) {
+ /* schedule work if limits changed even if no bio is queued */
+ if (total_nr_queued(td) > 0 || td->limits_changed) {
/*
* We might have a work scheduled to be executed in future.
* Cancel that and schedule a new one.
*/
__cancel_delayed_work(dwork);
- kblockd_schedule_delayed_work(q, dwork, delay);
+ queue_delayed_work(kthrotld_workqueue, dwork, delay);
throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
delay, jiffies);
}
}
-EXPORT_SYMBOL(throtl_schedule_delayed_work);
static void
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
@@ -887,6 +904,15 @@ void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
spin_unlock_irqrestore(td->queue->queue_lock, flags);
}
+static void throtl_update_blkio_group_common(struct throtl_data *td,
+ struct throtl_grp *tg)
+{
+ xchg(&tg->limits_changed, true);
+ xchg(&td->limits_changed, true);
+ /* Schedule a work now to process the limit change */
+ throtl_schedule_delayed_work(td, 0);
+}
+
/*
* For all update functions, key should be a valid pointer because these
* update functions are called under blkcg_lock, that means, blkg is
@@ -900,64 +926,43 @@ static void throtl_update_blkio_group_read_bps(void *key,
struct blkio_group *blkg, u64 read_bps)
{
struct throtl_data *td = key;
+ struct throtl_grp *tg = tg_of_blkg(blkg);
- tg_of_blkg(blkg)->bps[READ] = read_bps;
- /* Make sure read_bps is updated before setting limits_changed */
- smp_wmb();
- tg_of_blkg(blkg)->limits_changed = true;
-
- /* Make sure tg->limits_changed is updated before td->limits_changed */
- smp_mb__before_atomic_inc();
- atomic_inc(&td->limits_changed);
- smp_mb__after_atomic_inc();
-
- /* Schedule a work now to process the limit change */
- throtl_schedule_delayed_work(td->queue, 0);
+ tg->bps[READ] = read_bps;
+ throtl_update_blkio_group_common(td, tg);
}
static void throtl_update_blkio_group_write_bps(void *key,
struct blkio_group *blkg, u64 write_bps)
{
struct throtl_data *td = key;
+ struct throtl_grp *tg = tg_of_blkg(blkg);
- tg_of_blkg(blkg)->bps[WRITE] = write_bps;
- smp_wmb();
- tg_of_blkg(blkg)->limits_changed = true;
- smp_mb__before_atomic_inc();
- atomic_inc(&td->limits_changed);
- smp_mb__after_atomic_inc();
- throtl_schedule_delayed_work(td->queue, 0);
+ tg->bps[WRITE] = write_bps;
+ throtl_update_blkio_group_common(td, tg);
}
static void throtl_update_blkio_group_read_iops(void *key,
struct blkio_group *blkg, unsigned int read_iops)
{
struct throtl_data *td = key;
+ struct throtl_grp *tg = tg_of_blkg(blkg);
- tg_of_blkg(blkg)->iops[READ] = read_iops;
- smp_wmb();
- tg_of_blkg(blkg)->limits_changed = true;
- smp_mb__before_atomic_inc();
- atomic_inc(&td->limits_changed);
- smp_mb__after_atomic_inc();
- throtl_schedule_delayed_work(td->queue, 0);
+ tg->iops[READ] = read_iops;
+ throtl_update_blkio_group_common(td, tg);
}
static void throtl_update_blkio_group_write_iops(void *key,
struct blkio_group *blkg, unsigned int write_iops)
{
struct throtl_data *td = key;
+ struct throtl_grp *tg = tg_of_blkg(blkg);
- tg_of_blkg(blkg)->iops[WRITE] = write_iops;
- smp_wmb();
- tg_of_blkg(blkg)->limits_changed = true;
- smp_mb__before_atomic_inc();
- atomic_inc(&td->limits_changed);
- smp_mb__after_atomic_inc();
- throtl_schedule_delayed_work(td->queue, 0);
+ tg->iops[WRITE] = write_iops;
+ throtl_update_blkio_group_common(td, tg);
}
-void throtl_shutdown_timer_wq(struct request_queue *q)
+static void throtl_shutdown_wq(struct request_queue *q)
{
struct throtl_data *td = q->td;
@@ -998,20 +1003,28 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
/*
* There is already another bio queued in same dir. No
* need to update dispatch time.
- * Still update the disptime if rate limits on this group
- * were changed.
*/
- if (!tg->limits_changed)
- update_disptime = false;
- else
- tg->limits_changed = false;
-
+ update_disptime = false;
goto queue_bio;
+
}
/* Bio is with-in rate limit of group */
if (tg_may_dispatch(td, tg, bio, NULL)) {
throtl_charge_bio(tg, bio);
+
+ /*
+ * We need to trim slice even when bios are not being queued
+ * otherwise it might happen that a bio is not queued for
+ * a long time and slice keeps on extending and trim is not
+ * called for a long time. Now if limits are reduced suddenly
+ * we take into account all the IO dispatched so far at new
+ * low rate and newly queued IO gets a really long dispatch
+ * time.
+ *
+ * So keep on trimming slice even if bio is not queued.
+ */
+ throtl_trim_slice(td, tg, rw);
goto out;
}
@@ -1047,7 +1060,7 @@ int blk_throtl_init(struct request_queue *q)
INIT_HLIST_HEAD(&td->tg_list);
td->tg_service_tree = THROTL_RB_ROOT;
- atomic_set(&td->limits_changed, 0);
+ td->limits_changed = false;
/* Init root group */
tg = &td->root_tg;
@@ -1059,6 +1072,7 @@ int blk_throtl_init(struct request_queue *q)
/* Practically unlimited BW */
tg->bps[0] = tg->bps[1] = -1;
tg->iops[0] = tg->iops[1] = -1;
+ td->limits_changed = false;
/*
* Set root group reference to 2. One reference will be dropped when
@@ -1091,7 +1105,7 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!td);
- throtl_shutdown_timer_wq(q);
+ throtl_shutdown_wq(q);
spin_lock_irq(q->queue_lock);
throtl_release_tgs(td);
@@ -1121,12 +1135,16 @@ void blk_throtl_exit(struct request_queue *q)
* update limits through cgroup and another work got queued, cancel
* it.
*/
- throtl_shutdown_timer_wq(q);
+ throtl_shutdown_wq(q);
throtl_td_free(td);
}
static int __init throtl_init(void)
{
+ kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
+ if (!kthrotld_workqueue)
+ panic("Failed to create kthrotld\n");
+
blkio_policy_register(&blkio_policy_throtl);
return 0;
}
diff --git a/block/blk.h b/block/blk.h
index 2db8f32838e..c8db371a921 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
-void blk_unplug_work(struct work_struct *work);
-void blk_unplug_timeout(unsigned long data);
void blk_rq_timed_out_timer(unsigned long data);
void blk_delete_timer(struct request *);
void blk_add_timer(struct request *);
@@ -51,21 +49,17 @@ static inline void blk_clear_rq_complete(struct request *rq)
*/
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
-struct request *blk_do_flush(struct request_queue *q, struct request *rq);
+void blk_insert_flush(struct request *rq);
+void blk_abort_flushes(struct request_queue *q);
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
while (1) {
- while (!list_empty(&q->queue_head)) {
+ if (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
- if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
- rq == &q->flush_rq)
- return rq;
- rq = blk_do_flush(q, rq);
- if (rq)
- return rq;
+ return rq;
}
if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
@@ -109,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int attempt_back_merge(struct request_queue *q, struct request *rq);
int attempt_front_merge(struct request_queue *q, struct request *rq);
+int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+ struct request *next);
void blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 501ffdf0399..7785169f3c8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
#define RQ_CIC(rq) \
- ((struct cfq_io_context *) (rq)->elevator_private)
-#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
-#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
+ ((struct cfq_io_context *) (rq)->elevator_private[0])
+#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
+#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
@@ -146,7 +146,6 @@ struct cfq_queue {
struct cfq_rb_root *service_tree;
struct cfq_queue *new_cfqq;
struct cfq_group *cfqg;
- struct cfq_group *orig_cfqg;
/* Number of sectors dispatched from queue in single dispatch round */
unsigned long nr_sectors;
};
@@ -179,6 +178,8 @@ struct cfq_group {
/* group service_tree key */
u64 vdisktime;
unsigned int weight;
+ unsigned int new_weight;
+ bool needs_update;
/* number of cfqq currently on this group */
int nr_cfqq;
@@ -238,6 +239,7 @@ struct cfq_data {
struct rb_root prio_trees[CFQ_PRIO_LISTS];
unsigned int busy_queues;
+ unsigned int busy_sync_queues;
int rq_in_driver;
int rq_in_flight[2];
@@ -285,7 +287,6 @@ struct cfq_data {
unsigned int cfq_slice_idle;
unsigned int cfq_group_idle;
unsigned int cfq_latency;
- unsigned int cfq_group_isolation;
unsigned int cic_index;
struct list_head cic_list;
@@ -501,13 +502,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
}
}
-static int cfq_queue_empty(struct request_queue *q)
-{
- struct cfq_data *cfqd = q->elevator->elevator_data;
-
- return !cfqd->rq_queued;
-}
-
/*
* Scale schedule slice based on io priority. Use the sync time slice only
* if a queue is marked sync and has sync io queued. A sync queue with async
@@ -558,15 +552,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
static void update_min_vdisktime(struct cfq_rb_root *st)
{
- u64 vdisktime = st->min_vdisktime;
struct cfq_group *cfqg;
if (st->left) {
cfqg = rb_entry_cfqg(st->left);
- vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+ st->min_vdisktime = max_vdisktime(st->min_vdisktime,
+ cfqg->vdisktime);
}
-
- st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
}
/*
@@ -599,7 +591,7 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
}
static inline unsigned
-cfq_scaled_group_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
if (cfqd->cfq_latency) {
@@ -631,7 +623,7 @@ cfq_scaled_group_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- unsigned slice = cfq_scaled_group_slice(cfqd, cfqq);
+ unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
cfqq->slice_start = jiffies;
cfqq->slice_end = jiffies + slice;
@@ -863,7 +855,27 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
}
static void
-cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+cfq_update_group_weight(struct cfq_group *cfqg)
+{
+ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+ if (cfqg->needs_update) {
+ cfqg->weight = cfqg->new_weight;
+ cfqg->needs_update = false;
+ }
+}
+
+static void
+cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
+
+ cfq_update_group_weight(cfqg);
+ __cfq_group_service_tree_add(st, cfqg);
+ st->total_weight += cfqg->weight;
+}
+
+static void
+cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_group *__cfqg;
@@ -884,13 +896,19 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
} else
cfqg->vdisktime = st->min_vdisktime;
+ cfq_group_service_tree_add(st, cfqg);
+}
- __cfq_group_service_tree_add(st, cfqg);
- st->total_weight += cfqg->weight;
+static void
+cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+ st->total_weight -= cfqg->weight;
+ if (!RB_EMPTY_NODE(&cfqg->rb_node))
+ cfq_rb_erase(&cfqg->rb_node, st);
}
static void
-cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
+cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
@@ -902,14 +920,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
return;
cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
- st->total_weight -= cfqg->weight;
- if (!RB_EMPTY_NODE(&cfqg->rb_node))
- cfq_rb_erase(&cfqg->rb_node, st);
+ cfq_group_service_tree_del(st, cfqg);
cfqg->saved_workload_slice = 0;
cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}
-static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
+static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
+ unsigned int *unaccounted_time)
{
unsigned int slice_used;
@@ -928,8 +945,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
1);
} else {
slice_used = jiffies - cfqq->slice_start;
- if (slice_used > cfqq->allocated_slice)
+ if (slice_used > cfqq->allocated_slice) {
+ *unaccounted_time = slice_used - cfqq->allocated_slice;
slice_used = cfqq->allocated_slice;
+ }
+ if (time_after(cfqq->slice_start, cfqq->dispatch_start))
+ *unaccounted_time += cfqq->slice_start -
+ cfqq->dispatch_start;
}
return slice_used;
@@ -939,12 +961,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
- unsigned int used_sl, charge;
+ unsigned int used_sl, charge, unaccounted_sl = 0;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
BUG_ON(nr_sync < 0);
- used_sl = charge = cfq_cfqq_slice_usage(cfqq);
+ used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
if (iops_mode(cfqd))
charge = cfqq->slice_dispatch;
@@ -952,9 +974,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
charge = cfqq->allocated_slice;
/* Can't update vdisktime while group is on service tree */
- cfq_rb_erase(&cfqg->rb_node, st);
+ cfq_group_service_tree_del(st, cfqg);
cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
- __cfq_group_service_tree_add(st, cfqg);
+ /* If a new weight was requested, update now, off tree */
+ cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */
if (time_after(cfqd->workload_expires, jiffies)) {
@@ -970,7 +993,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
" sect=%u", used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
- cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+ cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
+ unaccounted_sl);
cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
}
@@ -985,7 +1009,9 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
unsigned int weight)
{
- cfqg_of_blkg(blkg)->weight = weight;
+ struct cfq_group *cfqg = cfqg_of_blkg(blkg);
+ cfqg->new_weight = weight;
+ cfqg->needs_update = true;
}
static struct cfq_group *
@@ -1187,32 +1213,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
int new_cfqq = 1;
int group_changed = 0;
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
- if (!cfqd->cfq_group_isolation
- && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
- && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
- /* Move this cfq to root group */
- cfq_log_cfqq(cfqd, cfqq, "moving to root group");
- if (!RB_EMPTY_NODE(&cfqq->rb_node))
- cfq_group_service_tree_del(cfqd, cfqq->cfqg);
- cfqq->orig_cfqg = cfqq->cfqg;
- cfqq->cfqg = &cfqd->root_group;
- cfqd->root_group.ref++;
- group_changed = 1;
- } else if (!cfqd->cfq_group_isolation
- && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
- /* cfqq is sequential now needs to go to its original group */
- BUG_ON(cfqq->cfqg != &cfqd->root_group);
- if (!RB_EMPTY_NODE(&cfqq->rb_node))
- cfq_group_service_tree_del(cfqd, cfqq->cfqg);
- cfq_put_cfqg(cfqq->cfqg);
- cfqq->cfqg = cfqq->orig_cfqg;
- cfqq->orig_cfqg = NULL;
- group_changed = 1;
- cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
- }
-#endif
-
service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
cfqq_type(cfqq));
if (cfq_class_idle(cfqq)) {
@@ -1284,7 +1284,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
service_tree->count++;
if ((add_front || !new_cfqq) && !group_changed)
return;
- cfq_group_service_tree_add(cfqd, cfqq->cfqg);
+ cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
}
static struct cfq_queue *
@@ -1372,6 +1372,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(cfq_cfqq_on_rr(cfqq));
cfq_mark_cfqq_on_rr(cfqq);
cfqd->busy_queues++;
+ if (cfq_cfqq_sync(cfqq))
+ cfqd->busy_sync_queues++;
cfq_resort_rr_list(cfqd, cfqq);
}
@@ -1395,9 +1397,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfqq->p_root = NULL;
}
- cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
+ if (cfq_cfqq_sync(cfqq))
+ cfqd->busy_sync_queues--;
}
/*
@@ -1671,7 +1675,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
*/
if (timed_out) {
if (cfq_cfqq_slice_new(cfqq))
- cfqq->slice_resid = cfq_scaled_group_slice(cfqd, cfqq);
+ cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
else
cfqq->slice_resid = cfqq->slice_end - jiffies;
cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
@@ -2405,6 +2409,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
* Does this cfqq already have too much IO in flight?
*/
if (cfqq->dispatched >= max_dispatch) {
+ bool promote_sync = false;
/*
* idle queue must always only have a single IO in flight
*/
@@ -2412,15 +2417,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return false;
/*
+ * If there is only one sync queue, we can ignore the
+ * async queues here and give the sync queue no dispatch
+ * limit. A sync queue can preempt an async queue, so
+ * limiting the sync queue doesn't make sense. This is
+ * useful for the aiostress test.
+ */
+ if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
+ promote_sync = true;
+
+ /*
* We have other queues, don't allow more IO from this one
*/
- if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
+ if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
+ !promote_sync)
return false;
/*
* Sole queue user, no limit
*/
- if (cfqd->busy_queues == 1)
+ if (cfqd->busy_queues == 1 || promote_sync)
max_dispatch = -1;
else
/*
@@ -2542,7 +2558,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
static void cfq_put_queue(struct cfq_queue *cfqq)
{
struct cfq_data *cfqd = cfqq->cfqd;
- struct cfq_group *cfqg, *orig_cfqg;
+ struct cfq_group *cfqg;
BUG_ON(cfqq->ref <= 0);
@@ -2554,7 +2570,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
BUG_ON(rb_first(&cfqq->sort_list));
BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
cfqg = cfqq->cfqg;
- orig_cfqg = cfqq->orig_cfqg;
if (unlikely(cfqd->active_queue == cfqq)) {
__cfq_slice_expired(cfqd, cfqq, 0);
@@ -2564,8 +2579,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
BUG_ON(cfq_cfqq_on_rr(cfqq));
kmem_cache_free(cfq_pool, cfqq);
cfq_put_cfqg(cfqg);
- if (orig_cfqg)
- cfq_put_cfqg(orig_cfqg);
}
/*
@@ -3355,7 +3368,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqd->busy_queues > 1) {
cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
- __blk_run_queue(cfqd->queue);
+ __blk_run_queue(cfqd->queue, false);
} else {
cfq_blkiocg_update_idle_time_stats(
&cfqq->cfqg->blkg);
@@ -3370,7 +3383,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
* this new queue is RT and the current one is BE
*/
cfq_preempt_queue(cfqd, cfqq);
- __blk_run_queue(cfqd->queue);
+ __blk_run_queue(cfqd->queue, false);
}
}
@@ -3432,6 +3445,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
struct cfq_io_context *cic = cfqd->active_cic;
+ /* If the queue already has requests, don't wait */
+ if (!RB_EMPTY_ROOT(&cfqq->sort_list))
+ return false;
+
/* If there are other queues in the group, don't wait */
if (cfqq->cfqg->nr_cfqq > 1)
return false;
@@ -3609,12 +3626,12 @@ static void cfq_put_request(struct request *rq)
put_io_context(RQ_CIC(rq)->ioc);
- rq->elevator_private = NULL;
- rq->elevator_private2 = NULL;
+ rq->elevator_private[0] = NULL;
+ rq->elevator_private[1] = NULL;
/* Put down rq reference on cfqg */
cfq_put_cfqg(RQ_CFQG(rq));
- rq->elevator_private3 = NULL;
+ rq->elevator_private[2] = NULL;
cfq_put_queue(cfqq);
}
@@ -3701,13 +3718,12 @@ new_queue:
}
cfqq->allocated[rw]++;
- cfqq->ref++;
- rq->elevator_private = cic;
- rq->elevator_private2 = cfqq;
- rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
+ cfqq->ref++;
+ rq->elevator_private[0] = cic;
+ rq->elevator_private[1] = cfqq;
+ rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
spin_unlock_irqrestore(q->queue_lock, flags);
-
return 0;
queue_fail:
@@ -3727,7 +3743,7 @@ static void cfq_kick_queue(struct work_struct *work)
struct request_queue *q = cfqd->queue;
spin_lock_irq(q->queue_lock);
- __blk_run_queue(cfqd->queue);
+ __blk_run_queue(cfqd->queue, false);
spin_unlock_irq(q->queue_lock);
}
@@ -3949,7 +3965,6 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_slice_idle = cfq_slice_idle;
cfqd->cfq_group_idle = cfq_group_idle;
cfqd->cfq_latency = 1;
- cfqd->cfq_group_isolation = 0;
cfqd->hw_tag = -1;
/*
* we optimistically start assuming sync ops weren't delayed in last
@@ -4025,7 +4040,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
-SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4059,7 +4073,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
-STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
#undef STORE_FUNCTION
#define CFQ_ATTR(name) \
@@ -4077,7 +4090,6 @@ static struct elv_fs_entry cfq_attrs[] = {
CFQ_ATTR(slice_idle),
CFQ_ATTR(group_idle),
CFQ_ATTR(low_latency),
- CFQ_ATTR(group_isolation),
__ATTR_NULL
};
@@ -4092,7 +4104,6 @@ static struct elevator_type iosched_cfq = {
.elevator_add_req_fn = cfq_insert_request,
.elevator_activate_req_fn = cfq_activate_request,
.elevator_deactivate_req_fn = cfq_deactivate_request,
- .elevator_queue_empty_fn = cfq_queue_empty,
.elevator_completed_req_fn = cfq_completed_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
diff --git a/block/cfq.h b/block/cfq.h
index 54a6d90f8e8..2a155927e37 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
}
static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
- unsigned long time)
+ unsigned long time, unsigned long unaccounted_time)
{
- blkiocg_update_timeslice_used(blkg, time);
+ blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
}
static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
unsigned long dequeue) {}
static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
- unsigned long time) {}
+ unsigned long time, unsigned long unaccounted_time) {}
static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
bool direction, bool sync) {}
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b547cbca7b2..5139c0ea186 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -326,14 +326,6 @@ dispatch_request:
return 1;
}
-static int deadline_queue_empty(struct request_queue *q)
-{
- struct deadline_data *dd = q->elevator->elevator_data;
-
- return list_empty(&dd->fifo_list[WRITE])
- && list_empty(&dd->fifo_list[READ]);
-}
-
static void deadline_exit_queue(struct elevator_queue *e)
{
struct deadline_data *dd = e->elevator_data;
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = {
.elevator_merge_req_fn = deadline_merged_requests,
.elevator_dispatch_fn = deadline_dispatch_requests,
.elevator_add_req_fn = deadline_add_request,
- .elevator_queue_empty_fn = deadline_queue_empty,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
.elevator_init_fn = deadline_init_queue,
diff --git a/block/elevator.c b/block/elevator.c
index 2569512830d..c387d316873 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_rq_merge_ok);
-static inline int elv_try_merge(struct request *__rq, struct bio *bio)
+int elv_try_merge(struct request *__rq, struct bio *bio)
{
int ret = ELEVATOR_NO_MERGE;
@@ -421,6 +421,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
struct list_head *entry;
int stop_flags;
+ BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
if (q->last_merge == rq)
q->last_merge = NULL;
@@ -519,6 +521,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
return ELEVATOR_NO_MERGE;
}
+/*
+ * Attempt to do an insertion back merge. Only check for the case where
+ * we can append 'rq' to an existing request, so we can throw 'rq' away
+ * afterwards.
+ *
+ * Returns true if we merged, false otherwise
+ */
+static bool elv_attempt_insert_merge(struct request_queue *q,
+ struct request *rq)
+{
+ struct request *__rq;
+
+ if (blk_queue_nomerges(q))
+ return false;
+
+ /*
+ * First try one-hit cache.
+ */
+ if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
+ return true;
+
+ if (blk_queue_noxmerges(q))
+ return false;
+
+ /*
+ * See if our hash lookup can find a potential backmerge.
+ */
+ __rq = elv_rqhash_find(q, blk_rq_pos(rq));
+ if (__rq && blk_attempt_req_merge(q, __rq, rq))
+ return true;
+
+ return false;
+}
+
void elv_merged_request(struct request_queue *q, struct request *rq, int type)
{
struct elevator_queue *e = q->elevator;
@@ -536,14 +572,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
struct elevator_queue *e = q->elevator;
+ const int next_sorted = next->cmd_flags & REQ_SORTED;
- if (e->ops->elevator_merge_req_fn)
+ if (next_sorted && e->ops->elevator_merge_req_fn)
e->ops->elevator_merge_req_fn(q, rq, next);
elv_rqhash_reposition(q, rq);
- elv_rqhash_del(q, next);
- q->nr_sorted--;
+ if (next_sorted) {
+ elv_rqhash_del(q, next);
+ q->nr_sorted--;
+ }
+
q->last_merge = rq;
}
@@ -602,7 +642,7 @@ void elv_quiesce_start(struct request_queue *q)
*/
elv_drain_elevator(q);
while (q->rq.elvpriv) {
- __blk_run_queue(q);
+ __blk_run_queue(q, false);
spin_unlock_irq(q->queue_lock);
msleep(10);
spin_lock_irq(q->queue_lock);
@@ -617,21 +657,12 @@ void elv_quiesce_end(struct request_queue *q)
void elv_insert(struct request_queue *q, struct request *rq, int where)
{
- int unplug_it = 1;
-
trace_block_rq_insert(q, rq);
rq->q = q;
switch (where) {
case ELEVATOR_INSERT_REQUEUE:
- /*
- * Most requeues happen because of a busy condition,
- * don't force unplug of the queue for that case.
- * Clear unplug_it and fall through.
- */
- unplug_it = 0;
-
case ELEVATOR_INSERT_FRONT:
rq->cmd_flags |= REQ_SOFTBARRIER;
list_add(&rq->queuelist, &q->queue_head);
@@ -651,9 +682,17 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
* with anything. There's no point in delaying queue
* processing.
*/
- __blk_run_queue(q);
+ __blk_run_queue(q, false);
break;
+ case ELEVATOR_INSERT_SORT_MERGE:
+ /*
+ * If we succeed in merging this request with one in the
+ * queue already, we are done - rq has now been freed,
+ * so no need to do anything further.
+ */
+ if (elv_attempt_insert_merge(q, rq))
+ break;
case ELEVATOR_INSERT_SORT:
BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
!(rq->cmd_flags & REQ_DISCARD));
@@ -673,24 +712,21 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
q->elevator->ops->elevator_add_req_fn(q, rq);
break;
+ case ELEVATOR_INSERT_FLUSH:
+ rq->cmd_flags |= REQ_SOFTBARRIER;
+ blk_insert_flush(rq);
+ break;
default:
printk(KERN_ERR "%s: bad insertion point %d\n",
__func__, where);
BUG();
}
-
- if (unplug_it && blk_queue_plugged(q)) {
- int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
- - queue_in_flight(q);
-
- if (nrq >= q->unplug_thresh)
- __generic_unplug_device(q);
- }
}
-void __elv_add_request(struct request_queue *q, struct request *rq, int where,
- int plug)
+void __elv_add_request(struct request_queue *q, struct request *rq, int where)
{
+ BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
if (rq->cmd_flags & REQ_SOFTBARRIER) {
/* barriers are scheduling boundary, update end_sector */
if (rq->cmd_type == REQ_TYPE_FS ||
@@ -702,38 +738,20 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
where == ELEVATOR_INSERT_SORT)
where = ELEVATOR_INSERT_BACK;
- if (plug)
- blk_plug_device(q);
-
elv_insert(q, rq, where);
}
EXPORT_SYMBOL(__elv_add_request);
-void elv_add_request(struct request_queue *q, struct request *rq, int where,
- int plug)
+void elv_add_request(struct request_queue *q, struct request *rq, int where)
{
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
- __elv_add_request(q, rq, where, plug);
+ __elv_add_request(q, rq, where);
spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(elv_add_request);
-int elv_queue_empty(struct request_queue *q)
-{
- struct elevator_queue *e = q->elevator;
-
- if (!list_empty(&q->queue_head))
- return 0;
-
- if (e->ops->elevator_queue_empty_fn)
- return e->ops->elevator_queue_empty_fn(q);
-
- return 1;
-}
-EXPORT_SYMBOL(elv_queue_empty);
-
struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
@@ -759,7 +777,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
if (e->ops->elevator_set_req_fn)
return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
- rq->elevator_private = NULL;
+ rq->elevator_private[0] = NULL;
return 0;
}
@@ -785,6 +803,8 @@ void elv_abort_queue(struct request_queue *q)
{
struct request *rq;
+ blk_abort_flushes(q);
+
while (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
rq->cmd_flags |= REQ_QUIET;
diff --git a/block/genhd.c b/block/genhd.c
index 6a5b772aa20..c91a2dac6b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1158,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
"%u %lu %lu %llu %u %u %u %u\n",
MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
disk_name(gp, hd->partno, buf),
- part_stat_read(hd, ios[0]),
- part_stat_read(hd, merges[0]),
- (unsigned long long)part_stat_read(hd, sectors[0]),
- jiffies_to_msecs(part_stat_read(hd, ticks[0])),
- part_stat_read(hd, ios[1]),
- part_stat_read(hd, merges[1]),
- (unsigned long long)part_stat_read(hd, sectors[1]),
- jiffies_to_msecs(part_stat_read(hd, ticks[1])),
+ part_stat_read(hd, ios[READ]),
+ part_stat_read(hd, merges[READ]),
+ (unsigned long long)part_stat_read(hd, sectors[READ]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
+ part_stat_read(hd, ios[WRITE]),
+ part_stat_read(hd, merges[WRITE]),
+ (unsigned long long)part_stat_read(hd, sectors[WRITE]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
part_in_flight(hd),
jiffies_to_msecs(part_stat_read(hd, io_ticks)),
jiffies_to_msecs(part_stat_read(hd, time_in_queue))
@@ -1355,7 +1355,7 @@ int invalidate_partition(struct gendisk *disk, int partno)
struct block_device *bdev = bdget_disk(disk, partno);
if (bdev) {
fsync_bdev(bdev);
- res = __invalidate_device(bdev);
+ res = __invalidate_device(bdev, true);
bdput(bdev);
}
return res;
@@ -1494,7 +1494,7 @@ void disk_block_events(struct gendisk *disk)
void disk_unblock_events(struct gendisk *disk)
{
if (disk->ev)
- __disk_unblock_events(disk, true);
+ __disk_unblock_events(disk, false);
}
/**
diff --git a/block/ioctl.c b/block/ioctl.c
index 9049d460fa8..1124cd29726 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -294,9 +294,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -EINVAL;
if (get_user(n, (int __user *) arg))
return -EFAULT;
- if (!(mode & FMODE_EXCL) &&
- blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
- return -EBUSY;
+ if (!(mode & FMODE_EXCL)) {
+ bdgrab(bdev);
+ if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
+ return -EBUSY;
+ }
ret = set_blocksize(bdev, n);
if (!(mode & FMODE_EXCL))
blkdev_put(bdev, mode | FMODE_EXCL);
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 232c4b38cd3..06389e9ef96 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq)
list_add_tail(&rq->queuelist, &nd->queue);
}
-static int noop_queue_empty(struct request_queue *q)
-{
- struct noop_data *nd = q->elevator->elevator_data;
-
- return list_empty(&nd->queue);
-}
-
static struct request *
noop_former_request(struct request_queue *q, struct request *rq)
{
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = {
.elevator_merge_req_fn = noop_merged_requests,
.elevator_dispatch_fn = noop_dispatch,
.elevator_add_req_fn = noop_add_request,
- .elevator_queue_empty_fn = noop_queue_empty,
.elevator_former_req_fn = noop_former_request,
.elevator_latter_req_fn = noop_latter_request,
.elevator_init_fn = noop_init_queue,