+/*
+ * Block multiqueue core code
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ * Copyright (C) 2013-2014 Christoph Hellwig
+ */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
-static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
- unsigned int cpu)
-{
- return per_cpu_ptr(q->queue_ctx, cpu);
-}
-
-/*
- * This assumes per-cpu software queueing queues. They could be per-node
- * as well, for instance. For now this is hardcoded as-is. Note that we don't
- * care about preemption, since we know the ctx's are persistent. This does
- * mean that we can't rely on ctx always matching the currently running CPU.
- */
-static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
-{
- return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
- put_cpu();
-}
-
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
smp_wmb();
- /* we have problems to freeze the queue if it's initializing */
- if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
+
+ /* we have problems freezing the queue if it's initializing */
+ if (!blk_queue_dying(q) &&
+ (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
return 0;
__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
rq->q = q;
rq->mq_ctx = ctx;
rq->cmd_flags |= rw_flags;
- rq->cmd_type = 0;
/* do not touch atomic flags, it needs atomic ops against the timer */
rq->cpu = -1;
- rq->__data_len = 0;
- rq->__sector = (sector_t) -1;
- rq->bio = NULL;
- rq->biotail = NULL;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
- memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv)));
rq->rq_disk = NULL;
rq->part = NULL;
- rq->start_time = jiffies;
#ifdef CONFIG_BLK_CGROUP
rq->rl = NULL;
set_start_time_ns(rq);
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
- rq->ioprio = 0;
rq->special = NULL;
/* tag was already set */
rq->errors = 0;
- memset(rq->__cmd, 0, sizeof(rq->__cmd));
- rq->cmd = rq->__cmd;
- rq->cmd_len = BLK_MAX_CDB;
rq->extra_len = 0;
rq->sense_len = 0;
rq->resid_len = 0;
rq->sense = NULL;
- rq->deadline = 0;
INIT_LIST_HEAD(&rq->timeout_list);
- rq->timeout = 0;
- rq->retries = 0;
rq->end_io = NULL;
rq->end_io_data = NULL;
rq->next_rq = NULL;
}
static struct request *
-__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
{
struct request *rq;
unsigned int tag;
- tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
+ tag = blk_mq_get_tag(data); /* BLK_MQ_TAG_FAIL here makes the function return NULL */
if (tag != BLK_MQ_TAG_FAIL) {
- rq = hctx->tags->rqs[tag];
+ rq = data->hctx->tags->rqs[tag]; /* request slot stored under this tag; data->hctx is read only after the tag call */
rq->cmd_flags = 0;
- if (blk_mq_tag_busy(hctx)) {
+ if (blk_mq_tag_busy(data->hctx)) { /* when true: flag rq REQ_MQ_INFLIGHT and count it in nr_active */
rq->cmd_flags = REQ_MQ_INFLIGHT;
- atomic_inc(&hctx->nr_active);
+ atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
- blk_mq_rq_ctx_init(q, ctx, rq, rw);
+ blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); /* set rq up for the queue and ctx carried in data */
return rq;
}
return NULL;
}
-static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
- int rw, gfp_t gfp,
- bool reserved)
-{
- bool gfp_mask = gfp & ~__GFP_WAIT;
- struct request *rq;
-
- do {
- struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
- rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp_mask,
- reserved);
- if (rq)
- break;
-
- if (!(gfp & __GFP_WAIT)) {
- blk_mq_put_ctx(ctx);
- break;
- }
-
- __blk_mq_run_hw_queue(hctx);
- blk_mq_put_ctx(ctx);
- gfp_mask = gfp;
- } while (1);
-
- return rq;
-}
-
struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
bool reserved)
{
+ struct blk_mq_ctx *ctx;
+ struct blk_mq_hw_ctx *hctx;
struct request *rq;
+ struct blk_mq_alloc_data alloc_data;
if (blk_mq_queue_enter(q))
return NULL;
- rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
- if (rq)
- blk_mq_put_ctx(rq->mq_ctx);
+ ctx = blk_mq_get_ctx(q);
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+ reserved, ctx, hctx);
+
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
+ if (!rq && (gfp & __GFP_WAIT)) {
+ __blk_mq_run_hw_queue(hctx);
+ blk_mq_put_ctx(ctx);
+
+ ctx = blk_mq_get_ctx(q);
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
+ hctx);
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
+ ctx = alloc_data.ctx;
+ }
+ blk_mq_put_ctx(ctx);
return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);
rq->q->softirq_done_fn(rq);
}
-void __blk_mq_complete_request(struct request *rq)
+static void blk_mq_ipi_complete_request(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
bool shared = false;
put_cpu();
}
+/*
+ * Complete @rq. If the queue has a softirq_done_fn, completion is handed
+ * to blk_mq_ipi_complete_request(); otherwise the request is ended right
+ * here with rq->errors.
+ */
+void __blk_mq_complete_request(struct request *rq)
+{
+ if (rq->q->softirq_done_fn) {
+ blk_mq_ipi_complete_request(rq);
+ return;
+ }
+
+ blk_mq_end_io(rq, rq->errors);
+}
+
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
if (unlikely(blk_should_fake_timeout(q)))
return;
- if (!blk_mark_rq_complete(rq)) {
- if (q->softirq_done_fn)
- __blk_mq_complete_request(rq);
- else
- blk_mq_end_io(rq, rq->errors);
- }
+ if (!blk_mark_rq_complete(rq))
+ __blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
* complete. So be sure to clear complete again when we start
* the request, otherwise we'll ignore the completion event.
*/
- set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+ if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+ set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+ if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+ clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
if (q->dma_drain_size && blk_rq_bytes(rq)) {
/*
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
+/*
+ * True when @rq has REQ_FLUSH_SEQ set and its queue's flush_rq carries
+ * the tag @tag.
+ */
+static inline bool is_flush_request(struct request *rq, unsigned int tag)
+{
+ /* Only requests in a flush sequence can match; skip the tag compare. */
+ if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
+ return false;
+
+ return rq->q->flush_rq->tag == tag;
+}
+
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
- return tags->rqs[tag];
+ struct request *rq = tags->rqs[tag];
+
+ if (!is_flush_request(rq, tag))
+ return rq;
+
+ return rq->q->flush_rq;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
init_request_from_bio(rq, bio);
- blk_account_io_start(rq, 1);
+
+ if (blk_do_io_stat(rq)) {
+ rq->start_time = jiffies;
+ blk_account_io_start(rq, 1);
+ }
}
static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx;
struct request *rq;
int rw = bio_data_dir(bio);
+ struct blk_mq_alloc_data alloc_data;
if (unlikely(blk_mq_queue_enter(q))) {
bio_endio(bio, -EIO);
rw |= REQ_SYNC;
trace_block_getrq(q, bio, rw);
- rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
+ blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
+ hctx);
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
if (unlikely(!rq)) {
__blk_mq_run_hw_queue(hctx);
blk_mq_put_ctx(ctx);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
- __GFP_WAIT|GFP_ATOMIC, false);
+ blk_mq_set_alloc_data(&alloc_data, q,
+ __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
+ ctx = alloc_data.ctx;
+ hctx = alloc_data.hctx;
}
hctx->queued++;
blk_mq_bio_to_request(rq, bio);
blk_mq_start_request(rq, true);
+ blk_add_timer(rq);
/*
* For OK queue, we are done. For error, kill it. Any other
return;
rq = blk_mq_map_request(q, bio, &data);
+ if (unlikely(!rq))
+ return;
if (unlikely(is_flush_fua)) {
blk_mq_bio_to_request(rq, bio);
}
EXPORT_SYMBOL(blk_mq_map_queue);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
- unsigned int hctx_index,
- int node)
-{
- return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node);
-}
-EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
-
-void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_index)
-{
- kfree(hctx);
-}
-EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
-
static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags, unsigned int hctx_idx)
{
if (i == nr_queue)
break;
+ blk_mq_tag_idle(hctx);
+
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, i);
queue_for_each_hw_ctx(q, hctx, i) {
free_cpumask_var(hctx->cpumask);
- set->ops->free_hctx(hctx, i);
+ kfree(hctx);
}
}
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
struct blk_mq_hw_ctx **hctxs;
- struct blk_mq_ctx *ctx;
+ struct blk_mq_ctx __percpu *ctx;
struct request_queue *q;
unsigned int *map;
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
int node = blk_mq_hw_queue_to_node(map, i);
- hctxs[i] = set->ops->alloc_hctx(set, i, node);
+ hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+ GFP_KERNEL, node);
if (!hctxs[i])
goto err_hctxs;
q->mq_ops = set->ops;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+ if (!(set->flags & BLK_MQ_F_SG_MERGE))
+ q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
+
q->sg_reserved_size = INT_MAX;
INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
if (!hctxs[i])
break;
free_cpumask_var(hctxs[i]->cpumask);
- set->ops->free_hctx(hctxs[i], i);
+ kfree(hctxs[i]);
}
err_map:
kfree(hctxs);
{
blk_mq_freeze_queue(q);
+ blk_mq_sysfs_unregister(q);
+
blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
/*
blk_mq_map_swqueue(q);
+ blk_mq_sysfs_register(q);
+
blk_mq_unfreeze_queue(q);
}
return NOTIFY_OK;
}
+/*
+ * Alloc a tag set to be associated with one or more request queues.
+ * May fail with EINVAL for various error conditions. May adjust the
+ * requested depth down, if it is too large. In that case, the set
+ * value will be stored in set->queue_depth.
+ */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
int i;
if (!set->nr_hw_queues)
return -EINVAL;
- if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
+ if (!set->queue_depth)
return -EINVAL;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
return -EINVAL;
- if (!set->nr_hw_queues ||
- !set->ops->queue_rq || !set->ops->map_queue ||
- !set->ops->alloc_hctx || !set->ops->free_hctx)
+ if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
return -EINVAL;
+ if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
+ pr_info("blk-mq: reduced tag depth to %u\n",
+ BLK_MQ_MAX_DEPTH);
+ set->queue_depth = BLK_MQ_MAX_DEPTH;
+ }
set->tags = kmalloc_node(set->nr_hw_queues *
sizeof(struct blk_mq_tags *),