blk-mq: don't allow queue entering for a dying queue
[deliverable/linux.git] / block / blk-mq.c
index 43f0c8ffa92a3fd35966f651dda31fbf3a8cbf32..75fc33f342516c883231ca16b3c2129ff386a624 100644 (file)
@@ -1,3 +1,9 @@
+/*
+ * Block multiqueue core code
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ * Copyright (C) 2013-2014 Christoph Hellwig
+ */
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
@@ -27,28 +33,6 @@ static LIST_HEAD(all_q_list);
 
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
 
-static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
-                                          unsigned int cpu)
-{
-       return per_cpu_ptr(q->queue_ctx, cpu);
-}
-
-/*
- * This assumes per-cpu software queueing queues. They could be per-node
- * as well, for instance. For now this is hardcoded as-is. Note that we don't
- * care about preemption, since we know the ctx's are persistent. This does
- * mean that we can't rely on ctx always matching the currently running CPU.
- */
-static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
-{
-       return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
-       put_cpu();
-}
-
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
@@ -98,8 +82,10 @@ static int blk_mq_queue_enter(struct request_queue *q)
 
        __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
        smp_wmb();
-       /* we have problems to freeze the queue if it's initializing */
-       if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
+
+       /* we have problems freezing the queue if it's initializing */
+       if (!blk_queue_dying(q) &&
+           (!blk_queue_bypass(q) || !blk_queue_init_done(q)))
                return 0;
 
        __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
@@ -193,19 +179,12 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
        rq->q = q;
        rq->mq_ctx = ctx;
        rq->cmd_flags |= rw_flags;
-       rq->cmd_type = 0;
        /* do not touch atomic flags, it needs atomic ops against the timer */
        rq->cpu = -1;
-       rq->__data_len = 0;
-       rq->__sector = (sector_t) -1;
-       rq->bio = NULL;
-       rq->biotail = NULL;
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);
-       memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv)));
        rq->rq_disk = NULL;
        rq->part = NULL;
-       rq->start_time = jiffies;
 #ifdef CONFIG_BLK_CGROUP
        rq->rl = NULL;
        set_start_time_ns(rq);
@@ -215,23 +194,16 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
 #endif
-       rq->ioprio = 0;
        rq->special = NULL;
        /* tag was already set */
        rq->errors = 0;
-       memset(rq->__cmd, 0, sizeof(rq->__cmd));
-       rq->cmd = rq->__cmd;
-       rq->cmd_len = BLK_MAX_CDB;
 
        rq->extra_len = 0;
        rq->sense_len = 0;
        rq->resid_len = 0;
        rq->sense = NULL;
 
-       rq->deadline = 0;
        INIT_LIST_HEAD(&rq->timeout_list);
-       rq->timeout = 0;
-       rq->retries = 0;
        rq->end_io = NULL;
        rq->end_io_data = NULL;
        rq->next_rq = NULL;
@@ -240,70 +212,58 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 }
 
 static struct request *
-__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
-               struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
+__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 {
        struct request *rq;
        unsigned int tag;
 
-       tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
+       tag = blk_mq_get_tag(data);
        if (tag != BLK_MQ_TAG_FAIL) {
-               rq = hctx->tags->rqs[tag];
+               rq = data->hctx->tags->rqs[tag];
 
                rq->cmd_flags = 0;
-               if (blk_mq_tag_busy(hctx)) {
+               if (blk_mq_tag_busy(data->hctx)) {
                        rq->cmd_flags = REQ_MQ_INFLIGHT;
-                       atomic_inc(&hctx->nr_active);
+                       atomic_inc(&data->hctx->nr_active);
                }
 
                rq->tag = tag;
-               blk_mq_rq_ctx_init(q, ctx, rq, rw);
+               blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
                return rq;
        }
 
        return NULL;
 }
 
-static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
-                                                  int rw, gfp_t gfp,
-                                                  bool reserved)
-{
-       bool gfp_mask = gfp & ~__GFP_WAIT;
-       struct request *rq;
-
-       do {
-               struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-               struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
-               rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp_mask,
-                                               reserved);
-               if (rq)
-                       break;
-
-               if (!(gfp & __GFP_WAIT)) {
-                       blk_mq_put_ctx(ctx);
-                       break;
-               }
-
-               __blk_mq_run_hw_queue(hctx);
-               blk_mq_put_ctx(ctx);
-               gfp_mask = gfp;
-       } while (1);
-
-       return rq;
-}
-
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
                bool reserved)
 {
+       struct blk_mq_ctx *ctx;
+       struct blk_mq_hw_ctx *hctx;
        struct request *rq;
+       struct blk_mq_alloc_data alloc_data;
 
        if (blk_mq_queue_enter(q))
                return NULL;
 
-       rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
-       if (rq)
-               blk_mq_put_ctx(rq->mq_ctx);
+       ctx = blk_mq_get_ctx(q);
+       hctx = q->mq_ops->map_queue(q, ctx->cpu);
+       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+                       reserved, ctx, hctx);
+
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
+       if (!rq && (gfp & __GFP_WAIT)) {
+               __blk_mq_run_hw_queue(hctx);
+               blk_mq_put_ctx(ctx);
+
+               ctx = blk_mq_get_ctx(q);
+               hctx = q->mq_ops->map_queue(q, ctx->cpu);
+               blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
+                               hctx);
+               rq =  __blk_mq_alloc_request(&alloc_data, rw);
+               ctx = alloc_data.ctx;
+       }
+       blk_mq_put_ctx(ctx);
        return rq;
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
@@ -383,7 +343,7 @@ static void __blk_mq_complete_request_remote(void *data)
        rq->q->softirq_done_fn(rq);
 }
 
-void __blk_mq_complete_request(struct request *rq)
+static void blk_mq_ipi_complete_request(struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        bool shared = false;
@@ -409,6 +369,16 @@ void __blk_mq_complete_request(struct request *rq)
        put_cpu();
 }
 
+void __blk_mq_complete_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+
+       if (!q->softirq_done_fn)
+               blk_mq_end_io(rq, rq->errors);
+       else
+               blk_mq_ipi_complete_request(rq);
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:                the request being processed
@@ -423,12 +393,8 @@ void blk_mq_complete_request(struct request *rq)
 
        if (unlikely(blk_should_fake_timeout(q)))
                return;
-       if (!blk_mark_rq_complete(rq)) {
-               if (q->softirq_done_fn)
-                       __blk_mq_complete_request(rq);
-               else
-                       blk_mq_end_io(rq, rq->errors);
-       }
+       if (!blk_mark_rq_complete(rq))
+               __blk_mq_complete_request(rq);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -459,8 +425,10 @@ static void blk_mq_start_request(struct request *rq, bool last)
         * complete. So be sure to clear complete again when we start
         * the request, otherwise we'll ignore the completion event.
         */
-       set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-       clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+       if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+               set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+               clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 
        if (q->dma_drain_size && blk_rq_bytes(rq)) {
                /*
@@ -563,9 +531,20 @@ void blk_mq_kick_requeue_list(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
+static inline bool is_flush_request(struct request *rq, unsigned int tag)
+{
+       return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
+                       rq->q->flush_rq->tag == tag);
+}
+
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
-       return tags->rqs[tag];
+       struct request *rq = tags->rqs[tag];
+
+       if (!is_flush_request(rq, tag))
+               return rq;
+
+       return rq->q->flush_rq;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
@@ -1122,7 +1101,11 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
        init_request_from_bio(rq, bio);
-       blk_account_io_start(rq, 1);
+
+       if (blk_do_io_stat(rq)) {
+               rq->start_time = jiffies;
+               blk_account_io_start(rq, 1);
+       }
 }
 
 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
@@ -1164,6 +1147,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
        struct blk_mq_ctx *ctx;
        struct request *rq;
        int rw = bio_data_dir(bio);
+       struct blk_mq_alloc_data alloc_data;
 
        if (unlikely(blk_mq_queue_enter(q))) {
                bio_endio(bio, -EIO);
@@ -1177,7 +1161,9 @@ static struct request *blk_mq_map_request(struct request_queue *q,
                rw |= REQ_SYNC;
 
        trace_block_getrq(q, bio, rw);
-       rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
+       blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
+                       hctx);
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
        if (unlikely(!rq)) {
                __blk_mq_run_hw_queue(hctx);
                blk_mq_put_ctx(ctx);
@@ -1185,8 +1171,11 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 
                ctx = blk_mq_get_ctx(q);
                hctx = q->mq_ops->map_queue(q, ctx->cpu);
-               rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
-                                           __GFP_WAIT|GFP_ATOMIC, false);
+               blk_mq_set_alloc_data(&alloc_data, q,
+                               __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+               rq = __blk_mq_alloc_request(&alloc_data, rw);
+               ctx = alloc_data.ctx;
+               hctx = alloc_data.hctx;
        }
 
        hctx->queued++;
@@ -1229,6 +1218,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
                blk_mq_bio_to_request(rq, bio);
                blk_mq_start_request(rq, true);
+               blk_add_timer(rq);
 
                /*
                 * For OK queue, we are done. For error, kill it. Any other
@@ -1293,6 +1283,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
                return;
 
        rq = blk_mq_map_request(q, bio, &data);
+       if (unlikely(!rq))
+               return;
 
        if (unlikely(is_flush_fua)) {
                blk_mq_bio_to_request(rq, bio);
@@ -1345,21 +1337,6 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
 }
 EXPORT_SYMBOL(blk_mq_map_queue);
 
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
-                                                  unsigned int hctx_index,
-                                                  int node)
-{
-       return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node);
-}
-EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
-
-void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
-                                unsigned int hctx_index)
-{
-       kfree(hctx);
-}
-EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
-
 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
                struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
@@ -1582,6 +1559,8 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
                if (i == nr_queue)
                        break;
 
+               blk_mq_tag_idle(hctx);
+
                if (set->ops->exit_hctx)
                        set->ops->exit_hctx(hctx, i);
 
@@ -1600,7 +1579,7 @@ static void blk_mq_free_hw_queues(struct request_queue *q,
 
        queue_for_each_hw_ctx(q, hctx, i) {
                free_cpumask_var(hctx->cpumask);
-               set->ops->free_hctx(hctx, i);
+               kfree(hctx);
        }
 }
 
@@ -1799,7 +1778,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
        struct blk_mq_hw_ctx **hctxs;
-       struct blk_mq_ctx *ctx;
+       struct blk_mq_ctx __percpu *ctx;
        struct request_queue *q;
        unsigned int *map;
        int i;
@@ -1821,7 +1800,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
        for (i = 0; i < set->nr_hw_queues; i++) {
                int node = blk_mq_hw_queue_to_node(map, i);
 
-               hctxs[i] = set->ops->alloc_hctx(set, i, node);
+               hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+                                       GFP_KERNEL, node);
                if (!hctxs[i])
                        goto err_hctxs;
 
@@ -1853,6 +1833,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
        q->mq_ops = set->ops;
        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
+       if (!(set->flags & BLK_MQ_F_SG_MERGE))
+               q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
+
        q->sg_reserved_size = INT_MAX;
 
        INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
@@ -1908,7 +1891,7 @@ err_hctxs:
                if (!hctxs[i])
                        break;
                free_cpumask_var(hctxs[i]->cpumask);
-               set->ops->free_hctx(hctxs[i], i);
+               kfree(hctxs[i]);
        }
 err_map:
        kfree(hctxs);
@@ -1947,6 +1930,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 {
        blk_mq_freeze_queue(q);
 
+       blk_mq_sysfs_unregister(q);
+
        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
 
        /*
@@ -1957,6 +1942,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 
        blk_mq_map_swqueue(q);
 
+       blk_mq_sysfs_register(q);
+
        blk_mq_unfreeze_queue(q);
 }
 
@@ -1982,22 +1969,31 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
        return NOTIFY_OK;
 }
 
+/*
+ * Alloc a tag set to be associated with one or more request queues.
+ * May fail with EINVAL for various error conditions. May adjust the
+ * requested depth down, if it is too large. In that case, the set
+ * value will be stored in set->queue_depth.
+ */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
        int i;
 
        if (!set->nr_hw_queues)
                return -EINVAL;
-       if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
+       if (!set->queue_depth)
                return -EINVAL;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                return -EINVAL;
 
-       if (!set->nr_hw_queues ||
-           !set->ops->queue_rq || !set->ops->map_queue ||
-           !set->ops->alloc_hctx || !set->ops->free_hctx)
+       if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
                return -EINVAL;
 
+       if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
+               pr_info("blk-mq: reduced tag depth to %u\n",
+                       BLK_MQ_MAX_DEPTH);
+               set->queue_depth = BLK_MQ_MAX_DEPTH;
+       }
 
        set->tags = kmalloc_node(set->nr_hw_queues *
                                 sizeof(struct blk_mq_tags *),
This page took 0.030703 seconds and 5 git commands to generate.