/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include "dm-bio-record.h"
#include "dm-path-selector.h"
#include "dm-uevent.h"

#include <linux/blkdev.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>

#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)

/* Path properties */
struct pgpath {
	struct list_head list;

	struct priority_group *pg;	/* Owning PG */
	unsigned fail_count;		/* Cumulative failure count */

	struct dm_path path;
	struct delayed_work activate_path;

	bool is_active:1;		/* Path status */
};

#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)

/*
 * Paths are grouped into Priority Groups and numbered from 1 upwards.
 * Each has a path selector which controls which path gets used.
 */
struct priority_group {
	struct list_head list;

	struct multipath *m;		/* Owning multipath instance */
	struct path_selector ps;

	unsigned pg_num;		/* Reference number */
	unsigned nr_pgpaths;		/* Number of paths in PG */
	struct list_head pgpaths;

	bool bypassed:1;		/* Temporarily bypass this PG? */
};

/* Multipath context */
struct multipath {
	struct list_head list;
	struct dm_target *ti;

	const char *hw_handler_name;
	char *hw_handler_params;

	spinlock_t lock;

	unsigned nr_priority_groups;
	struct list_head priority_groups;

	wait_queue_head_t pg_init_wait;	/* Wait for pg_init completion */

	struct pgpath *current_pgpath;
	struct priority_group *current_pg;
	struct priority_group *next_pg;	/* Switch to this PG if set */

	unsigned long flags;		/* Multipath state flags */

	unsigned pg_init_retries;	/* Number of times to retry pg_init */
	unsigned pg_init_delay_msecs;	/* Number of msecs before pg_init retry */

	atomic_t nr_valid_paths;	/* Total number of usable paths */
	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
	atomic_t pg_init_count;		/* Number of times pg_init called */

	unsigned queue_mode;

	/*
	 * We must use a mempool of dm_mpath_io structs so that we
	 * can resubmit bios on error.
	 */
	mempool_t *mpio_pool;

	struct mutex work_mutex;
	struct work_struct trigger_event;

	struct work_struct process_queued_bios;
	struct bio_list queued_bios;
};

/*
 * Context information attached to each io we process.
 */
struct dm_mpath_io {
	struct pgpath *pgpath;
	size_t nr_bytes;
};

typedef int (*action_fn) (struct pgpath *pgpath);

static struct kmem_cache *_mpio_cache;

static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
static void process_queued_bios(struct work_struct *work);

/*-----------------------------------------------
 * Multipath state flags.
 *-----------------------------------------------*/

#define MPATHF_QUEUE_IO 0			/* Must we queue all I/O? */
#define MPATHF_QUEUE_IF_NO_PATH 1		/* Queue I/O if last path fails? */
#define MPATHF_SAVED_QUEUE_IF_NO_PATH 2		/* Saved state during suspension */
#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3	/* If there's already a hw_handler present, don't change it. */
#define MPATHF_PG_INIT_DISABLED 4		/* pg_init is not currently allowed */
#define MPATHF_PG_INIT_REQUIRED 5		/* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6		/* Delay pg_init retry? */

/*-----------------------------------------------
 * Allocation routines
 *-----------------------------------------------*/
static struct pgpath *alloc_pgpath(void)
{
	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);

	if (pgpath) {
		pgpath->is_active = true;
		INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
	}

	return pgpath;
}

static void free_pgpath(struct pgpath *pgpath)
{
	kfree(pgpath);
}

static struct priority_group *alloc_priority_group(void)
{
	struct priority_group *pg;

	pg = kzalloc(sizeof(*pg), GFP_KERNEL);
	if (pg)
		INIT_LIST_HEAD(&pg->pgpaths);

	return pg;
}

static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
	struct pgpath *pgpath, *tmp;

	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
		list_del(&pgpath->list);
		dm_put_device(ti, pgpath->path.dev);
		free_pgpath(pgpath);
	}
}

static void free_priority_group(struct priority_group *pg,
				struct dm_target *ti)
{
	struct path_selector *ps = &pg->ps;

	if (ps->type) {
		ps->type->destroy(ps);
		dm_put_path_selector(ps->type);
	}

	free_pgpaths(&pg->pgpaths, ti);
	kfree(pg);
}

static struct multipath *alloc_multipath(struct dm_target *ti)
{
	struct multipath *m;

	m = kzalloc(sizeof(*m), GFP_KERNEL);
	if (m) {
		INIT_LIST_HEAD(&m->priority_groups);
		spin_lock_init(&m->lock);
		set_bit(MPATHF_QUEUE_IO, &m->flags);
		atomic_set(&m->nr_valid_paths, 0);
		atomic_set(&m->pg_init_in_progress, 0);
		atomic_set(&m->pg_init_count, 0);
		m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
		INIT_WORK(&m->trigger_event, trigger_event);
		init_waitqueue_head(&m->pg_init_wait);
		mutex_init(&m->work_mutex);

		m->queue_mode = DM_TYPE_NONE;

		m->ti = ti;
		ti->private = m;
	}

	return m;
}

static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
{
	if (m->queue_mode == DM_TYPE_NONE) {
		/*
		 * Default to request-based.
		 */
		if (dm_use_blk_mq(dm_table_get_md(ti->table)))
			m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
		else
			m->queue_mode = DM_TYPE_REQUEST_BASED;
	}

	if (m->queue_mode == DM_TYPE_REQUEST_BASED) {
		unsigned min_ios = dm_get_reserved_rq_based_ios();

		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
		if (!m->mpio_pool)
			return -ENOMEM;
	}
	else if (m->queue_mode == DM_TYPE_BIO_BASED) {
		INIT_WORK(&m->process_queued_bios, process_queued_bios);
		/*
		 * bio-based doesn't support any direct scsi_dh management;
		 * it just discovers if a scsi_dh is attached.
		 */
		set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
	}

	dm_table_set_type(ti->table, m->queue_mode);

	return 0;
}

static void free_multipath(struct multipath *m)
{
	struct priority_group *pg, *tmp;

	list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
		list_del(&pg->list);
		free_priority_group(pg, m->ti);
	}

	kfree(m->hw_handler_name);
	kfree(m->hw_handler_params);
	mempool_destroy(m->mpio_pool);
	kfree(m);
}

static struct dm_mpath_io *get_mpio(union map_info *info)
{
	return info->ptr;
}

static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info)
{
	struct dm_mpath_io *mpio;

	if (!m->mpio_pool) {
		/* Use blk-mq pdu memory requested via per_io_data_size */
		mpio = get_mpio(info);
		memset(mpio, 0, sizeof(*mpio));
		return mpio;
	}

	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
	if (!mpio)
		return NULL;

	memset(mpio, 0, sizeof(*mpio));
	info->ptr = mpio;

	return mpio;
}

static void clear_request_fn_mpio(struct multipath *m, union map_info *info)
{
	/* Only needed for non blk-mq (.request_fn) multipath */
	if (m->mpio_pool) {
		struct dm_mpath_io *mpio = info->ptr;

		info->ptr = NULL;
		mempool_free(mpio, m->mpio_pool);
	}
}

static size_t multipath_per_bio_data_size(void)
{
	return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details);
}

static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio)
{
	return dm_per_bio_data(bio, multipath_per_bio_data_size());
}

static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio)
{
	/* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */
	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
	void *bio_details = mpio + 1;

	return bio_details;
}

static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p,
					struct dm_bio_details **bio_details_p)
{
	struct dm_mpath_io *mpio = get_mpio_from_bio(bio);
	struct dm_bio_details *bio_details = get_bio_details_from_bio(bio);

	memset(mpio, 0, sizeof(*mpio));
	memset(bio_details, 0, sizeof(*bio_details));
	dm_bio_record(bio_details, bio);

	if (mpio_p)
		*mpio_p = mpio;
	if (bio_details_p)
		*bio_details_p = bio_details;
}
/*-----------------------------------------------
 * Path selection
 *-----------------------------------------------*/

static int __pg_init_all_paths(struct multipath *m)
{
	struct pgpath *pgpath;
	unsigned long pg_init_delay = 0;

	if (atomic_read(&m->pg_init_in_progress) || test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
		return 0;

	atomic_inc(&m->pg_init_count);
	clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);

	/* Check here to reset pg_init_required */
	if (!m->current_pg)
		return 0;

	if (test_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags))
		pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
						 m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
	list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
		/* Skip failed paths */
		if (!pgpath->is_active)
			continue;
		if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
				       pg_init_delay))
			atomic_inc(&m->pg_init_in_progress);
	}
	return atomic_read(&m->pg_init_in_progress);
}

static int pg_init_all_paths(struct multipath *m)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);
	r = __pg_init_all_paths(m);
	spin_unlock_irqrestore(&m->lock, flags);

	return r;
}

static void __switch_pg(struct multipath *m, struct priority_group *pg)
{
	m->current_pg = pg;

	/* Must we initialise the PG first, and queue I/O till it's ready? */
	if (m->hw_handler_name) {
		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
		set_bit(MPATHF_QUEUE_IO, &m->flags);
	} else {
		clear_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
		clear_bit(MPATHF_QUEUE_IO, &m->flags);
	}

	atomic_set(&m->pg_init_count, 0);
}

static struct pgpath *choose_path_in_pg(struct multipath *m,
					struct priority_group *pg,
					size_t nr_bytes)
{
	unsigned long flags;
	struct dm_path *path;
	struct pgpath *pgpath;

	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
	if (!path)
		return ERR_PTR(-ENXIO);

	pgpath = path_to_pgpath(path);

	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
		/* Only update current_pgpath if pg changed */
		spin_lock_irqsave(&m->lock, flags);
		m->current_pgpath = pgpath;
		__switch_pg(m, pg);
		spin_unlock_irqrestore(&m->lock, flags);
	}

	return pgpath;
}

static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
{
	unsigned long flags;
	struct priority_group *pg;
	struct pgpath *pgpath;
	bool bypassed = true;

	if (!atomic_read(&m->nr_valid_paths)) {
		clear_bit(MPATHF_QUEUE_IO, &m->flags);
		goto failed;
	}

	/* Were we instructed to switch PG? */
	if (lockless_dereference(m->next_pg)) {
		spin_lock_irqsave(&m->lock, flags);
		pg = m->next_pg;
		if (!pg) {
			spin_unlock_irqrestore(&m->lock, flags);
			goto check_current_pg;
		}
		m->next_pg = NULL;
		spin_unlock_irqrestore(&m->lock, flags);
		pgpath = choose_path_in_pg(m, pg, nr_bytes);
		if (!IS_ERR_OR_NULL(pgpath))
			return pgpath;
	}

	/* Don't change PG until it has no remaining paths */
check_current_pg:
	pg = lockless_dereference(m->current_pg);
	if (pg) {
		pgpath = choose_path_in_pg(m, pg, nr_bytes);
		if (!IS_ERR_OR_NULL(pgpath))
			return pgpath;
	}

	/*
	 * Loop through priority groups until we find a valid path.
	 * First time we skip PGs marked 'bypassed'.
	 * Second time we only try the ones we skipped, but set
	 * pg_init_delay_retry so we do not hammer controllers.
	 */
	do {
		list_for_each_entry(pg, &m->priority_groups, list) {
			if (pg->bypassed == bypassed)
				continue;
			pgpath = choose_path_in_pg(m, pg, nr_bytes);
			if (!IS_ERR_OR_NULL(pgpath)) {
				if (!bypassed)
					set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
				return pgpath;
			}
		}
	} while (bypassed--);

failed:
	spin_lock_irqsave(&m->lock, flags);
	m->current_pgpath = NULL;
	m->current_pg = NULL;
	spin_unlock_irqrestore(&m->lock, flags);

	return NULL;
}

/*
 * Check whether bios must be queued in the device-mapper core rather
 * than here in the target.
 *
 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
 * same value then we are not between multipath_presuspend()
 * and multipath_resume() calls and we have no need to check
 * for the DMF_NOFLUSH_SUSPENDING flag.
 */
static bool __must_push_back(struct multipath *m)
{
	return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
		 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) &&
		dm_noflush_suspending(m->ti));
}

static bool must_push_back_rq(struct multipath *m)
{
	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
		__must_push_back(m));
}

static bool must_push_back_bio(struct multipath *m)
{
	return __must_push_back(m);
}
/*
 * Map cloned requests (request-based multipath)
 */
static int __multipath_map(struct dm_target *ti, struct request *clone,
			   union map_info *map_context,
			   struct request *rq, struct request **__clone)
{
	struct multipath *m = ti->private;
	int r = DM_MAPIO_REQUEUE;
	size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq);
	struct pgpath *pgpath;
	struct block_device *bdev;
	struct dm_mpath_io *mpio;

	/* Do we need to select a new pgpath? */
	pgpath = lockless_dereference(m->current_pgpath);
	if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
		pgpath = choose_pgpath(m, nr_bytes);

	if (!pgpath) {
		if (!must_push_back_rq(m))
			r = -EIO;	/* Failed */
		return r;
	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
		   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
		pg_init_all_paths(m);
		return r;
	}

	mpio = set_mpio(m, map_context);
	if (!mpio)
		/* ENOMEM, requeue */
		return r;

	mpio->pgpath = pgpath;
	mpio->nr_bytes = nr_bytes;

	bdev = pgpath->path.dev->bdev;

	if (clone) {
		/*
		 * Old request-based interface: allocated clone is passed in.
		 * Used by: .request_fn stacked on .request_fn path(s).
		 */
		clone->q = bdev_get_queue(bdev);
		clone->rq_disk = bdev->bd_disk;
		clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	} else {
		/*
		 * blk-mq request-based interface; used by both:
		 * .request_fn stacked on blk-mq path(s) and
		 * blk-mq stacked on blk-mq path(s).
		 */
		*__clone = blk_mq_alloc_request(bdev_get_queue(bdev),
						rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
		if (IS_ERR(*__clone)) {
			/* ENOMEM, requeue */
			clear_request_fn_mpio(m, map_context);
			return r;
		}
		(*__clone)->bio = (*__clone)->biotail = NULL;
		(*__clone)->rq_disk = bdev->bd_disk;
		(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
	}

	if (pgpath->pg->ps.type->start_io)
		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
					      &pgpath->path,
					      nr_bytes);
	return DM_MAPIO_REMAPPED;
}

static int multipath_map(struct dm_target *ti, struct request *clone,
			 union map_info *map_context)
{
	return __multipath_map(ti, clone, map_context, NULL, NULL);
}

static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
				   union map_info *map_context,
				   struct request **clone)
{
	return __multipath_map(ti, NULL, map_context, rq, clone);
}

static void multipath_release_clone(struct request *clone)
{
	blk_mq_free_request(clone);
}

/*
 * Map cloned bios (bio-based multipath)
 */
static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio)
{
	size_t nr_bytes = bio->bi_iter.bi_size;
	struct pgpath *pgpath;
	unsigned long flags;
	bool queue_io;

	/* Do we need to select a new pgpath? */
	pgpath = lockless_dereference(m->current_pgpath);
	queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
	if (!pgpath || !queue_io)
		pgpath = choose_pgpath(m, nr_bytes);

	if ((pgpath && queue_io) ||
	    (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
		/* Queue for the daemon to resubmit */
		spin_lock_irqsave(&m->lock, flags);
		bio_list_add(&m->queued_bios, bio);
		spin_unlock_irqrestore(&m->lock, flags);
		/* PG_INIT_REQUIRED cannot be set without QUEUE_IO */
		if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
			pg_init_all_paths(m);
		else if (!queue_io)
			queue_work(kmultipathd, &m->process_queued_bios);
		return DM_MAPIO_SUBMITTED;
	}

	if (!pgpath) {
		if (!must_push_back_bio(m))
			return -EIO;
		return DM_MAPIO_REQUEUE;
	}

	mpio->pgpath = pgpath;
	mpio->nr_bytes = nr_bytes;

	bio->bi_bdev = pgpath->path.dev->bdev;
	bio->bi_rw |= REQ_FAILFAST_TRANSPORT;

	if (pgpath->pg->ps.type->start_io)
		pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
					      &pgpath->path,
					      nr_bytes);
	return DM_MAPIO_REMAPPED;
}

static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = NULL;

	multipath_init_per_bio_data(bio, &mpio, NULL);

	return __multipath_map_bio(m, bio, mpio);
}
static void process_queued_bios_list(struct multipath *m)
{
	if (m->queue_mode == DM_TYPE_BIO_BASED)
		queue_work(kmultipathd, &m->process_queued_bios);
}

static void process_queued_bios(struct work_struct *work)
{
	int r;
	unsigned long flags;
	struct bio *bio;
	struct bio_list bios;
	struct blk_plug plug;
	struct multipath *m =
		container_of(work, struct multipath, process_queued_bios);

	bio_list_init(&bios);

	spin_lock_irqsave(&m->lock, flags);

	if (bio_list_empty(&m->queued_bios)) {
		spin_unlock_irqrestore(&m->lock, flags);
		return;
	}

	bio_list_merge(&bios, &m->queued_bios);
	bio_list_init(&m->queued_bios);

	spin_unlock_irqrestore(&m->lock, flags);

	blk_start_plug(&plug);
	while ((bio = bio_list_pop(&bios))) {
		r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
		if (r < 0 || r == DM_MAPIO_REQUEUE) {
			bio->bi_error = r;
			bio_endio(bio);
		} else if (r == DM_MAPIO_REMAPPED)
			generic_make_request(bio);
	}
	blk_finish_plug(&plug);
}

/*
 * If we run out of usable paths, should we queue I/O or error it?
 */
static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
			    bool save_old_value)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	if (save_old_value) {
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
			set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
		else
			clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
	} else {
		if (queue_if_no_path)
			set_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
		else
			clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
	}
	if (queue_if_no_path)
		set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
	else
		clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);

	spin_unlock_irqrestore(&m->lock, flags);

	if (!queue_if_no_path) {
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_bios_list(m);
	}

	return 0;
}

/*
 * An event is triggered whenever a path is taken out of use.
 * Includes path failure and PG bypass.
 */
static void trigger_event(struct work_struct *work)
{
	struct multipath *m =
		container_of(work, struct multipath, trigger_event);

	dm_table_event(m->ti->table);
}
/*-----------------------------------------------------------------
 * Constructor/argument parsing:
 * <#multipath feature args> [<arg>]*
 * <#hw_handler args> [hw_handler [<arg>]*]
 * <#priority groups>
 * <initial priority group>
 *     [<selector> <#selector args> [<arg>]*
 *      <#paths> <#per-path selector args>
 *         [<path> [<arg>]* ]+ ]+
 *---------------------------------------------------------------*/
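/*
 * Illustrative example only (not taken from this file): following the
 * format above, a table line for a hypothetical device with two paths
 * 8:16 and 8:32 and the round-robin selector could look like
 *
 *   3 queue_if_no_path pg_init_retries 5 0 1 1 round-robin 0 2 1 8:16 100 8:32 100
 *
 * i.e. 3 feature args, no hw_handler args, 1 priority group (initial
 * group 1) containing 2 paths with 1 per-path selector arg each.
 */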
static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
			       struct dm_target *ti)
{
	int r;
	struct path_selector_type *pst;
	unsigned ps_argc;

	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of path selector args"},
	};

	pst = dm_get_path_selector(dm_shift_arg(as));
	if (!pst) {
		ti->error = "unknown path selector type";
		return -EINVAL;
	}

	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
	if (r) {
		dm_put_path_selector(pst);
		return -EINVAL;
	}

	r = pst->create(&pg->ps, ps_argc, as->argv);
	if (r) {
		dm_put_path_selector(pst);
		ti->error = "path selector constructor failed";
		return r;
	}

	pg->ps.type = pst;
	dm_consume_args(as, ps_argc);

	return 0;
}

static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
				 struct dm_target *ti)
{
	int r;
	struct pgpath *p;
	struct multipath *m = ti->private;
	struct request_queue *q = NULL;
	const char *attached_handler_name;

	/* we need at least a path arg */
	if (as->argc < 1) {
		ti->error = "no device given";
		return ERR_PTR(-EINVAL);
	}

	p = alloc_pgpath();
	if (!p)
		return ERR_PTR(-ENOMEM);

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &p->path.dev);
	if (r) {
		ti->error = "error getting device";
		goto bad;
	}

	if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name)
		q = bdev_get_queue(p->path.dev->bdev);

	if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) {
retain:
		attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
		if (attached_handler_name) {
			/*
			 * Reset hw_handler_name to match the attached handler
			 * and clear any hw_handler_params associated with the
			 * previous handler.
			 *
			 * NB. This modifies the table line to show the actual
			 * handler instead of the original table passed in.
			 */
			kfree(m->hw_handler_name);
			m->hw_handler_name = attached_handler_name;

			kfree(m->hw_handler_params);
			m->hw_handler_params = NULL;
		}
	}

	if (m->hw_handler_name) {
		r = scsi_dh_attach(q, m->hw_handler_name);
		if (r == -EBUSY) {
			char b[BDEVNAME_SIZE];

			printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
			       bdevname(p->path.dev->bdev, b));
			goto retain;
		}
		if (r < 0) {
			ti->error = "error attaching hardware handler";
			dm_put_device(ti, p->path.dev);
			goto bad;
		}

		if (m->hw_handler_params) {
			r = scsi_dh_set_params(q, m->hw_handler_params);
			if (r < 0) {
				ti->error = "unable to set hardware "
					    "handler parameters";
				dm_put_device(ti, p->path.dev);
				goto bad;
			}
		}
	}

	r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
	if (r) {
		dm_put_device(ti, p->path.dev);
		goto bad;
	}

	return p;

 bad:
	free_pgpath(p);
	return ERR_PTR(r);
}

static struct priority_group *parse_priority_group(struct dm_arg_set *as,
						   struct multipath *m)
{
	static struct dm_arg _args[] = {
		{1, 1024, "invalid number of paths"},
		{0, 1024, "invalid number of selector args"}
	};

	int r;
	unsigned i, nr_selector_args, nr_args;
	struct priority_group *pg;
	struct dm_target *ti = m->ti;

	if (as->argc < 2) {
		as->argc = 0;
		ti->error = "not enough priority group arguments";
		return ERR_PTR(-EINVAL);
	}

	pg = alloc_priority_group();
	if (!pg) {
		ti->error = "couldn't allocate priority group";
		return ERR_PTR(-ENOMEM);
	}
	pg->m = m;

	r = parse_path_selector(as, pg, ti);
	if (r)
		goto bad;

	/*
	 * read the paths
	 */
	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
	if (r)
		goto bad;

	nr_args = 1 + nr_selector_args;
	for (i = 0; i < pg->nr_pgpaths; i++) {
		struct pgpath *pgpath;
		struct dm_arg_set path_args;

		if (as->argc < nr_args) {
			ti->error = "not enough path parameters";
			r = -EINVAL;
			goto bad;
		}

		path_args.argc = nr_args;
		path_args.argv = as->argv;

		pgpath = parse_path(&path_args, &pg->ps, ti);
		if (IS_ERR(pgpath)) {
			r = PTR_ERR(pgpath);
			goto bad;
		}

		pgpath->pg = pg;
		list_add_tail(&pgpath->list, &pg->pgpaths);
		dm_consume_args(as, nr_args);
	}

	return pg;

 bad:
	free_priority_group(pg, ti);
	return ERR_PTR(r);
}
static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
{
	int ret;
	unsigned hw_argc;
	struct dm_target *ti = m->ti;

	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of hardware handler args"},
	};

	if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
		return -EINVAL;

	if (!hw_argc)
		return 0;

	if (m->queue_mode == DM_TYPE_BIO_BASED) {
		dm_consume_args(as, hw_argc);
		DMERR("bio-based multipath doesn't allow hardware handler args");
		return 0;
	}

	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);

	if (hw_argc > 1) {
		char *p;
		int i, j, len = 4;

		for (i = 0; i <= hw_argc - 2; i++)
			len += strlen(as->argv[i]) + 1;
		p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
		if (!p) {
			ti->error = "memory allocation failed";
			ret = -ENOMEM;
			goto fail;
		}
		j = sprintf(p, "%d", hw_argc - 1);
		for (i = 0, p += j + 1; i <= hw_argc - 2; i++, p += j + 1)
			j = sprintf(p, "%s", as->argv[i]);
	}
	dm_consume_args(as, hw_argc - 1);

	return 0;
fail:
	kfree(m->hw_handler_name);
	m->hw_handler_name = NULL;
	return ret;
}

static int parse_features(struct dm_arg_set *as, struct multipath *m)
{
	int r;
	unsigned argc;
	struct dm_target *ti = m->ti;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 8, "invalid number of feature args"},
		{1, 50, "pg_init_retries must be between 1 and 50"},
		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
	};

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	if (!argc)
		return 0;

	do {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "queue_if_no_path")) {
			r = queue_if_no_path(m, true, false);
			continue;
		}

		if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
			set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
			continue;
		}

		if (!strcasecmp(arg_name, "pg_init_retries") &&
		    (argc >= 1)) {
			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
			argc--;
			continue;
		}

		if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
		    (argc >= 1)) {
			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
			argc--;
			continue;
		}

		if (!strcasecmp(arg_name, "queue_mode") &&
		    (argc >= 1)) {
			const char *queue_mode_name = dm_shift_arg(as);

			if (!strcasecmp(queue_mode_name, "bio"))
				m->queue_mode = DM_TYPE_BIO_BASED;
			else if (!strcasecmp(queue_mode_name, "rq"))
				m->queue_mode = DM_TYPE_REQUEST_BASED;
			else if (!strcasecmp(queue_mode_name, "mq"))
				m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
			else {
				ti->error = "Unknown 'queue_mode' requested";
				r = -EINVAL;
			}
			argc--;
			continue;
		}

		ti->error = "Unrecognised multipath feature request";
		r = -EINVAL;
	} while (argc && !r);

	return r;
}

static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	/* target arguments */
	static struct dm_arg _args[] = {
		{0, 1024, "invalid number of priority groups"},
		{0, 1024, "invalid initial priority group number"},
	};

	int r;
	struct multipath *m;
	struct dm_arg_set as;
	unsigned pg_count = 0;
	unsigned next_pg_num;

	as.argc = argc;
	as.argv = argv;

	m = alloc_multipath(ti);
	if (!m) {
		ti->error = "can't allocate multipath";
		return -EINVAL;
	}

	r = parse_features(&as, m);
	if (r)
		goto bad;

	r = alloc_multipath_stage2(ti, m);
	if (r)
		goto bad;

	r = parse_hw_handler(&as, m);
	if (r)
		goto bad;

	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
	if (r)
		goto bad;

	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
	if (r)
		goto bad;

	if ((!m->nr_priority_groups && next_pg_num) ||
	    (m->nr_priority_groups && !next_pg_num)) {
		ti->error = "invalid initial priority group";
		r = -EINVAL;
		goto bad;
	}

	/* parse the priority groups */
	while (as.argc) {
		struct priority_group *pg;
		unsigned nr_valid_paths = atomic_read(&m->nr_valid_paths);

		pg = parse_priority_group(&as, m);
		if (IS_ERR(pg)) {
			r = PTR_ERR(pg);
			goto bad;
		}

		nr_valid_paths += pg->nr_pgpaths;
		atomic_set(&m->nr_valid_paths, nr_valid_paths);

		list_add_tail(&pg->list, &m->priority_groups);
		pg_count++;
		pg->pg_num = pg_count;
		if (!--next_pg_num)
			m->next_pg = pg;
	}

	if (pg_count != m->nr_priority_groups) {
		ti->error = "priority group count mismatch";
		r = -EINVAL;
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
	if (m->queue_mode == DM_TYPE_BIO_BASED)
		ti->per_io_data_size = multipath_per_bio_data_size();
	else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
		ti->per_io_data_size = sizeof(struct dm_mpath_io);

	return 0;

 bad:
	free_multipath(m);
	return r;
}
static void multipath_wait_for_pg_init_completion(struct multipath *m)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&m->pg_init_wait, &wait);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!atomic_read(&m->pg_init_in_progress))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&m->pg_init_wait, &wait);
}

static void flush_multipath_work(struct multipath *m)
{
	set_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
	smp_mb__after_atomic();

	flush_workqueue(kmpath_handlerd);
	multipath_wait_for_pg_init_completion(m);
	flush_workqueue(kmultipathd);
	flush_work(&m->trigger_event);

	clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
	smp_mb__after_atomic();
}

static void multipath_dtr(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	flush_multipath_work(m);
	free_multipath(m);
}

/*
 * Take a path out of use.
 */
static int fail_path(struct pgpath *pgpath)
{
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;

	spin_lock_irqsave(&m->lock, flags);

	if (!pgpath->is_active)
		goto out;

	DMWARN("Failing path %s.", pgpath->path.dev->name);

	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
	pgpath->is_active = false;
	pgpath->fail_count++;

	atomic_dec(&m->nr_valid_paths);

	if (pgpath == m->current_pgpath)
		m->current_pgpath = NULL;

	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
		       pgpath->path.dev->name, atomic_read(&m->nr_valid_paths));

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);

	return 0;
}

/*
 * Reinstate a previously-failed path
 */
static int reinstate_path(struct pgpath *pgpath)
{
	int r = 0, run_queue = 0;
	unsigned long flags;
	struct multipath *m = pgpath->pg->m;
	unsigned nr_valid_paths;

	spin_lock_irqsave(&m->lock, flags);

	if (pgpath->is_active)
		goto out;

	DMWARN("Reinstating path %s.", pgpath->path.dev->name);

	r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
	if (r)
		goto out;

	pgpath->is_active = true;

	nr_valid_paths = atomic_inc_return(&m->nr_valid_paths);
	if (nr_valid_paths == 1) {
		m->current_pgpath = NULL;
		run_queue = 1;
	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
			atomic_inc(&m->pg_init_in_progress);
	}

	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
		       pgpath->path.dev->name, nr_valid_paths);

	schedule_work(&m->trigger_event);

out:
	spin_unlock_irqrestore(&m->lock, flags);
	if (run_queue) {
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_bios_list(m);
	}

	return r;
}
/*
 * Fail or reinstate all paths that match the provided struct dm_dev.
 */
static int action_dev(struct multipath *m, struct dm_dev *dev,
		      action_fn action)
{
	int r = -EINVAL;
	struct pgpath *pgpath;
	struct priority_group *pg;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(pgpath, &pg->pgpaths, list) {
			if (pgpath->path.dev == dev)
				r = action(pgpath);
		}
	}

	return r;
}

/*
 * Temporarily try to avoid having to use the specified PG
 */
static void bypass_pg(struct multipath *m, struct priority_group *pg,
		      bool bypassed)
{
	unsigned long flags;

	spin_lock_irqsave(&m->lock, flags);

	pg->bypassed = bypassed;
	m->current_pgpath = NULL;
	m->current_pg = NULL;

	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
}

/*
 * Switch to using the specified PG from the next I/O that gets mapped
 */
static int switch_pg_num(struct multipath *m, const char *pgstr)
{
	struct priority_group *pg;
	unsigned pgnum;
	unsigned long flags;
	char dummy;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to switch_pg_num");
		return -EINVAL;
	}

	spin_lock_irqsave(&m->lock, flags);
	list_for_each_entry(pg, &m->priority_groups, list) {
		pg->bypassed = false;
		if (--pgnum)
			continue;

		m->current_pgpath = NULL;
		m->current_pg = NULL;
		m->next_pg = pg;
	}
	spin_unlock_irqrestore(&m->lock, flags);

	schedule_work(&m->trigger_event);
	return 0;
}

/*
 * Set/clear bypassed status of a PG.
 * PGs are numbered upwards from 1 in the order they were declared.
 */
static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
{
	struct priority_group *pg;
	unsigned pgnum;
	char dummy;

	if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
	    (pgnum > m->nr_priority_groups)) {
		DMWARN("invalid PG number supplied to bypass_pg");
		return -EINVAL;
	}

	list_for_each_entry(pg, &m->priority_groups, list) {
		if (!--pgnum)
			break;
	}

	bypass_pg(m, pg, bypassed);
	return 0;
}

/*
 * Should we retry pg_init immediately?
 */
static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
{
	unsigned long flags;
	bool limit_reached = false;

	spin_lock_irqsave(&m->lock, flags);

	if (atomic_read(&m->pg_init_count) <= m->pg_init_retries &&
	    !test_bit(MPATHF_PG_INIT_DISABLED, &m->flags))
		set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags);
	else
		limit_reached = true;

	spin_unlock_irqrestore(&m->lock, flags);

	return limit_reached;
}
static void pg_init_done(void *data, int errors)
{
	struct pgpath *pgpath = data;
	struct priority_group *pg = pgpath->pg;
	struct multipath *m = pg->m;
	unsigned long flags;
	bool delay_retry = false;

	/* device or driver problems */
	switch (errors) {
	case SCSI_DH_OK:
		break;
	case SCSI_DH_NOSYS:
		if (!m->hw_handler_name) {
			errors = 0;
			break;
		}
		DMERR("Could not failover the device: Handler scsi_dh_%s "
		      "Error %d.", m->hw_handler_name, errors);
		/*
		 * Fail path for now, so we do not ping pong
		 */
		fail_path(pgpath);
		break;
	case SCSI_DH_DEV_TEMP_BUSY:
		/*
		 * Probably doing something like FW upgrade on the
		 * controller so try the other pg.
		 */
		bypass_pg(m, pg, true);
		break;
	case SCSI_DH_RETRY:
		/* Wait before retrying. */
		delay_retry = true;
	case SCSI_DH_IMM_RETRY:
	case SCSI_DH_RES_TEMP_UNAVAIL:
		if (pg_init_limit_reached(m, pgpath))
			fail_path(pgpath);
		errors = 0;
		break;
	case SCSI_DH_DEV_OFFLINED:
	default:
		/*
		 * We probably do not want to fail the path for a device
		 * error, but this is what the old dm did. In future
		 * patches we can do more advanced handling.
		 */
		fail_path(pgpath);
	}

	spin_lock_irqsave(&m->lock, flags);
	if (errors) {
		if (pgpath == m->current_pgpath) {
			DMERR("Could not failover device. Error %d.", errors);
			m->current_pgpath = NULL;
			m->current_pg = NULL;
		}
	} else if (!test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
		pg->bypassed = false;

	if (atomic_dec_return(&m->pg_init_in_progress) > 0)
		/* Activations of other paths are still on going */
		goto out;

	if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
		if (delay_retry)
			set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
		else
			clear_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);

		if (__pg_init_all_paths(m))
			goto out;
	}
	clear_bit(MPATHF_QUEUE_IO, &m->flags);

	process_queued_bios_list(m);

	/*
	 * Wake up any thread waiting to suspend.
	 */
	wake_up(&m->pg_init_wait);

out:
	spin_unlock_irqrestore(&m->lock, flags);
}

static void activate_path(struct work_struct *work)
{
	struct pgpath *pgpath =
		container_of(work, struct pgpath, activate_path.work);

	if (pgpath->is_active)
		scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
				 pg_init_done, pgpath);
	else
		pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}

static int noretry_error(int error)
{
	switch (error) {
	case -EBADE:
	case -EOPNOTSUPP:
	case -EREMOTEIO:
	case -EILSEQ:
	case -ENODATA:
		return 1;
	}

	/* Anything else could be a path failure, so should be retried */
	return 0;
}

static int do_end_io(struct multipath *m, struct request *clone,
		     int error, struct dm_mpath_io *mpio)
{
	/*
	 * We don't queue any clone request inside the multipath target
	 * during end I/O handling, since those clone requests don't have
	 * bio clones.  If we queue them inside the multipath target,
	 * we need to make bio clones, that requires memory allocation.
	 * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests
	 *  don't have bio clones.)
	 * Instead of queueing the clone request here, we queue the original
	 * request into dm core, which will remake a clone request and
	 * clone bios for it and resubmit it later.
	 */
	int r = DM_ENDIO_REQUEUE;

	if (!error && !clone->errors)
		return 0;	/* I/O complete */

	if (noretry_error(error))
		return error;

	if (mpio->pgpath)
		fail_path(mpio->pgpath);

	if (!atomic_read(&m->nr_valid_paths)) {
		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
			if (!must_push_back_rq(m))
				r = -EIO;
		} else {
			if (error == -EBADE)
				r = error;
		}
	}

	return r;
}
static int multipath_end_io(struct dm_target *ti, struct request *clone,
			    int error, union map_info *map_context)
{
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = get_mpio(map_context);
	struct pgpath *pgpath;
	struct path_selector *ps;
	int r;

	r = do_end_io(m, clone, error, mpio);
	pgpath = mpio->pgpath;
	if (pgpath) {
		ps = &pgpath->pg->ps;
		if (ps->type->end_io)
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
	}
	clear_request_fn_mpio(m, map_context);

	return r;
}

static int do_end_io_bio(struct multipath *m, struct bio *clone,
			 int error, struct dm_mpath_io *mpio)
{
	unsigned long flags;

	if (!error)
		return 0;	/* I/O complete */

	if (noretry_error(error))
		return error;

	if (mpio->pgpath)
		fail_path(mpio->pgpath);

	if (!atomic_read(&m->nr_valid_paths)) {
		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
			if (!must_push_back_bio(m))
				return -EIO;
			return DM_ENDIO_REQUEUE;
		} else {
			if (error == -EBADE)
				return error;
		}
	}

	/* Queue for the daemon to resubmit */
	dm_bio_restore(get_bio_details_from_bio(clone), clone);

	spin_lock_irqsave(&m->lock, flags);
	bio_list_add(&m->queued_bios, clone);
	spin_unlock_irqrestore(&m->lock, flags);
	if (!test_bit(MPATHF_QUEUE_IO, &m->flags))
		queue_work(kmultipathd, &m->process_queued_bios);

	return DM_ENDIO_INCOMPLETE;
}

static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int error)
{
	struct multipath *m = ti->private;
	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
	struct pgpath *pgpath;
	struct path_selector *ps;
	int r;

	r = do_end_io_bio(m, clone, error, mpio);
	pgpath = mpio->pgpath;
	if (pgpath) {
		ps = &pgpath->pg->ps;
		if (ps->type->end_io)
			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
	}

	return r;
}
/*
 * Suspend can't complete until all the I/O is processed so if
 * the last path fails we must error any remaining I/O.
 * Note that if the freeze_bdev fails while suspending, the
 * queue_if_no_path state is lost - userspace should reset it.
 */
static void multipath_presuspend(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	queue_if_no_path(m, false, true);
}

static void multipath_postsuspend(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	mutex_lock(&m->work_mutex);
	flush_multipath_work(m);
	mutex_unlock(&m->work_mutex);
}

/*
 * Restore the queue_if_no_path setting.
 */
static void multipath_resume(struct dm_target *ti)
{
	struct multipath *m = ti->private;

	if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags))
		set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
	else
		clear_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
	smp_mb__after_atomic();
}
/*
 * Info output has the following format:
 * num_multipath_feature_args [multipath_feature_args]*
 * num_handler_status_args [handler_status_args]*
 * num_groups init_group_number
 *            [A|D|E num_ps_status_args [ps_status_args]*
 *             num_paths num_selector_args
 *             [path_dev A|F fail_count [selector_args]* ]+ ]+
 *
 * Table output has the following format (identical to the constructor string):
 * num_feature_args [features_args]*
 * num_handler_args hw_handler [hw_handler_args]*
 * num_groups init_group_number
 *     [priority selector-name num_ps_args [ps_args]*
 *      num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
 */
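/*
 * Illustrative STATUSTYPE_INFO line only (the exact per-path fields depend
 * on the path selector in use; the selector-arg counts below are assumed):
 * for a map with two healthy paths 8:16 and 8:32 and a selector reporting
 * no status args, `dmsetup status` might show something like
 *
 *   2 0 0 0 1 1 A 0 2 0 8:16 A 0 8:32 A 0
 *
 * i.e. 2 feature args (queue_io 0, pg_init_count 0), no handler status
 * args, 1 group with group 1 active, and state 'A' / fail_count 0 per path.
 */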
static void multipath_status(struct dm_target *ti, status_type_t type,
			     unsigned status_flags, char *result, unsigned maxlen)
{
	int sz = 0;
	unsigned long flags;
	struct multipath *m = ti->private;
	struct priority_group *pg;
	struct pgpath *p;
	unsigned pg_num;
	char state;

	spin_lock_irqsave(&m->lock, flags);

	/* Features */
	if (type == STATUSTYPE_INFO)
		DMEMIT("2 %u %u ", test_bit(MPATHF_QUEUE_IO, &m->flags),
		       atomic_read(&m->pg_init_count));
	else {
		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
			      (m->pg_init_retries > 0) * 2 +
			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
			      (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);

		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
			DMEMIT("queue_if_no_path ");
		if (m->pg_init_retries)
			DMEMIT("pg_init_retries %u ", m->pg_init_retries);
		if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
			DMEMIT("retain_attached_hw_handler ");
		if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
			switch(m->queue_mode) {
			case DM_TYPE_BIO_BASED:
				DMEMIT("queue_mode bio ");
				break;
			case DM_TYPE_MQ_REQUEST_BASED:
				DMEMIT("queue_mode mq ");
				break;
			}
		}
	}

	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
		DMEMIT("0 ");
	else
		DMEMIT("1 %s ", m->hw_handler_name);

	DMEMIT("%u ", m->nr_priority_groups);

	if (m->next_pg)
		pg_num = m->next_pg->pg_num;
	else if (m->current_pg)
		pg_num = m->current_pg->pg_num;
	else
		pg_num = (m->nr_priority_groups ? 1 : 0);

	DMEMIT("%u ", pg_num);

	switch (type) {
	case STATUSTYPE_INFO:
		list_for_each_entry(pg, &m->priority_groups, list) {
			if (pg->bypassed)
				state = 'D';	/* Disabled */
			else if (pg == m->current_pg)
				state = 'A';	/* Currently Active */
			else
				state = 'E';	/* Enabled */

			DMEMIT("%c ", state);

			if (pg->ps.type->status)
				sz += pg->ps.type->status(&pg->ps, NULL, type,
							  result + sz,
							  maxlen - sz);
			else
				DMEMIT("0 ");

			DMEMIT("%u %u ", pg->nr_pgpaths,
			       pg->ps.type->info_args);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT("%s %s %u ", p->path.dev->name,
				       p->is_active ? "A" : "F",
				       p->fail_count);
				if (pg->ps.type->status)
					sz += pg->ps.type->status(&pg->ps,
						      &p->path, type, result + sz,
						      maxlen - sz);
			}
		}
		break;

	case STATUSTYPE_TABLE:
		list_for_each_entry(pg, &m->priority_groups, list) {
			DMEMIT("%s ", pg->ps.type->name);

			if (pg->ps.type->status)
				sz += pg->ps.type->status(&pg->ps, NULL, type,
							  result + sz,
							  maxlen - sz);
			else
				DMEMIT("0 ");

			DMEMIT("%u %u ", pg->nr_pgpaths,
			       pg->ps.type->table_args);

			list_for_each_entry(p, &pg->pgpaths, list) {
				DMEMIT("%s ", p->path.dev->name);
				if (pg->ps.type->status)
					sz += pg->ps.type->status(&pg->ps,
						      &p->path, type, result + sz,
						      maxlen - sz);
			}
		}
		break;
	}

	spin_unlock_irqrestore(&m->lock, flags);
}
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct dm_dev *dev;
	struct multipath *m = ti->private;
	action_fn action;

	mutex_lock(&m->work_mutex);

	if (dm_suspended(ti)) {
		r = -EBUSY;
		goto out;
	}

	if (argc == 1) {
		if (!strcasecmp(argv[0], "queue_if_no_path")) {
			r = queue_if_no_path(m, true, false);
			goto out;
		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
			r = queue_if_no_path(m, false, false);
			goto out;
		}
	}

	if (argc != 2) {
		DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
		goto out;
	}

	if (!strcasecmp(argv[0], "disable_group")) {
		r = bypass_pg_num(m, argv[1], true);
		goto out;
	} else if (!strcasecmp(argv[0], "enable_group")) {
		r = bypass_pg_num(m, argv[1], false);
		goto out;
	} else if (!strcasecmp(argv[0], "switch_group")) {
		r = switch_pg_num(m, argv[1]);
		goto out;
	} else if (!strcasecmp(argv[0], "reinstate_path"))
		action = reinstate_path;
	else if (!strcasecmp(argv[0], "fail_path"))
		action = fail_path;
	else {
		DMWARN("Unrecognised multipath message received: %s", argv[0]);
		goto out;
	}

	r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
	if (r) {
		DMWARN("message: error getting device %s",
		       argv[1]);
		goto out;
	}

	r = action_dev(m, dev, action);

	dm_put_device(ti, dev);

out:
	mutex_unlock(&m->work_mutex);
	return r;
}
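/*
 * For illustration only: the messages parsed above are normally sent via
 * dmsetup, e.g. (device name and path are hypothetical)
 *
 *   dmsetup message mpatha 0 queue_if_no_path
 *   dmsetup message mpatha 0 fail_path 8:16
 *   dmsetup message mpatha 0 switch_group 2
 */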
static int multipath_prepare_ioctl(struct dm_target *ti,
				   struct block_device **bdev, fmode_t *mode)
{
	struct multipath *m = ti->private;
	struct pgpath *current_pgpath;
	int r;

	current_pgpath = lockless_dereference(m->current_pgpath);
	if (!current_pgpath)
		current_pgpath = choose_pgpath(m, 0);

	if (current_pgpath) {
		if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
			*bdev = current_pgpath->path.dev->bdev;
			*mode = current_pgpath->path.dev->mode;
			r = 0;
		} else {
			/* pg_init has not started or completed */
			r = -ENOTCONN;
		}
	} else {
		/* No path is available */
		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
			r = -ENOTCONN;
		else
			r = -EIO;
	}

	if (r == -ENOTCONN) {
		if (!lockless_dereference(m->current_pg)) {
			/* Path status changed, redo selection */
			(void) choose_pgpath(m, 0);
		}
		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
			pg_init_all_paths(m);
		dm_table_run_md_queue_async(m->ti->table);
		process_queued_bios_list(m);
	}

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (!r && ti->len != i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return r;
}

static int multipath_iterate_devices(struct dm_target *ti,
				     iterate_devices_callout_fn fn, void *data)
{
	struct multipath *m = ti->private;
	struct priority_group *pg;
	struct pgpath *p;
	int ret = 0;

	list_for_each_entry(pg, &m->priority_groups, list) {
		list_for_each_entry(p, &pg->pgpaths, list) {
			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
			if (ret)
				goto out;
		}
	}

out:
	return ret;
}

static int pgpath_busy(struct pgpath *pgpath)
{
	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);

	return blk_lld_busy(q);
}

/*
 * We return "busy", only when we can map I/Os but underlying devices
 * are busy (so even if we map I/Os now, the I/Os will wait on
 * the underlying queue).
 * In other words, if we want to kill I/Os or queue them inside us
 * due to map unavailability, we don't return "busy". Otherwise,
 * dm core won't give us the I/Os and we can't do what we want.
 */
static int multipath_busy(struct dm_target *ti)
{
	bool busy = false, has_active = false;
	struct multipath *m = ti->private;
	struct priority_group *pg, *next_pg;
	struct pgpath *pgpath;

	/* pg_init in progress or no paths available */
	if (atomic_read(&m->pg_init_in_progress) ||
	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
		return true;

	/* Guess which priority_group will be used at next mapping time */
	pg = lockless_dereference(m->current_pg);
	next_pg = lockless_dereference(m->next_pg);
	if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
		pg = next_pg;

	if (!pg) {
		/*
		 * We don't know which pg will be used at next mapping time.
		 * We don't call choose_pgpath() here to avoid to trigger
		 * pg_init just by busy checking.
		 * So we don't know whether underlying devices we will be using
		 * at next mapping time are busy or not. Just try mapping.
		 */
		return busy;
	}

	/*
	 * If there is one non-busy active path at least, the path selector
	 * will be able to select it. So we consider such a pg as not busy.
	 */
	busy = true;
	list_for_each_entry(pgpath, &pg->pgpaths, list) {
		if (pgpath->is_active) {
			has_active = true;
			if (!pgpath_busy(pgpath)) {
				busy = false;
				break;
			}
		}
	}

	if (!has_active) {
		/*
		 * No active path in this pg, so this pg won't be used and
		 * the current_pg will be changed at next mapping time.
		 * We need to try mapping to determine it.
		 */
		busy = false;
	}

	return busy;
}
/*-----------------------------------------------------------------
 * Module setup
 *---------------------------------------------------------------*/
static struct target_type multipath_target = {
	.name = "multipath",
	.version = {1, 12, 0},
	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
	.module = THIS_MODULE,
	.ctr = multipath_ctr,
	.dtr = multipath_dtr,
	.map_rq = multipath_map,
	.clone_and_map_rq = multipath_clone_and_map,
	.release_clone_rq = multipath_release_clone,
	.rq_end_io = multipath_end_io,
	.map = multipath_map_bio,
	.end_io = multipath_end_io_bio,
	.presuspend = multipath_presuspend,
	.postsuspend = multipath_postsuspend,
	.resume = multipath_resume,
	.status = multipath_status,
	.message = multipath_message,
	.prepare_ioctl = multipath_prepare_ioctl,
	.iterate_devices = multipath_iterate_devices,
	.busy = multipath_busy,
};

static int __init dm_multipath_init(void)
{
	int r;

	/* allocate a slab for the dm_mpath_ios */
	_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
	if (!_mpio_cache)
		return -ENOMEM;

	r = dm_register_target(&multipath_target);
	if (r < 0) {
		DMERR("request-based register failed %d", r);
		r = -EINVAL;
		goto bad_register_target;
	}

	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
	if (!kmultipathd) {
		DMERR("failed to create workqueue kmpathd");
		r = -ENOMEM;
		goto bad_alloc_kmultipathd;
	}

	/*
	 * A separate workqueue is used to handle the device handlers
	 * to avoid overloading existing workqueue. Overloading the
	 * old workqueue would also create a bottleneck in the
	 * path of the storage hardware device activation.
	 */
	kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
						  WQ_MEM_RECLAIM);
	if (!kmpath_handlerd) {
		DMERR("failed to create workqueue kmpath_handlerd");
		r = -ENOMEM;
		goto bad_alloc_kmpath_handlerd;
	}

	return 0;

bad_alloc_kmpath_handlerd:
	destroy_workqueue(kmultipathd);
bad_alloc_kmultipathd:
	dm_unregister_target(&multipath_target);
bad_register_target:
	kmem_cache_destroy(_mpio_cache);

	return r;
}

static void __exit dm_multipath_exit(void)
{
	destroy_workqueue(kmpath_handlerd);
	destroy_workqueue(kmultipathd);

	dm_unregister_target(&multipath_target);
	kmem_cache_destroy(_mpio_cache);
}

module_init(dm_multipath_init);
module_exit(dm_multipath_exit);

MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");