/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2016 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/slab.h>
#include <linux/module.h>

#include "md.h"
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
#include "bitmap.h"

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX		"raid"
#define MAX_RAID_DEVICES	253 /* md-raid kernel limit */
static bool devices_handle_discard_safely = false;
/*
 * The following flags are used by dm-raid.c to set up the array state.
 * They must be cleared before md_run is called.
 */
#define FirstUse 10	/* rdev flag */
struct raid_dev {
	/*
	 * Two DM devices, one to hold metadata and one to hold the
	 * actual data/parity. The reason for this is to not confuse
	 * ti->len and give more flexibility in altering size and
	 * characteristics.
	 *
	 * While it is possible for this device to be associated
	 * with a different physical device than the data_dev, it
	 * is intended for it to be the same.
	 *    |--------- Physical Device ---------|
	 *    |- meta_dev -|------ data_dev ------|
	 */
	struct dm_dev *meta_dev;
	struct dm_dev *data_dev;
	struct md_rdev rdev;
};
/*
 * Flags for rs->ctr_flags field.
 *
 * 1 = no flag value
 * 2 = flag with value
 */
#define CTR_FLAG_SYNC			0x1   /* 1 */ /* Not with raid0! */
#define CTR_FLAG_NOSYNC			0x2   /* 1 */ /* Not with raid0! */
#define CTR_FLAG_REBUILD		0x4   /* 2 */ /* Not with raid0! */
#define CTR_FLAG_DAEMON_SLEEP		0x8   /* 2 */ /* Not with raid0! */
#define CTR_FLAG_MIN_RECOVERY_RATE	0x10  /* 2 */ /* Not with raid0! */
#define CTR_FLAG_MAX_RECOVERY_RATE	0x20  /* 2 */ /* Not with raid0! */
#define CTR_FLAG_MAX_WRITE_BEHIND	0x40  /* 2 */ /* Only with raid1! */
#define CTR_FLAG_WRITE_MOSTLY		0x80  /* 2 */ /* Only with raid1! */
#define CTR_FLAG_STRIPE_CACHE		0x100 /* 2 */ /* Only with raid4/5/6! */
#define CTR_FLAG_REGION_SIZE		0x200 /* 2 */ /* Not with raid0! */
#define CTR_FLAG_RAID10_COPIES		0x400 /* 2 */ /* Only with raid10 */
#define CTR_FLAG_RAID10_FORMAT		0x800 /* 2 */ /* Only with raid10 */
#define CTR_FLAG_DELTA_DISKS		0x1000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define CTR_FLAG_DATA_OFFSET		0x2000 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define CTR_FLAG_RAID10_USE_NEAR_SETS	0x4000 /* 2 */ /* Only with raid10! */
/*
 * Definitions of various constructor flags to
 * be used in checks of valid / invalid flags
 * per raid level.
 */

/* Define flags covering any sync argument ('sync'/'nosync') */
#define CTR_FLAGS_ANY_SYNC		(CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)

/* Define flags for options without argument (e.g. 'nosync') */
#define CTR_FLAG_OPTIONS_NO_ARGS	(CTR_FLAGS_ANY_SYNC | \
					 CTR_FLAG_RAID10_USE_NEAR_SETS)

/* Define flags for options with one argument (e.g. 'delta_disks +2') */
#define CTR_FLAG_OPTIONS_ONE_ARG (CTR_FLAG_REBUILD | \
				  CTR_FLAG_WRITE_MOSTLY | \
				  CTR_FLAG_DAEMON_SLEEP | \
				  CTR_FLAG_MIN_RECOVERY_RATE | \
				  CTR_FLAG_MAX_RECOVERY_RATE | \
				  CTR_FLAG_MAX_WRITE_BEHIND | \
				  CTR_FLAG_STRIPE_CACHE | \
				  CTR_FLAG_REGION_SIZE | \
				  CTR_FLAG_RAID10_COPIES | \
				  CTR_FLAG_RAID10_FORMAT | \
				  CTR_FLAG_DELTA_DISKS | \
				  CTR_FLAG_DATA_OFFSET)

/* All ctr optional arguments */
#define ALL_CTR_FLAGS		(CTR_FLAG_OPTIONS_NO_ARGS | \
				 CTR_FLAG_OPTIONS_ONE_ARG)
/* Invalid options definitions per raid level... */

/* "raid0" does not accept any options */
#define RAID0_INVALID_FLAGS ALL_CTR_FLAGS

/* "raid1" does not accept stripe cache or any raid10 options */
#define RAID1_INVALID_FLAGS	(CTR_FLAG_STRIPE_CACHE | \
				 CTR_FLAG_RAID10_COPIES | \
				 CTR_FLAG_RAID10_FORMAT | \
				 CTR_FLAG_DELTA_DISKS | \
				 CTR_FLAG_DATA_OFFSET)

/* "raid10" does not accept any raid1 or stripe cache options */
#define RAID10_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_STRIPE_CACHE)

/*
 * "raid4/5/6" do not accept any raid1 or raid10 specific options
 *
 * "raid6" does not accept "nosync", because it is not guaranteed
 * that both parity and q-syndrome are being written properly with
 * any writes
 */
#define RAID45_INVALID_FLAGS	(CTR_FLAG_WRITE_MOSTLY | \
				 CTR_FLAG_MAX_WRITE_BEHIND | \
				 CTR_FLAG_RAID10_FORMAT | \
				 CTR_FLAG_RAID10_COPIES | \
				 CTR_FLAG_RAID10_USE_NEAR_SETS)
#define RAID6_INVALID_FLAGS	(CTR_FLAG_NOSYNC | RAID45_INVALID_FLAGS)
/* ...invalid options definitions per raid level */
/*
 * Flags for rs->runtime_flags field
 * (RT_FLAG prefix meaning "runtime flag")
 *
 * These are all internal and used to define runtime state,
 * e.g. to prevent another resume from preresume processing
 * the raid set all over again.
 */
#define RT_FLAG_RS_PRERESUMED		0x1
#define RT_FLAG_RS_RESUMED		0x2
#define RT_FLAG_RS_BITMAP_LOADED	0x4
#define RT_FLAG_UPDATE_SBS		0x8

/* Array elements of 64 bit needed for rebuild/write_mostly bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
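
/*
 * Worked example (illustrative only, not part of the driver): with
 * MAX_RAID_DEVICES = 253 and 64-bit array elements, the macro above
 * evaluates as
 *
 *	(253 + (8 * 8 - 1)) / 8 / 8 = 316 / 8 / 8 = 4
 *
 * i.e. four uint64_t words = 256 bits, enough for one bit per
 * possible raid device.
 */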
/*
 * raid set level, layout and chunk sectors backup/restore
 */
struct rs_layout {
	int new_level;
	int new_layout;
	int new_chunk_sectors;
};

struct raid_set {
	struct dm_target *ti;

	uint32_t bitmap_loaded;
	uint32_t ctr_flags;
	uint32_t runtime_flags;

	uint64_t rebuild_disks[DISKS_ARRAY_ELEMS];

	int raid_disks;
	int delta_disks;
	int data_offset;
	int raid10_copies;

	struct mddev md;
	struct raid_type *raid_type;
	struct dm_target_callbacks callbacks;

	struct rs_layout rs_layout;

	struct raid_dev dev[0];
};
/* Backup/restore raid set configuration helpers */
static void _rs_config_backup(struct raid_set *rs, struct rs_layout *l)
{
	struct mddev *mddev = &rs->md;

	l->new_level = mddev->new_level;
	l->new_layout = mddev->new_layout;
	l->new_chunk_sectors = mddev->new_chunk_sectors;
}

static void rs_config_backup(struct raid_set *rs)
{
	return _rs_config_backup(rs, &rs->rs_layout);
}

static void _rs_config_restore(struct raid_set *rs, struct rs_layout *l)
{
	struct mddev *mddev = &rs->md;

	mddev->new_level = l->new_level;
	mddev->new_layout = l->new_layout;
	mddev->new_chunk_sectors = l->new_chunk_sectors;
}

static void rs_config_restore(struct raid_set *rs)
{
	return _rs_config_restore(rs, &rs->rs_layout);
}
/* END: backup/restore raid set configuration helpers */
/* raid10 algorithms (i.e. formats) */
#define ALGORITHM_RAID10_DEFAULT	0
#define ALGORITHM_RAID10_NEAR		1
#define ALGORITHM_RAID10_OFFSET		2
#define ALGORITHM_RAID10_FAR		3
/* Supported raid types and properties. */
static struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const unsigned level;		/* RAID level. */
	const unsigned algorithm;	/* RAID algorithm. */
} raid_types[] = {
	{"raid0", "raid0 (striping)", 0, 2, 0, 0 /* NONE */},
	{"raid1", "raid1 (mirroring)", 0, 2, 1, 0 /* NONE */},
	{"raid10_far", "raid10 far (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_FAR},
	{"raid10_offset", "raid10 offset (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_OFFSET},
	{"raid10_near", "raid10 near (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_NEAR},
	{"raid10", "raid10 (striped mirrors)", 0, 2, 10, ALGORITHM_RAID10_DEFAULT},
	{"raid4", "raid4 (dedicated last parity disk)", 1, 2, 4, ALGORITHM_PARITY_N}, /* raid4 layout = raid5_n */
	{"raid5_n", "raid5 (dedicated last parity disk)", 1, 2, 5, ALGORITHM_PARITY_N},
	{"raid5_ls", "raid5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
	{"raid5_rs", "raid5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
	{"raid5_la", "raid5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
	{"raid5_ra", "raid5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
	{"raid6_zr", "raid6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
	{"raid6_nr", "raid6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
	{"raid6_nc", "raid6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE},
	{"raid6_n_6", "raid6 (dedicated parity/Q n/6)", 2, 4, 6, ALGORITHM_PARITY_N_6},
	{"raid6_ls_6", "raid6 (left symmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_LEFT_SYMMETRIC_6},
	{"raid6_rs_6", "raid6 (right symmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_RIGHT_SYMMETRIC_6},
	{"raid6_la_6", "raid6 (left asymmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_LEFT_ASYMMETRIC_6},
	{"raid6_ra_6", "raid6 (right asymmetric dedicated Q 6)", 2, 4, 6, ALGORITHM_RIGHT_ASYMMETRIC_6}
};
/* True, if @v is in inclusive range [@min, @max] */
static bool _in_range(long v, long min, long max)
{
	return v >= min && v <= max;
}
/* ctr flag bit manipulation... */
/* Set single @flag in @flags */
static void _set_flag(uint32_t flag, uint32_t *flags)
{
	WARN_ON_ONCE(hweight32(flag) != 1);
	*flags |= flag;
}

/* Clear single @flag in @flags */
static void _clear_flag(uint32_t flag, uint32_t *flags)
{
	WARN_ON_ONCE(hweight32(flag) != 1);
	*flags &= ~flag;
}

/* Test single @flag in @flags */
static bool _test_flag(uint32_t flag, uint32_t flags)
{
	WARN_ON_ONCE(hweight32(flag) != 1);
	return (flag & flags) ? true : false;
}

/* Test multiple @flags in @all_flags */
static bool _test_flags(uint32_t flags, uint32_t all_flags)
{
	return (flags & all_flags) ? true : false;
}

/* Clear (multiple) @flags in @all_flags */
static void _clear_flags(uint32_t flags, uint32_t *all_flags)
{
	*all_flags &= ~flags;
}

/* Return true if single @flag is set in @*flags, else set it and return false */
static bool _test_and_set_flag(uint32_t flag, uint32_t *flags)
{
	if (_test_flag(flag, *flags))
		return true;

	_set_flag(flag, flags);
	return false;
}

/* Return true if single @flag is set in @*flags and clear it, else return false */
static bool _test_and_clear_flag(uint32_t flag, uint32_t *flags)
{
	if (_test_flag(flag, *flags)) {
		_clear_flag(flag, flags);
		return true;
	}

	return false;
}
/* ...ctr and runtime flag bit manipulation */
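
/*
 * Usage sketch (illustrative only): the ctr argument parser below uses
 * these helpers to reject duplicate table-line options, e.g.
 *
 *	if (_test_and_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags))
 *		return -EINVAL;	(second 'nosync' on the same line)
 *
 * and rs_check_for_invalid_flags() later tests the accumulated
 * rs->ctr_flags word against a per-level invalid mask in one go.
 */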
/* All table line arguments are defined here */
static struct arg_name_flag {
	const uint32_t flag;
	const char *name;
} _arg_name_flags[] = {
	{ CTR_FLAG_SYNC, "sync"},
	{ CTR_FLAG_NOSYNC, "nosync"},
	{ CTR_FLAG_REBUILD, "rebuild"},
	{ CTR_FLAG_DAEMON_SLEEP, "daemon_sleep"},
	{ CTR_FLAG_MIN_RECOVERY_RATE, "min_recovery_rate"},
	{ CTR_FLAG_MAX_RECOVERY_RATE, "max_recovery_rate"},
	{ CTR_FLAG_MAX_WRITE_BEHIND, "max_write_behind"},
	{ CTR_FLAG_WRITE_MOSTLY, "write_mostly"},
	{ CTR_FLAG_STRIPE_CACHE, "stripe_cache"},
	{ CTR_FLAG_REGION_SIZE, "region_size"},
	{ CTR_FLAG_RAID10_COPIES, "raid10_copies"},
	{ CTR_FLAG_RAID10_FORMAT, "raid10_format"},
	{ CTR_FLAG_DATA_OFFSET, "data_offset"},
	{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
	{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
};
/* Return argument name string for given @flag */
static const char *dm_raid_arg_name_by_flag(const uint32_t flag)
{
	if (hweight32(flag) == 1) {
		struct arg_name_flag *anf = _arg_name_flags + ARRAY_SIZE(_arg_name_flags);

		while (anf-- > _arg_name_flags)
			if (_test_flag(flag, anf->flag))
				return anf->name;

	} else
		DMERR("%s called with more than one flag!", __func__);

	return NULL;
}
/*
 * bool helpers to test for various raid levels of a raid set,
 * i.e. its level as reported by the superblock rather than
 * the requested raid_type passed to the constructor.
 */

/* Return true, if raid set in @rs is raid0 */
static bool rs_is_raid0(struct raid_set *rs)
{
	return !rs->md.level;
}

/* Return true, if raid set in @rs is raid10 */
static bool rs_is_raid10(struct raid_set *rs)
{
	return rs->md.level == 10;
}
/*
 * bool helpers to test for various raid levels of a raid type @rt
 */

/* Return true, if raid type in @rt is raid0 */
static bool rt_is_raid0(struct raid_type *rt)
{
	return !rt->level;
}

/* Return true, if raid type in @rt is raid1 */
static bool rt_is_raid1(struct raid_type *rt)
{
	return rt->level == 1;
}

/* Return true, if raid type in @rt is raid10 */
static bool rt_is_raid10(struct raid_type *rt)
{
	return rt->level == 10;
}

/* Return true, if raid type in @rt is raid4/5 */
static bool rt_is_raid45(struct raid_type *rt)
{
	return _in_range(rt->level, 4, 5);
}

/* Return true, if raid type in @rt is raid6 */
static bool rt_is_raid6(struct raid_type *rt)
{
	return rt->level == 6;
}

/* Return true, if raid type in @rt is raid4/5/6 */
static bool rt_is_raid456(struct raid_type *rt)
{
	return _in_range(rt->level, 4, 6);
}
/* END: raid level bools */
/* Return invalid ctr flags for the raid level of @rs */
static uint32_t _invalid_flags(struct raid_set *rs)
{
	if (rt_is_raid0(rs->raid_type))
		return RAID0_INVALID_FLAGS;
	else if (rt_is_raid1(rs->raid_type))
		return RAID1_INVALID_FLAGS;
	else if (rt_is_raid10(rs->raid_type))
		return RAID10_INVALID_FLAGS;
	else if (rt_is_raid45(rs->raid_type))
		return RAID45_INVALID_FLAGS;
	else if (rt_is_raid6(rs->raid_type))
		return RAID6_INVALID_FLAGS;

	return ~0;
}
/*
 * Check for any invalid flags set on @rs defined by bitset @invalid_flags
 *
 * Has to be called after parsing of the ctr flags!
 */
static int rs_check_for_invalid_flags(struct raid_set *rs)
{
	if (_test_flags(rs->ctr_flags, _invalid_flags(rs))) {
		rs->ti->error = "Invalid flags combined";
		return -EINVAL;
	}

	return 0;
}
/* MD raid10 bit definitions and helpers */
#define RAID10_OFFSET			(1 << 16) /* stripes with data copies are adjacent on devices */
#define RAID10_BROCKEN_USE_FAR_SETS	(1 << 17) /* Broken in raid10.c: use sets instead of whole stripe rotation */
#define RAID10_USE_FAR_SETS		(1 << 18) /* Use sets instead of whole stripe rotation */
#define RAID10_FAR_COPIES_SHIFT		8	  /* raid10 # far copies shift (2nd byte of layout) */
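
/*
 * Illustrative decomposition of the md raid10 layout word (not part of
 * the driver): near copies live in the low byte, far copies in the
 * second byte, and bit 16 marks "offset" layouts.  E.g.
 *
 *	layout 0x102:   near = 0x02, far = 0x01	-> "near" with 2 copies
 *	layout 0x10201: near = 1, far = 2, offset bit set -> "offset"
 */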
/* Return md raid10 near copies for @layout */
static unsigned int _raid10_near_copies(int layout)
{
	return layout & 0xFF;
}

/* Return md raid10 far copies for @layout */
static unsigned int _raid10_far_copies(int layout)
{
	return _raid10_near_copies(layout >> RAID10_FAR_COPIES_SHIFT);
}

/* Return true if md raid10 offset for @layout */
static unsigned int _is_raid10_offset(int layout)
{
	return layout & RAID10_OFFSET;
}

/* Return true if md raid10 near for @layout */
static unsigned int _is_raid10_near(int layout)
{
	return !_is_raid10_offset(layout) && _raid10_near_copies(layout) > 1;
}

/* Return true if md raid10 far for @layout */
static unsigned int _is_raid10_far(int layout)
{
	return !_is_raid10_offset(layout) && _raid10_far_copies(layout) > 1;
}
/* Return md raid10 layout string for @layout */
static const char *raid10_md_layout_to_format(int layout)
{
	/*
	 * Bit 16 stands for "offset"
	 * (i.e. adjacent stripes hold copies)
	 *
	 * Refer to MD's raid10.c for details
	 */
	if (_is_raid10_offset(layout))
		return "offset";

	if (_raid10_near_copies(layout) > 1)
		return "near";

	WARN_ON(_raid10_far_copies(layout) < 2);

	return "far";
}
/* Return md raid10 algorithm for @name */
static const int raid10_name_to_format(const char *name)
{
	if (!strcasecmp(name, "near"))
		return ALGORITHM_RAID10_NEAR;
	else if (!strcasecmp(name, "offset"))
		return ALGORITHM_RAID10_OFFSET;
	else if (!strcasecmp(name, "far"))
		return ALGORITHM_RAID10_FAR;

	return -EINVAL;
}
/* Return md raid10 copies for @layout */
static unsigned int raid10_md_layout_to_copies(int layout)
{
	return _raid10_near_copies(layout) > 1 ?
	       _raid10_near_copies(layout) : _raid10_far_copies(layout);
}
/* Return md raid10 format id for @format string */
static int raid10_format_to_md_layout(struct raid_set *rs,
				      unsigned int algorithm,
				      unsigned int copies)
{
	unsigned int n = 1, f = 1, r = 0;

	/*
	 * MD resilience flaw:
	 *
	 * enabling use_far_sets for far/offset formats causes copies
	 * to be colocated on the same devs together with their origins!
	 *
	 * -> disable it for now in the definition above
	 */
	if (algorithm == ALGORITHM_RAID10_DEFAULT ||
	    algorithm == ALGORITHM_RAID10_NEAR)
		n = copies;

	else if (algorithm == ALGORITHM_RAID10_OFFSET) {
		f = copies;
		r = RAID10_OFFSET;
		if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
			r |= RAID10_USE_FAR_SETS;

	} else if (algorithm == ALGORITHM_RAID10_FAR) {
		f = copies;
		r = !RAID10_OFFSET;
		if (!_test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags))
			r |= RAID10_USE_FAR_SETS;

	} else
		return -EINVAL;

	return r | (f << RAID10_FAR_COPIES_SHIFT) | n;
}
/* END: MD raid10 bit definitions and helpers */
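
/*
 * Worked example (illustrative only): for "far" with 2 copies and
 * 'raid10_use_near_sets' not given, raid10_format_to_md_layout()
 * returns
 *
 *	RAID10_USE_FAR_SETS | (2 << RAID10_FAR_COPIES_SHIFT) | 1
 *	= 0x40000 | 0x200 | 0x1 = 0x40201
 *
 * which raid10_md_layout_to_copies() maps back to 2 copies.
 */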
/* Check for any of the raid10 algorithms */
static int _got_raid10(struct raid_type *rtp, const int layout)
{
	if (rtp->level == 10) {
		switch (rtp->algorithm) {
		case ALGORITHM_RAID10_DEFAULT:
		case ALGORITHM_RAID10_NEAR:
			return _is_raid10_near(layout);
		case ALGORITHM_RAID10_OFFSET:
			return _is_raid10_offset(layout);
		case ALGORITHM_RAID10_FAR:
			return _is_raid10_far(layout);
		default:
			break;
		}
	}

	return 0;
}
/* Return raid_type for @name */
static struct raid_type *get_raid_type(const char *name)
{
	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);

	while (rtp-- > raid_types)
		if (!strcasecmp(rtp->name, name))
			return rtp;

	return NULL;
}
/* Return raid_type derived from @level and @layout */
static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
{
	struct raid_type *rtp = raid_types + ARRAY_SIZE(raid_types);

	while (rtp-- > raid_types) {
		/* RAID10 special checks based on @layout flags/properties */
		if (rtp->level == level &&
		    (_got_raid10(rtp, layout) || rtp->algorithm == layout))
			return rtp;
	}

	return NULL;
}
/*
 * Set the mddev properties in @rs to the current
 * ones retrieved from the freshest superblock
 */
static void rs_set_cur(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
}

/*
 * Set the mddev properties in @rs to the new
 * ones requested by the ctr
 */
static void rs_set_new(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;

	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->raid_disks = rs->raid_disks;
	mddev->delta_disks = 0;
}
static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
{
	unsigned i;
	struct raid_set *rs;

	if (raid_devs <= raid_type->parity_devs) {
		ti->error = "Insufficient number of devices";
		return ERR_PTR(-EINVAL);
	}

	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
	if (!rs) {
		ti->error = "Cannot allocate raid context";
		return ERR_PTR(-ENOMEM);
	}

	mddev_init(&rs->md);

	rs->raid_disks = raid_devs;
	rs->delta_disks = 0;

	rs->ti = ti;
	rs->raid_type = raid_type;
	rs->md.raid_disks = raid_devs;
	rs->md.level = raid_type->level;
	rs->md.new_level = rs->md.level;
	rs->md.layout = raid_type->algorithm;
	rs->md.new_layout = rs->md.layout;
	rs->md.delta_disks = 0;
	rs->md.recovery_cp = rs_is_raid0(rs) ? MaxSector : 0;

	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);

	/*
	 * Remaining items to be initialized by further RAID params:
	 *  rs->md.persistent
	 *  rs->md.external
	 *  rs->md.chunk_sectors
	 *  rs->md.new_chunk_sectors
	 *  rs->md.dev_sectors
	 */

	return rs;
}
static void context_free(struct raid_set *rs)
{
	int i;

	for (i = 0; i < rs->md.raid_disks; i++) {
		if (rs->dev[i].meta_dev)
			dm_put_device(rs->ti, rs->dev[i].meta_dev);
		md_rdev_clear(&rs->dev[i].rdev);
		if (rs->dev[i].data_dev)
			dm_put_device(rs->ti, rs->dev[i].data_dev);
	}

	kfree(rs);
}
/*
 * For every device we have two words
 *  <meta_dev>: meta device name or '-' if missing
 *  <data_dev>: data device name or '-' if missing
 *
 * The following are permitted:
 *    - -
 *    - <data_dev>
 *    <meta_dev> <data_dev>
 *
 * The following is not allowed:
 *    <meta_dev> -
 *
 * This code parses those words. If there is a failure,
 * the caller must use context_free to unwind the operations.
 */
static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
{
	int i;
	int rebuild = 0;
	int metadata_available = 0;
	int r = 0;
	const char *arg;

	/* Step over the number of raid devices argument to get to the dev pairs */
	arg = dm_shift_arg(as);
	if (!arg)
		return -EINVAL;

	for (i = 0; i < rs->md.raid_disks; i++) {
		rs->dev[i].rdev.raid_disk = i;

		rs->dev[i].meta_dev = NULL;
		rs->dev[i].data_dev = NULL;

		/*
		 * There are no offsets, since there is a separate device
		 * for data and metadata.
		 */
		rs->dev[i].rdev.data_offset = 0;
		rs->dev[i].rdev.mddev = &rs->md;

		arg = dm_shift_arg(as);
		if (!arg)
			return -EINVAL;

		if (strcmp(arg, "-")) {
			r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
					  &rs->dev[i].meta_dev);
			if (r) {
				rs->ti->error = "RAID metadata device lookup failure";
				return r;
			}

			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
			if (!rs->dev[i].rdev.sb_page) {
				rs->ti->error = "Failed to allocate superblock page";
				return -ENOMEM;
			}
		}

		arg = dm_shift_arg(as);
		if (!arg)
			return -EINVAL;

		if (!strcmp(arg, "-")) {
			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
			    (!rs->dev[i].rdev.recovery_offset)) {
				rs->ti->error = "Drive designated for rebuild not specified";
				return -EINVAL;
			}

			if (rs->dev[i].meta_dev) {
				rs->ti->error = "No data device supplied with metadata device";
				return -EINVAL;
			}

			continue;
		}

		r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
				  &rs->dev[i].data_dev);
		if (r) {
			rs->ti->error = "RAID device lookup failure";
			return r;
		}

		if (rs->dev[i].meta_dev) {
			metadata_available = 1;
			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
		}
		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
		list_add_tail(&rs->dev[i].rdev.same_set, &rs->md.disks);
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
			rebuild++;
	}

	if (metadata_available) {
		rs->md.external = 0;
		rs->md.persistent = 1;
		rs->md.major_version = 2;
	} else if (rebuild && !rs->md.recovery_cp) {
		/*
		 * Without metadata, we will not be able to tell if the array
		 * is in-sync or not - we must assume it is not. Therefore,
		 * it is impossible to rebuild a drive.
		 *
		 * Even if there is metadata, the on-disk information may
		 * indicate that the array is not in-sync and it will then
		 * fail at that time.
		 *
		 * User could specify 'nosync' option if desperate.
		 */
		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
		return -EINVAL;
	}

	return 0;
}
/*
 * validate_region_size
 * @rs
 * @region_size: region size in sectors. If 0, pick a size (4MiB default).
 *
 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
	unsigned long min_region_size = rs->ti->len / (1 << 21);

	if (!region_size) {
		/*
		 * Choose a reasonable default. All figures in sectors.
		 */
		if (min_region_size > (1 << 13)) {
			/* If not a power of 2, make it the next power of 2 */
			region_size = roundup_pow_of_two(min_region_size);
			DMINFO("Choosing default region size of %lu sectors",
			       region_size);
		} else {
			DMINFO("Choosing default region size of 4MiB");
			region_size = 1 << 13; /* sectors */
		}
	} else {
		/*
		 * Validate user-supplied value.
		 */
		if (region_size > rs->ti->len) {
			rs->ti->error = "Supplied region size is too large";
			return -EINVAL;
		}

		if (region_size < min_region_size) {
			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
			      region_size, min_region_size);
			rs->ti->error = "Supplied region size is too small";
			return -EINVAL;
		}

		if (!is_power_of_2(region_size)) {
			rs->ti->error = "Region size is not a power of 2";
			return -EINVAL;
		}

		if (region_size < rs->md.chunk_sectors) {
			rs->ti->error = "Region size is smaller than the chunk size";
			return -EINVAL;
		}
	}

	/*
	 * Convert sectors to bytes.
	 */
	rs->md.bitmap_info.chunksize = (region_size << 9);

	return 0;
}
/*
 * validate_raid_redundancy
 * @rs
 *
 * Determine if there are enough devices in the array that haven't
 * failed (or are being rebuilt) to form a usable array.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
static int validate_raid_redundancy(struct raid_set *rs)
{
	unsigned i, rebuild_cnt = 0;
	unsigned rebuilds_per_group = 0, copies, d;
	unsigned group_size, last_group_start;

	for (i = 0; i < rs->md.raid_disks; i++)
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
		    !rs->dev[i].rdev.sb_page)
			rebuild_cnt++;

	switch (rs->raid_type->level) {
	case 1:
		if (rebuild_cnt >= rs->md.raid_disks)
			goto too_many;
		break;
	case 4:
	case 5:
	case 6:
		if (rebuild_cnt > rs->raid_type->parity_devs)
			goto too_many;
		break;
	case 10:
		copies = raid10_md_layout_to_copies(rs->md.layout);
		if (rebuild_cnt < copies)
			break;

		/*
		 * It is possible to have a higher rebuild count for RAID10,
		 * as long as the failed devices occur in different mirror
		 * groups (i.e. different stripes).
		 *
		 * When checking "near" format, make sure no adjacent devices
		 * have failed beyond what can be handled. In addition to the
		 * simple case where the number of devices is a multiple of the
		 * number of copies, we must also handle cases where the number
		 * of devices is not a multiple of the number of copies.
		 * E.g.	dev1 dev2 dev3 dev4 dev5
		 *	 A    A	   B	B    C
		 *	 C    D	   D	E    E
		 */
		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
			for (i = 0; i < rs->md.raid_disks * copies; i++) {
				if (!(i % copies))
					rebuilds_per_group = 0;
				d = i % rs->md.raid_disks;
				if ((!rs->dev[d].rdev.sb_page ||
				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
				    (++rebuilds_per_group >= copies))
					goto too_many;
			}
			break;
		}

		/*
		 * When checking "far" and "offset" formats, we need to ensure
		 * that the device that holds its copy is not also dead or
		 * being rebuilt. (Note that "far" and "offset" formats only
		 * support two copies right now. These formats also only ever
		 * use the 'use_far_sets' variant.)
		 *
		 * This check is somewhat complicated by the need to account
		 * for arrays that are not a multiple of (far) copies. This
		 * results in the need to treat the last (potentially larger)
		 * set differently.
		 */
		group_size = (rs->md.raid_disks / copies);
		last_group_start = (rs->md.raid_disks / group_size) - 1;
		last_group_start *= group_size;
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (!(i % copies) && !(i > last_group_start))
				rebuilds_per_group = 0;
			if ((!rs->dev[i].rdev.sb_page ||
			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
			    (++rebuilds_per_group >= copies))
				goto too_many;
		}
		break;
	default:
		if (rebuild_cnt)
			return -EINVAL;
	}

	return 0;

too_many:
	return -EINVAL;
}
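
/*
 * Worked example (illustrative only): in the 5-device, 2-copy "near"
 * diagram above, rebuilding dev2 and dev4 together is tolerated (no
 * mirror pair is lost), whereas dev1 plus dev2 would take out both
 * copies of chunk A and the scan above jumps to 'too_many'.
 */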
/*
 * Possible arguments are...
 *	<chunk_size> [optional_args]
 *
 * Argument definitions
 *    <chunk_size>			The number of sectors per disk that
 *					will form the "stripe"
 *    [[no]sync]			Force or prevent recovery of the
 *					entire array
 *    [rebuild <idx>]			Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]		Time between bitmap daemon work to
 *					clear bits
 *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [write_mostly <idx>]		Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
 *    [region_size <sectors>]		Defines granularity of bitmap
 *
 * RAID10-only options:
 *    [raid10_copies <# copies>]	Number of copies. (Default: 2)
 *    [raid10_format <near|far|offset>]	Layout algorithm. (Default: near)
 */
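
/*
 * Example table line (illustrative values only), as it might be passed
 * to dmsetup for a 5-device raid4 with a 1MiB chunk and one optional
 * argument pair:
 *
 *	0 1960893648 raid raid4 3 2048 region_size 1024 \
 *	    5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 *
 * i.e. 3 raid params ("2048 region_size 1024"), then the device count
 * and the <meta_dev> <data_dev> pairs parsed by parse_dev_params().
 */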
static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
			     unsigned num_raid_params)
{
	int raid10_format = ALGORITHM_RAID10_DEFAULT;
	unsigned raid10_copies = 2;
	unsigned i;
	unsigned value, region_size = 0;
	sector_t sectors_per_dev = rs->ti->len;
	sector_t max_io_len;
	const char *arg, *key;
	struct raid_dev *rd;
	struct raid_type *rt = rs->raid_type;

	arg = dm_shift_arg(as);
	num_raid_params--; /* Account for chunk_size argument */

	if (kstrtouint(arg, 10, &value) < 0) {
		rs->ti->error = "Bad numerical argument given for chunk_size";
		return -EINVAL;
	}

	/*
	 * First, parse the in-order required arguments
	 * "chunk_size" is the only argument of this type.
	 */
	if (rt_is_raid1(rt)) {
		if (value)
			DMERR("Ignoring chunk size parameter for RAID 1");
		value = 0;
	} else if (!is_power_of_2(value)) {
		rs->ti->error = "Chunk size must be a power of 2";
		return -EINVAL;
	} else if (value < 8) {
		rs->ti->error = "Chunk size value is too small";
		return -EINVAL;
	}

	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;

	/*
	 * We set each individual device as In_sync with a completed
	 * 'recovery_offset'. If there has been a device failure or
	 * replacement then one of the following cases applies:
	 *
	 *   1) User specifies 'rebuild'.
	 *	- Device is reset when param is read.
	 *   2) A new device is supplied.
	 *	- No matching superblock found, resets device.
	 *   3) Device failure was transient and returns on reload.
	 *	- Failure noticed, resets device for bitmap replay.
	 *   4) Device hadn't completed recovery after previous failure.
	 *	- Superblock is read and overrides recovery_offset.
	 *
	 * What is found in the superblocks of the devices is always
	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
	 */
	for (i = 0; i < rs->md.raid_disks; i++) {
		set_bit(In_sync, &rs->dev[i].rdev.flags);
		rs->dev[i].rdev.recovery_offset = MaxSector;
	}

	/*
	 * Second, parse the unordered optional arguments
	 */
	for (i = 0; i < num_raid_params; i++) {
		key = dm_shift_arg(as);
		if (!key) {
			rs->ti->error = "Not enough raid parameters given";
			return -EINVAL;
		}

		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC))) {
			if (_test_and_set_flag(CTR_FLAG_NOSYNC, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'nosync' argument allowed";
				return -EINVAL;
			}
			rs->md.recovery_cp = MaxSector;
			continue;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_SYNC))) {
			if (_test_and_set_flag(CTR_FLAG_SYNC, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'sync' argument allowed";
				return -EINVAL;
			}
			rs->md.recovery_cp = 0;
			continue;
		}
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_USE_NEAR_SETS))) {
			if (_test_and_set_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'raid10_use_near_sets' argument allowed";
				return -EINVAL;
			}
			continue;
		}

		arg = dm_shift_arg(as);
		i++; /* Account for the argument pairs */
		if (!arg) {
			rs->ti->error = "Wrong number of raid parameters given";
			return -EINVAL;
		}

		/*
		 * Parameters that take a string value are checked here.
		 */
		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
			if (_test_and_set_flag(CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
				rs->ti->error = "Only one 'raid10_format' argument pair allowed";
				return -EINVAL;
			}
			if (!rt_is_raid10(rt)) {
				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
				return -EINVAL;
			}
			raid10_format = raid10_name_to_format(arg);
			if (raid10_format < 0) {
				rs->ti->error = "Invalid 'raid10_format' value given";
				return raid10_format;
			}
			continue;
		}

		if (kstrtouint(arg, 10, &value) < 0) {
			rs->ti->error = "Bad numerical argument given in raid params";
			return -EINVAL;
		}

		if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD))) {
			/*
			 * "rebuild" is being passed in by userspace to provide
			 * indexes of replaced devices and to set up additional
			 * devices on raid level takeover.
			 */
			if (!_in_range(value, 0, rs->raid_disks - 1)) {
				rs->ti->error = "Invalid rebuild index given";
				return -EINVAL;
			}

			if (test_and_set_bit(value, (void *) rs->rebuild_disks)) {
				rs->ti->error = "rebuild for this index already given";
				return -EINVAL;
			}

			rd = rs->dev + value;
			clear_bit(In_sync, &rd->rdev.flags);
			clear_bit(Faulty, &rd->rdev.flags);
			rd->rdev.recovery_offset = 0;
			_set_flag(CTR_FLAG_REBUILD, &rs->ctr_flags);
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY))) {
			if (!rt_is_raid1(rt)) {
				rs->ti->error = "write_mostly option is only valid for RAID1";
				return -EINVAL;
			}

			if (!_in_range(value, 0, rs->md.raid_disks - 1)) {
				rs->ti->error = "Invalid write_mostly index given";
				return -EINVAL;
			}

			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
			_set_flag(CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags);
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND))) {
			if (!rt_is_raid1(rt)) {
				rs->ti->error = "max_write_behind option is only valid for RAID1";
				return -EINVAL;
			}

			if (_test_and_set_flag(CTR_FLAG_MAX_WRITE_BEHIND, &rs->ctr_flags)) {
				rs->ti->error = "Only one max_write_behind argument pair allowed";
				return -EINVAL;
			}

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB
			 */
			value /= 2;
			if (value > COUNTER_MAX) {
				rs->ti->error = "Max write-behind limit out of range";
				return -EINVAL;
			}

			rs->md.bitmap_info.max_write_behind = value;
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP))) {
			if (_test_and_set_flag(CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) {
				rs->ti->error = "Only one daemon_sleep argument pair allowed";
				return -EINVAL;
			}
			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
				rs->ti->error = "daemon sleep period out of range";
				return -EINVAL;
			}
			rs->md.bitmap_info.daemon_sleep = value;
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET))) {
			/* Userspace passes new data_offset after having extended the data image LV */
			if (_test_and_set_flag(CTR_FLAG_DATA_OFFSET, &rs->ctr_flags)) {
				rs->ti->error = "Only one data_offset argument pair allowed";
				return -EINVAL;
			}

			/* Ensure sensible data offset */
			if (value < 0) {
				rs->ti->error = "Bogus data_offset value";
				return -EINVAL;
			}

			rs->data_offset = value;
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS))) {
			/* Define the +/-# of disks to add to/remove from the given raid set */
			if (_test_and_set_flag(CTR_FLAG_DELTA_DISKS, &rs->ctr_flags)) {
				rs->ti->error = "Only one delta_disks argument pair allowed";
				return -EINVAL;
			}

			/* Ensure MAX_RAID_DEVICES and raid type minimal_devs! */
			if (!_in_range(abs(value), 1, MAX_RAID_DEVICES - rt->minimal_devs)) {
				rs->ti->error = "Too many delta_disks requested";
				return -EINVAL;
			}

			rs->delta_disks = value;
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE))) {
			if (_test_and_set_flag(CTR_FLAG_STRIPE_CACHE, &rs->ctr_flags)) {
				rs->ti->error = "Only one stripe_cache argument pair allowed";
				return -EINVAL;
			}

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB
			 */
			value /= 2;

			if (!rt_is_raid456(rt)) {
				rs->ti->error = "Inappropriate argument: stripe_cache";
				return -EINVAL;
			}
			if (raid5_set_cache_size(&rs->md, (int)value)) {
				rs->ti->error = "Bad stripe_cache size";
				return -EINVAL;
			}

		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE))) {
			if (_test_and_set_flag(CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) {
				rs->ti->error = "Only one min_recovery_rate argument pair allowed";
				return -EINVAL;
			}
			if (value > INT_MAX) {
				rs->ti->error = "min_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_min = (int)value;
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE))) {
			if (_test_and_set_flag(CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) {
				rs->ti->error = "Only one max_recovery_rate argument pair allowed";
				return -EINVAL;
			}
			if (value > INT_MAX) {
				rs->ti->error = "max_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_max = (int)value;
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE))) {
			if (_test_and_set_flag(CTR_FLAG_REGION_SIZE, &rs->ctr_flags)) {
				rs->ti->error = "Only one region_size argument pair allowed";
				return -EINVAL;
			}

			region_size = value;
		} else if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES))) {
			if (_test_and_set_flag(CTR_FLAG_RAID10_COPIES, &rs->ctr_flags)) {
				rs->ti->error = "Only one raid10_copies argument pair allowed";
				return -EINVAL;
			}

			if (!_in_range(value, 2, rs->md.raid_disks)) {
				rs->ti->error = "Bad value for 'raid10_copies'";
				return -EINVAL;
			}

			raid10_copies = value;
		} else {
			DMERR("Unable to parse RAID parameter: %s", key);
			rs->ti->error = "Unable to parse RAID parameter";
			return -EINVAL;
		}
	}

	if (validate_region_size(rs, region_size))
		return -EINVAL;

	if (rs->md.chunk_sectors)
		max_io_len = rs->md.chunk_sectors;
	else
		max_io_len = region_size;

	if (dm_set_target_max_io_len(rs->ti, max_io_len))
		return -EINVAL;

	if (rt_is_raid10(rt)) {
		if (raid10_copies > rs->md.raid_disks) {
			rs->ti->error = "Not enough devices to satisfy specification";
			return -EINVAL;
		}

		rs->md.new_layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
		if (rs->md.new_layout < 0) {
			rs->ti->error = "Error getting raid10 format";
			return rs->md.new_layout;
		}

		rt = get_raid_type_by_ll(10, rs->md.new_layout);
		if (!rt) {
			rs->ti->error = "Failed to recognize new raid10 layout";
			return -EINVAL;
		}

		if ((rt->algorithm == ALGORITHM_RAID10_DEFAULT ||
		     rt->algorithm == ALGORITHM_RAID10_NEAR) &&
		    _test_flag(CTR_FLAG_RAID10_USE_NEAR_SETS, rs->ctr_flags)) {
			rs->ti->error = "RAID10 format 'near' and 'raid10_use_near_sets' are incompatible";
			return -EINVAL;
		}

		/* (Len * #mirrors) / #devices */
		sectors_per_dev = rs->ti->len * raid10_copies;
		sector_div(sectors_per_dev, rs->md.raid_disks);

		rs->md.layout = raid10_format_to_md_layout(rs, raid10_format, raid10_copies);
		rs->md.new_layout = rs->md.layout;
	} else if (!rt_is_raid1(rt) &&
		   sector_div(sectors_per_dev, (rs->md.raid_disks - rt->parity_devs))) {
		rs->ti->error = "Target length not divisible by number of data devices";
		return -EINVAL;
	}

	rs->raid10_copies = raid10_copies;
	rs->md.dev_sectors = sectors_per_dev;

	/* Assume there are no metadata devices until the drives are parsed */
	rs->md.persistent = 0;
	rs->md.external = 1;

	/* Check, if any invalid ctr arguments have been passed in for the raid level */
	return rs_check_for_invalid_flags(rs);
}
/* Return # of data stripes as kept in mddev as of @rs (i.e. as of superblock) */
static unsigned int mddev_data_stripes(struct raid_set *rs)
{
	return rs->md.raid_disks - rs->raid_type->parity_devs;
}
static void do_table_event(struct work_struct *ws)
{
	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);

	dm_table_event(rs->ti->table);
}

static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);

	return mddev_congested(&rs->md, bits);
}
/*
 * Make sure a valid takeover (level switch) is being requested on @rs
 *
 * Conversions of raid sets from one MD personality to another
 * have to conform to restrictions which are enforced here.
 *
 * Degradation is already checked for in rs_check_conversion() below.
 */
static int rs_check_takeover(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;
	unsigned int near_copies;

	switch (mddev->level) {
	case 0:
		/* raid0 -> raid1/5 with one disk */
		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
		    mddev->raid_disks == 1)
			return 0;

		/* raid0 -> raid10 */
		if (mddev->new_level == 10 &&
		    !(rs->raid_disks % 2))
			return 0;

		/* raid0 with multiple disks -> raid4/5/6 */
		if (_in_range(mddev->new_level, 4, 6) &&
		    mddev->new_layout == ALGORITHM_PARITY_N &&
		    mddev->raid_disks > 1)
			return 0;

		break;

	case 10:
		/* Can't takeover raid10_offset! */
		if (_is_raid10_offset(mddev->layout))
			break;

		near_copies = _raid10_near_copies(mddev->layout);

		/* raid10* -> raid0 */
		if (mddev->new_level == 0) {
			/* Can takeover raid10_near with raid disks divisible by data copies! */
			if (near_copies > 1 &&
			    !(mddev->raid_disks % near_copies)) {
				mddev->raid_disks /= near_copies;
				mddev->delta_disks = mddev->raid_disks;
				return 0;
			}

			/* Can takeover raid10_far */
			if (near_copies == 1 &&
			    _raid10_far_copies(mddev->layout) > 1)
				return 0;

			break;
		}

		/* raid10_{near,far} -> raid1 */
		if (mddev->new_level == 1 &&
		    max(near_copies, _raid10_far_copies(mddev->layout)) == mddev->raid_disks)
			return 0;

		/* raid10_{near,far} with 2 disks -> raid4/5 */
		if (_in_range(mddev->new_level, 4, 5) &&
		    mddev->raid_disks == 2)
			return 0;
		break;

	case 1:
		/* raid1 with 2 disks -> raid4/5 */
		if (_in_range(mddev->new_level, 4, 5) &&
		    mddev->raid_disks == 2) {
			mddev->degraded = 1;
			return 0;
		}

		/* raid1 -> raid0 */
		if (mddev->new_level == 0 &&
		    mddev->raid_disks == 1)
			return 0;

		/* raid1 -> raid10 */
		if (mddev->new_level == 10)
			return 0;

		break;

	case 4:
		/* raid4 -> raid0 */
		if (mddev->new_level == 0)
			return 0;

		/* raid4 -> raid1/5 with 2 disks */
		if ((mddev->new_level == 1 || mddev->new_level == 5) &&
		    mddev->raid_disks == 2)
			return 0;

		/* raid4 -> raid5/6 with parity N */
		if (_in_range(mddev->new_level, 5, 6) &&
		    mddev->layout == ALGORITHM_PARITY_N)
			return 0;
		break;

	case 5:
		/* raid5 with parity N -> raid0 */
		if (mddev->new_level == 0 &&
		    mddev->layout == ALGORITHM_PARITY_N)
			return 0;

		/* raid5 with parity N -> raid4 */
		if (mddev->new_level == 4 &&
		    mddev->layout == ALGORITHM_PARITY_N)
			return 0;

		/* raid5 with 2 disks -> raid1/4/10 */
		if ((mddev->new_level == 1 || mddev->new_level == 4 || mddev->new_level == 10) &&
		    mddev->raid_disks == 2)
			return 0;

		/* raid5 with parity N -> raid6 with parity N */
		if (mddev->new_level == 6 &&
		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
		     _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC_6, ALGORITHM_RIGHT_SYMMETRIC_6)))
			return 0;
		break;

	case 6:
		/* raid6 with parity N -> raid0 */
		if (mddev->new_level == 0 &&
		    mddev->layout == ALGORITHM_PARITY_N)
			return 0;

		/* raid6 with parity N -> raid4 */
		if (mddev->new_level == 4 &&
		    mddev->layout == ALGORITHM_PARITY_N)
			return 0;

		/* raid6_*_n with parity N -> raid5_* */
		if (mddev->new_level == 5 &&
		    ((mddev->layout == ALGORITHM_PARITY_N && mddev->new_layout == ALGORITHM_PARITY_N) ||
		     _in_range(mddev->new_layout, ALGORITHM_LEFT_ASYMMETRIC, ALGORITHM_RIGHT_SYMMETRIC)))
			return 0;

	default:
		break;
	}

	rs->ti->error = "takeover not possible";
	return -EPERM;
}
/* True if @rs requested to be taken over */
static bool rs_takeover_requested(struct raid_set *rs)
{
	return rs->md.new_level != rs->md.level;
}
#define FEATURE_FLAG_SUPPORTS_V190	0x1 /* Supports extended superblock */

/* State flags for sb->flags */
#define SB_FLAG_RESHAPE_ACTIVE		0x1
#define SB_FLAG_RESHAPE_BACKWARDS	0x2

/*
 * This structure is never routinely used by userspace, unlike md superblocks.
 * Devices with this superblock should only ever be accessed via device-mapper.
 */
#define DM_RAID_MAGIC 0x64526D44
struct dm_raid_superblock {
	__le32 magic;		/* "DmRd" */
	__le32 compat_features;	/* Used to indicate compatible features (like 1.9.0 ondisk metadata extension) */

	__le32 num_devices;	/* Number of devices in this raid set. (Max 64) */
	__le32 array_position;	/* The position of this drive in the raid set */

	__le64 events;		/* Incremented by md when superblock updated */
	__le64 failed_devices;	/* Pre 1.9.0 part of bit field of devices to */
				/* indicate failures (see extension below) */

	/*
	 * This offset tracks the progress of the repair or replacement of
	 * an individual drive.
	 */
	__le64 disk_recovery_offset;

	/*
	 * This offset tracks the progress of the initial raid set
	 * synchronisation/parity calculation.
	 */
	__le64 array_resync_offset;

	/*
	 * raid characteristics
	 */
	__le32 level;
	__le32 layout;
	__le32 stripe_sectors;

	/********************************************************************
	 * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
	 *
	 * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist
	 */

	__le32 flags; /* Flags defining array states for reshaping */

	/*
	 * This offset tracks the progress of a raid
	 * set reshape in order to be able to restart it
	 */
	__le64 reshape_position;

	/*
	 * These define the properties of the array in case of an interrupted reshape
	 */
	__le32 new_level;
	__le32 new_layout;
	__le32 new_stripe_sectors;
	__le32 delta_disks;

	__le64 array_sectors; /* Array size in sectors */

	/*
	 * Sector offsets to data on devices (reshaping).
	 * Needed to support out of place reshaping, thus
	 * not writing over any stripes whilst converting
	 * them from old to new layout
	 */
	__le64 data_offset;
	__le64 new_data_offset;

	__le64 sectors; /* Used device size in sectors */

	/*
	 * Additional bit field of devices indicating failures to support
	 * up to 256 devices with the 1.9.0 on-disk metadata format
	 */
	__le64 extended_failed_devices[DISKS_ARRAY_ELEMS - 1];

	__le32 incompat_features;	/* Used to indicate any incompatible features */

	/* Always set rest up to logical block size to 0 when writing (see get_metadata_device() below). */
} __packed;
static int read_disk_sb(struct md_rdev *rdev, int size)
{
	BUG_ON(!rdev->sb_page);

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, 1)) {
		DMERR("Failed to read superblock of device at position %d",
		      rdev->raid_disk);
		md_error(rdev->mddev, rdev);
		return -EINVAL;
	}

	rdev->sb_loaded = 1;

	return 0;
}
static void sb_retrieve_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
{
	failed_devices[0] = le64_to_cpu(sb->failed_devices);
	memset(failed_devices + 1, 0, sizeof(sb->extended_failed_devices));

	if (_test_flag(FEATURE_FLAG_SUPPORTS_V190, le32_to_cpu(sb->compat_features))) {
		int i = ARRAY_SIZE(sb->extended_failed_devices);

		while (i--)
			failed_devices[i+1] = le64_to_cpu(sb->extended_failed_devices[i]);
	}
}

static void sb_update_failed_devices(struct dm_raid_superblock *sb, uint64_t *failed_devices)
{
	int i = ARRAY_SIZE(sb->extended_failed_devices);

	sb->failed_devices = cpu_to_le64(failed_devices[0]);
	while (i--)
		sb->extended_failed_devices[i] = cpu_to_le64(failed_devices[i+1]);
}
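
/*
 * Worked example (illustrative only): with the v1.9.0 feature flag set,
 * the failed-device bitfield spans 256 bits, bit k marking raid device
 * k.  A failure of device 70 lands in word 1, bit 6 -- i.e. in
 * extended_failed_devices[0], since sb->failed_devices only carries
 * bits 0..63 of the field.
 */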
/*
 * Synchronize the superblock members with the raid set properties
 *
 * All superblock data is little endian.
 */
static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	bool update_failed_devices = false;
	unsigned int i;
	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
	struct dm_raid_superblock *sb;
	struct raid_set *rs = container_of(mddev, struct raid_set, md);

	/* No metadata device, no superblock */
	if (!rdev->meta_bdev)
		return;

	BUG_ON(!rdev->sb_page);

	sb = page_address(rdev->sb_page);

	sb_retrieve_failed_devices(sb, failed_devices);

	for (i = 0; i < rs->raid_disks; i++)
		if (!rs->dev[i].data_dev || test_bit(Faulty, &rs->dev[i].rdev.flags)) {
			update_failed_devices = true;
			set_bit(i, (void *) failed_devices);
		}

	if (update_failed_devices)
		sb_update_failed_devices(sb, failed_devices);

	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
	sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);

	sb->num_devices = cpu_to_le32(mddev->raid_disks);
	sb->array_position = cpu_to_le32(rdev->raid_disk);

	sb->events = cpu_to_le64(mddev->events);

	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);

	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);

	sb->new_level = cpu_to_le32(mddev->new_level);
	sb->new_layout = cpu_to_le32(mddev->new_layout);
	sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);

	sb->delta_disks = cpu_to_le32(mddev->delta_disks);

	smp_rmb(); /* Make sure we access most recent reshape position */
	sb->reshape_position = cpu_to_le64(mddev->reshape_position);
	if (le64_to_cpu(sb->reshape_position) != MaxSector) {
		/* Flag ongoing reshape */
		sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE);

		if (mddev->delta_disks < 0 || mddev->reshape_backwards)
			sb->flags |= cpu_to_le32(SB_FLAG_RESHAPE_BACKWARDS);
	} else
		/* Flag no reshape */
		_clear_flags(cpu_to_le32(SB_FLAG_RESHAPE_ACTIVE|SB_FLAG_RESHAPE_BACKWARDS), &sb->flags);

	sb->array_sectors = cpu_to_le64(mddev->array_sectors);
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->new_data_offset = cpu_to_le64(rdev->new_data_offset);
	sb->sectors = cpu_to_le64(rdev->sectors);

	/* Zero out the rest of the payload after the size of the superblock */
	memset(sb + 1, 0, rdev->sb_size - sizeof(*sb));
}
/*
 * super_load
 *
 * This function creates a superblock if one is not found on the device
 * and will decide which superblock to use if there's a choice.
 *
 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
 */
static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
{
	int r;
	struct dm_raid_superblock *sb;
	struct dm_raid_superblock *refsb;
	uint64_t events_sb, events_refsb;

	rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev);
	if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) {
		DMERR("superblock size of a logical block is no longer valid");
		return -EINVAL;
	}

	r = read_disk_sb(rdev, rdev->sb_size);
	if (r)
		return r;

	sb = page_address(rdev->sb_page);

	/*
	 * Two cases that we want to write new superblocks and rebuild:
	 * 1) New device (no matching magic number)
	 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
	 */
	if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
	    (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
		super_sync(rdev->mddev, rdev);

		set_bit(FirstUse, &rdev->flags);
		sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);

		/* Force writing of superblocks to disk */
		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);

		/* Any superblock is better than none, choose that if given */
		return refdev ? 0 : 1;
	}

	if (!refdev)
		return 1;

	events_sb = le64_to_cpu(sb->events);

	refsb = page_address(refdev->sb_page);
	events_refsb = le64_to_cpu(refsb->events);

	return (events_sb > events_refsb) ? 1 : 0;
}
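
/*
 * Example (illustrative only): if two metadata devices carry event
 * counters 42 and 41, super_load() returns 1 for the first when the
 * second is passed as @refdev, so analyse_superblocks() below keeps
 * the device with events == 42 as the freshest and uses it to
 * validate the rest of the set.
 */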
static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
{
	int role;
	struct mddev *mddev = &rs->md;
	uint64_t events_sb;
	uint64_t failed_devices[DISKS_ARRAY_ELEMS];
	struct dm_raid_superblock *sb;
	uint32_t new_devs = 0, rebuild_and_new = 0, rebuilds = 0;
	struct md_rdev *r;
	struct dm_raid_superblock *sb2;

	sb = page_address(rdev->sb_page);
	events_sb = le64_to_cpu(sb->events);

	/*
	 * Initialise to 1 if this is a new superblock.
	 */
	mddev->events = events_sb ? : 1;

	mddev->reshape_position = MaxSector;

	/*
	 * Reshaping is supported, e.g. reshape_position is valid
	 * in superblock and superblock content is authoritative.
	 */
	if (_test_flag(FEATURE_FLAG_SUPPORTS_V190, le32_to_cpu(sb->compat_features))) {
		/* Superblock is authoritative wrt given raid set layout! */
		mddev->raid_disks = le32_to_cpu(sb->num_devices);
		mddev->level = le32_to_cpu(sb->level);
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
		mddev->new_level = le32_to_cpu(sb->new_level);
		mddev->new_layout = le32_to_cpu(sb->new_layout);
		mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors);
		mddev->delta_disks = le32_to_cpu(sb->delta_disks);
		mddev->array_sectors = le64_to_cpu(sb->array_sectors);

		/* raid was reshaping and got interrupted */
		if (_test_flag(SB_FLAG_RESHAPE_ACTIVE, le32_to_cpu(sb->flags))) {
			if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags)) {
				DMERR("Reshape requested but raid set is still reshaping");
				return -EINVAL;
			}

			if (mddev->delta_disks < 0 ||
			    (!mddev->delta_disks && _test_flag(SB_FLAG_RESHAPE_BACKWARDS, le32_to_cpu(sb->flags))))
				mddev->reshape_backwards = 1;
			else
				mddev->reshape_backwards = 0;

			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout);
		}

	} else {
		/*
		 * No takeover/reshaping, because we don't have the extended v1.9.0 metadata
		 */
		if (le32_to_cpu(sb->level) != mddev->level) {
			DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
			return -EINVAL;
		}
		if (le32_to_cpu(sb->layout) != mddev->layout) {
			DMERR("Reshaping raid sets not yet supported. (raid layout change)");
			DMERR("	 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
			DMERR("	 Old layout: %s w/ %d copies",
			      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
			      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
			DMERR("	 New layout: %s w/ %d copies",
			      raid10_md_layout_to_format(mddev->layout),
			      raid10_md_layout_to_copies(mddev->layout));
			return -EINVAL;
		}
		if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
			DMERR("Reshaping raid sets not yet supported. (stripe sectors change)");
			return -EINVAL;
		}

		/* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */
		if (!rt_is_raid1(rs->raid_type) &&
		    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
			DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)",
			      sb->num_devices, mddev->raid_disks);
			return -EINVAL;
		}
	}

	/* Table line is checked vs. authoritative superblock */

	if (!_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);

	/*
	 * During load, we set FirstUse if a new superblock was written.
	 * There are two reasons we might not have a superblock:
	 * 1) The raid set is brand new - in which case, all of the
	 *    devices must have their In_sync bit set. Also,
	 *    recovery_cp must be 0, unless forced.
	 * 2) This is a new device being added to an old raid set
	 *    and the new device needs to be rebuilt - in which
	 *    case the In_sync bit will /not/ be set and
	 *    recovery_cp must be MaxSector.
	 */
	rdev_for_each(r, mddev) {
		if (test_bit(FirstUse, &r->flags))
			new_devs++;

		if (!test_bit(In_sync, &r->flags)) {
			DMINFO("Device %d specified for rebuild; clearing superblock",
			       r->raid_disk);
			rebuilds++;

			if (test_bit(FirstUse, &r->flags))
				rebuild_and_new++;
		}
	}

	if (new_devs == rs->raid_disks || !rebuilds) {
		/* Replace a broken device */
		if (new_devs == 1 && !rs->delta_disks)
			;
		if (new_devs == rs->raid_disks) {
			DMINFO("Superblocks created for new raid set");
			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
			_set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
			mddev->recovery_cp = 0;
		} else if (new_devs && new_devs != rs->raid_disks && !rebuilds) {
			DMERR("New device injected into existing raid set without "
			      "'delta_disks' or 'rebuild' parameter specified");
			return -EINVAL;
		}
	} else if (new_devs && new_devs != rebuilds) {
		DMERR("%u 'rebuild' devices cannot be injected into"
		      " a raid set with %u other first-time devices",
		      rebuilds, new_devs);
		return -EINVAL;
	} else if (rebuilds) {
		if (rebuild_and_new && rebuilds != rebuild_and_new) {
			DMERR("new device%s provided without 'rebuild'",
			      new_devs > 1 ? "s" : "");
			return -EINVAL;
		} else if (mddev->recovery_cp != MaxSector) {
			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
			      (unsigned long long) mddev->recovery_cp);
			return -EINVAL;
		} else if (mddev->reshape_position != MaxSector) {
			DMERR("'rebuild' specified while raid set is being reshaped");
			return -EINVAL;
		}
	}

	/*
	 * Now we set the Faulty bit for those devices that are
	 * recorded in the superblock as failed.
	 */
	sb_retrieve_failed_devices(sb, failed_devices);
	rdev_for_each(r, mddev) {
		if (!r->sb_page)
			continue;
		sb2 = page_address(r->sb_page);
		sb2->failed_devices = 0;
		memset(sb2->extended_failed_devices, 0, sizeof(sb2->extended_failed_devices));

		/*
		 * Check for any device re-ordering.
		 */
		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
			role = le32_to_cpu(sb2->array_position);
			if (role < 0)
				continue;

			if (role != r->raid_disk) {
				if (_is_raid10_near(mddev->layout)) {
					if (mddev->raid_disks % _raid10_near_copies(mddev->layout) ||
					    rs->raid_disks % rs->raid10_copies) {
						rs->ti->error =
							"Cannot change raid10 near set to odd # of devices!";
						return -EINVAL;
					}

					sb2->array_position = cpu_to_le32(r->raid_disk);

				} else if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)) &&
					   !(rs_is_raid0(rs) && rt_is_raid10(rs->raid_type)) &&
					   !rt_is_raid1(rs->raid_type)) {
					rs->ti->error = "Cannot change device positions in raid set";
					return -EINVAL;
				}

				DMINFO("raid device #%d now at position #%d", role, r->raid_disk);
			}

			/*
			 * Partial recovery is performed on
			 * returning failed devices.
			 */
			if (test_bit(role, (void *) failed_devices))
				set_bit(Faulty, &r->flags);
		}
	}

	return 0;
}
static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
{
	struct mddev *mddev = &rs->md;
	struct dm_raid_superblock *sb;

	if (rs_is_raid0(rs) || !rdev->sb_page)
		return 0;

	sb = page_address(rdev->sb_page);

	/*
	 * If mddev->events is not set, we know we have not yet initialized
	 * the array.
	 */
	if (!mddev->events && super_init_validation(rs, rdev))
		return -EINVAL;

	if (le32_to_cpu(sb->compat_features) != FEATURE_FLAG_SUPPORTS_V190) {
		rs->ti->error = "Unable to assemble array: Unknown flag(s) in compatible feature flags";
		return -EINVAL;
	}

	if (sb->incompat_features) {
		rs->ti->error = "Unable to assemble array: No incompatible feature flags supported yet";
		return -EINVAL;
	}

	/* Enable bitmap creation for RAID levels != 0 */
	mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;

	if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
		/* Retrieve device size stored in superblock to be prepared for shrink */
		rdev->sectors = le64_to_cpu(sb->sectors);
		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
		if (rdev->recovery_offset == MaxSector)
			set_bit(In_sync, &rdev->flags);
		/*
		 * If no reshape in progress -> we're recovering single
		 * disk(s) and have to set the device(s) to out-of-sync
		 */
		else if (rs->md.reshape_position == MaxSector)
			clear_bit(In_sync, &rdev->flags); /* Mandatory for recovery */
	}

	/*
	 * If a device comes back, set it as not In_sync and no longer faulty.
	 */
	if (test_and_clear_bit(Faulty, &rdev->flags)) {
		rdev->recovery_offset = 0;
		clear_bit(In_sync, &rdev->flags);
		rdev->saved_raid_disk = rdev->raid_disk;
	}

	/* Reshape support -> restore respective data offsets */
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = le64_to_cpu(sb->new_data_offset);

	return 0;
}
/*
 * Analyse superblocks and select the freshest.
 */
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
	int r;
	struct raid_dev *dev;
	struct md_rdev *rdev, *tmp, *freshest;
	struct mddev *mddev = &rs->md;

	freshest = NULL;
	rdev_for_each_safe(rdev, tmp, mddev) {
		/*
		 * Skipping super_load due to CTR_FLAG_SYNC will cause
		 * the array to undergo initialization again as
		 * though it were new. This is the intended effect
		 * of the "sync" directive.
		 *
		 * When reshaping capability is added, we must ensure
		 * that the "sync" directive is disallowed during the
		 * reshape.
		 */
		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
			continue;

		if (!rdev->meta_bdev)
			continue;

		r = super_load(rdev, freshest);

		switch (r) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			dev = container_of(rdev, struct raid_dev, rdev);
			if (dev->meta_dev)
				dm_put_device(ti, dev->meta_dev);

			dev->meta_dev = NULL;
			rdev->meta_bdev = NULL;

			if (rdev->sb_page)
				put_page(rdev->sb_page);

			rdev->sb_page = NULL;

			rdev->sb_loaded = 0;

			/*
			 * We might be able to salvage the data device
			 * even though the meta device has failed. For
			 * now, we behave as though '- -' had been
			 * set for this device in the table.
			 */
			if (dev->data_dev)
				dm_put_device(ti, dev->data_dev);

			dev->data_dev = NULL;
			rdev->bdev = NULL;

			list_del(&rdev->same_set);
		}
	}

	if (!freshest)
		return 0;

	if (validate_raid_redundancy(rs)) {
		rs->ti->error = "Insufficient redundancy to activate array";
		return -EINVAL;
	}

	/*
	 * Validation of the freshest device provides the source of
	 * validation for the remaining devices.
	 */
	if (super_validate(rs, freshest)) {
		rs->ti->error = "Unable to assemble array: Invalid superblocks";
		return -EINVAL;
	}

	rdev_for_each(rdev, mddev)
		if ((rdev != freshest) && super_validate(rs, rdev))
			return -EINVAL;

	return 0;
}
/* Userspace reordered disks -> adjust raid_disk indexes in @rs */
static void _reorder_raid_disk_indexes(struct raid_set *rs)
{
	int i = 0;
	struct md_rdev *rdev;

	rdev_for_each(rdev, &rs->md) {
		rdev->raid_disk = i++;
		rdev->saved_raid_disk = rdev->new_raid_disk = -1;
	}
}
/*
 * Setup @rs for takeover by a different raid level
 */
static int rs_setup_takeover(struct raid_set *rs)
{
	struct mddev *mddev = &rs->md;
	struct md_rdev *rdev;
	unsigned int d = mddev->raid_disks = rs->raid_disks;
	sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;

	if (rt_is_raid10(rs->raid_type)) {
		if (mddev->level == 0) {
			/* Userspace reordered disks -> adjust raid_disk indexes */
			_reorder_raid_disk_indexes(rs);

			/* raid0 -> raid10_far layout */
			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
								   rs->raid10_copies);
		} else if (mddev->level == 1)
			/* raid1 -> raid10_near layout */
			mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
								   rs->raid_disks);
		else
			return -EINVAL;
	}

	clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
	mddev->recovery_cp = MaxSector;

	while (d--) {
		rdev = &rs->dev[d].rdev;

		if (test_bit(d, (void *) rs->rebuild_disks)) {
			clear_bit(In_sync, &rdev->flags);
			clear_bit(Faulty, &rdev->flags);
			mddev->recovery_cp = rdev->recovery_offset = 0;
			/* Bitmap has to be created when we do an "up" takeover */
			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
		}

		rdev->new_data_offset = new_data_offset;
	}

	return 0;
}
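
/*
 * Takeover example (illustrative, not from this file): reloading a
 * two-legged "raid1" table as "raid10" over the same devices ends up
 * in rs_setup_takeover() above and maps the set to a raid10_near
 * layout, whereas a "raid0" table reloaded as "raid10" maps to
 * raid10_far after the raid_disk indexes have been reordered.
 */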
/*
 * Enable/disable discard support on RAID set depending on
 * RAID level and discard properties of underlying RAID members.
 */
static void configure_discard_support(struct raid_set *rs)
{
	int i;
	bool raid456;
	struct dm_target *ti = rs->ti;

	/* Assume discards not supported until after checks below. */
	ti->discards_supported = false;

	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);

	for (i = 0; i < rs->md.raid_disks; i++) {
		struct request_queue *q;

		if (!rs->dev[i].rdev.bdev)
			continue;

		q = bdev_get_queue(rs->dev[i].rdev.bdev);
		if (!q || !blk_queue_discard(q))
			return;

		if (raid456) {
			if (!q->limits.discard_zeroes_data)
				return;
			if (!devices_handle_discard_safely) {
				DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
				DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
				return;
			}
		}
	}

	/* All RAID members properly support discards */
	ti->discards_supported = true;

	/*
	 * RAID1 and RAID10 personalities require bio splitting,
	 * RAID0/4/5/6 don't and process large discard bios properly.
	 */
	ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10);
	ti->num_discard_bios = 1;
}
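
/*
 * For raid4/5/6 sets built on devices whose discard_zeroes_data
 * behaviour is trusted, the check above can be overridden via the
 * module parameter declared at the bottom of this file, e.g.
 * (example invocations, not from this file):
 *
 *	modprobe dm-raid devices_handle_discard_safely=Y
 *
 * or at runtime:
 *
 *	echo Y > /sys/module/dm_raid/parameters/devices_handle_discard_safely
 */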
/*
 * Construct a RAID0/1/10/4/5/6 mapping:
 * Args:
 *	<raid_type> <#raid_params> <raid_params>{0,} \
 *	<#raid_devs> [<meta_dev1> <dev1>]{1,}
 *
 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
 * details on possible <raid_params>.
 *
 * Userspace is free to initialize the metadata devices, hence the superblocks to
 * enforce recreation based on the passed in table parameters.
 */
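/*
 * Example table line (taken from Documentation/device-mapper/dm-raid.txt,
 * shown here for illustration): a raid4 set of 5 devices with a 1 MiB
 * chunk size and no metadata devices:
 *
 *	0 1960893648 raid \
 *		raid4 1 2048 \
 *		5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 *
 * "-" denotes a missing metadata device; <#raid_params> is 1 because
 * only the chunk size (2048 sectors) is given.
 */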
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct raid_type *rt;
	unsigned num_raid_params, num_raid_devs;
	struct raid_set *rs = NULL;
	const char *arg;
	struct dm_arg_set as = { argc, argv }, as_nrd;
	struct dm_arg _args[] = {
		{ 0, as.argc, "Cannot understand number of raid parameters" },
		{ 1, 254, "Cannot understand number of raid devices parameters" }
	};

	/* Must have <raid_type> */
	arg = dm_shift_arg(&as);
	if (!arg) {
		ti->error = "No arguments";
		return -EINVAL;
	}

	rt = get_raid_type(arg);
	if (!rt) {
		ti->error = "Unrecognised raid_type";
		return -EINVAL;
	}

	/* Must have <#raid_params> */
	if (dm_read_arg_group(_args, &as, &num_raid_params, &ti->error))
		return -EINVAL;

	/* number of raid device tuples <meta_dev data_dev> */
	as_nrd = as;
	dm_consume_args(&as_nrd, num_raid_params);
	_args[1].max = (as_nrd.argc - 1) / 2;
	if (dm_read_arg(_args + 1, &as_nrd, &num_raid_devs, &ti->error))
		return -EINVAL;

	if (!_in_range(num_raid_devs, 1, MAX_RAID_DEVICES)) {
		ti->error = "Invalid number of supplied raid devices";
		return -EINVAL;
	}

	rs = context_alloc(ti, rt, num_raid_devs);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	r = parse_raid_params(rs, &as, num_raid_params);
	if (r)
		goto bad;

	r = parse_dev_params(rs, &as);
	if (r)
		goto bad;

	rs->md.sync_super = super_sync;

	/*
	 * Backup any new raid set level, layout, ...
	 * requested to be able to compare to superblock
	 * members for conversion decisions.
	 */
	rs_config_backup(rs);

	r = analyse_superblocks(ti, rs);
	if (r)
		goto bad;

	INIT_WORK(&rs->md.event_work, do_table_event);
	ti->private = rs;
	ti->num_flush_bios = 1;

	/* Restore any requested new layout for conversion decision */
	rs_config_restore(rs);

	/*
	 * If a takeover is needed, just set the level to
	 * the new requested one and allow the raid set to run.
	 */
	if (rs_takeover_requested(rs)) {
		r = rs_check_takeover(rs);
		if (r)
			goto bad;

		r = rs_setup_takeover(rs);
		if (r)
			goto bad;

		/* Tell preresume to update superblocks with new layout */
		_set_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
	}

	/* Start raid set read-only and assumed clean to change in raid_resume() */
	rs->md.ro = 1;
	rs->md.in_sync = 1;
	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);

	/* Has to be held on running the array */
	mddev_lock_nointr(&rs->md);
	r = md_run(&rs->md);
	rs->md.in_sync = 0; /* Assume already marked dirty */
	mddev_unlock(&rs->md);

	if (r) {
		ti->error = "Failed to run raid array";
		goto bad;
	}

	if (ti->len != rs->md.array_sectors) {
		ti->error = "Array size does not match requested target length";
		r = -EINVAL;
		goto size_mismatch;
	}
	rs->callbacks.congested_fn = raid_is_congested;
	dm_table_add_target_callbacks(ti->table, &rs->callbacks);

	mddev_suspend(&rs->md);
	return 0;

size_mismatch:
	md_stop(&rs->md);
bad:
	context_free(rs);

	return r;
}
static void raid_dtr(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	list_del_init(&rs->callbacks.list);
	md_stop(&rs->md);
	context_free(rs);
}
static int raid_map(struct dm_target *ti, struct bio *bio)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	mddev->pers->make_request(mddev, bio);

	return DM_MAPIO_SUBMITTED;
}
/* Return string describing the current sync action of @mddev */
static const char *decipher_sync_action(struct mddev *mddev)
{
	if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
		return "frozen";

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
			return "reshape";

		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
				return "resync";
			else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
				return "check";
			return "repair";
		}

		if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
			return "recover";
	}

	return "idle";
}
/*
 * Return status string for @rdev
 *
 * Status characters:
 *  'D' = Dead/Failed device
 *  'a' = Alive but not in-sync
 *  'A' = Alive and in-sync
 */
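/*
 * For example (illustrative): a 3-device set reporting "AaD" has
 * device 0 in-sync, device 1 alive but still rebuilding and
 * device 2 failed.
 */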
static const char *_raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
{
	if (test_bit(Faulty, &rdev->flags))
		return "D";
	else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
		return "a";
	else
		return "A";
}
/* Helper to return resync/reshape progress for @rs and @array_in_sync */
static sector_t rs_get_progress(struct raid_set *rs,
				sector_t resync_max_sectors, bool *array_in_sync)
{
	sector_t r, recovery_cp, curr_resync_completed;
	struct mddev *mddev = &rs->md;

	curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp;
	recovery_cp = mddev->recovery_cp;
	*array_in_sync = false;

	if (rs_is_raid0(rs)) {
		r = resync_max_sectors;
		*array_in_sync = true;

	} else {
		r = mddev->reshape_position;

		/* Reshape is relative to the array size */
		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
		    r != MaxSector) {
			if (r == MaxSector) {
				*array_in_sync = true;
				r = resync_max_sectors;
			} else {
				/* Got to reverse on backward reshape */
				if (mddev->reshape_backwards)
					r = mddev->array_sectors - r;

				/* Divide by # of data stripes */
				sector_div(r, mddev_data_stripes(rs));
			}

		/* Sync is relative to the component device size */
		} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
			r = curr_resync_completed;
		else
			r = recovery_cp;

		if (r == MaxSector) {
			/*
			 * Sync complete.
			 */
			*array_in_sync = true;
			r = resync_max_sectors;
		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
			/*
			 * If "check" or "repair" is occurring, the raid set has
			 * undergone an initial sync and the health characters
			 * should not be 'a' anymore.
			 */
			*array_in_sync = true;
		} else {
			struct md_rdev *rdev;

			/*
			 * The raid set may be doing an initial sync, or it may
			 * be rebuilding individual components. If all the
			 * devices are In_sync, then it is the raid set that is
			 * being initialized.
			 */
			rdev_for_each(rdev, mddev)
				if (!test_bit(In_sync, &rdev->flags))
					*array_in_sync = true;
#if 0
			r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
#endif
		}
	}

	return r;
}
/* Helper to return @dev name or "-" if !@dev */
static const char *_get_dev_name(struct dm_dev *dev)
{
	return dev ? dev->name : "-";
}
static void raid_status(struct dm_target *ti, status_type_t type,
			unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;
	struct r5conf *conf = mddev->private;
	int max_nr_stripes = conf ? conf->max_nr_stripes : 0;
	bool array_in_sync;
	unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
	unsigned int sz = 0;
	unsigned int write_mostly_params = 0;
	sector_t progress, resync_max_sectors, resync_mismatches;
	const char *sync_action;
	struct raid_type *rt;
	struct md_rdev *rdev;

	switch (type) {
	case STATUSTYPE_INFO:
		/* *Should* always succeed */
		rt = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
		if (!rt)
			return;

		DMEMIT("%s %d ", rt ? rt->name : "unknown", mddev->raid_disks);

		/* Access most recent mddev properties for status output */
		smp_rmb();
		/* Get sensible max sectors even if raid set not yet started */
		resync_max_sectors = _test_flag(RT_FLAG_RS_PRERESUMED, rs->runtime_flags) ?
				     mddev->resync_max_sectors : mddev->dev_sectors;
		progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
		resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
				    (unsigned int) atomic64_read(&mddev->resync_mismatches) : 0;
		sync_action = decipher_sync_action(&rs->md);

		/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
		rdev_for_each(rdev, mddev)
			DMEMIT(_raid_dev_status(rdev, array_in_sync));

		/*
		 * In-sync/Reshape ratio:
		 *  The in-sync ratio shows the progress of:
		 *   - Initializing the raid set
		 *   - Rebuilding a subset of devices of the raid set
		 *  The user can distinguish between the two by referring
		 *  to the status characters.
		 *
		 *  The reshape ratio shows the progress of
		 *  changing the raid layout or the number of
		 *  disks of a raid set
		 */
		DMEMIT(" %llu/%llu", (unsigned long long) progress,
		       (unsigned long long) resync_max_sectors);

		/*
		 * Sync action:
		 *   See Documentation/device-mapper/dm-raid.txt for
		 *   information on each of these states.
		 */
		DMEMIT(" %s", sync_action);

		/*
		 * resync_mismatches/mismatch_cnt
		 *   This field shows the number of discrepancies found when
		 *   performing a "check" of the raid set.
		 */
		DMEMIT(" %llu", (unsigned long long) resync_mismatches);

		/*
		 * data_offset (needed for out of space reshaping)
		 *   This field shows the data offset into the data
		 *   image LV where the first stripes data starts.
		 *
		 * We keep data_offset equal on all raid disks of the set,
		 * so retrieving it from the first raid disk is sufficient.
		 */
		DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
		break;
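
	/*
	 * Taken together, the STATUSTYPE_INFO output might look like
	 * (illustrative example, not captured from a real set):
	 *
	 *	raid10 4 AAAA 2097152/2097152 idle 0 4096
	 *
	 * i.e. raid type, device count, health characters, in-sync
	 * ratio, sync action, mismatch count and data offset.
	 */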
	case STATUSTYPE_TABLE:
		/* Report the table line string you would use to construct this raid set */

		/* Calculate raid parameter count */
		rdev_for_each(rdev, mddev)
			if (test_bit(WriteMostly, &rdev->flags))
				write_mostly_params += 2;
		raid_param_cnt += memweight(rs->rebuild_disks,
					    DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks)) * 2 +
				  write_mostly_params +
				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
				  hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
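		/*
		 * E.g. (illustrative): a set constructed with only "nosync"
		 * and "region_size" gives raid_param_cnt = 1 (chunk size)
		 * + 1 (no-arg flag) + 1 * 2 (one-arg flag) = 4.
		 */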
		/* Emit table line */
		DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
		if (_test_flag(CTR_FLAG_RAID10_FORMAT, rs->ctr_flags))
			DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT),
			       raid10_md_layout_to_format(mddev->layout));
		if (_test_flag(CTR_FLAG_RAID10_COPIES, rs->ctr_flags))
			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_COPIES),
			       raid10_md_layout_to_copies(mddev->layout));
		if (_test_flag(CTR_FLAG_NOSYNC, rs->ctr_flags))
			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
		if (_test_flag(CTR_FLAG_SYNC, rs->ctr_flags))
			DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
		if (_test_flag(CTR_FLAG_REGION_SIZE, rs->ctr_flags))
			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_REGION_SIZE),
			       (unsigned long long) to_sector(mddev->bitmap_info.chunksize));
		if (_test_flag(CTR_FLAG_DATA_OFFSET, rs->ctr_flags))
			DMEMIT(" %s %llu", dm_raid_arg_name_by_flag(CTR_FLAG_DATA_OFFSET),
			       (unsigned long long) rs->data_offset);
		if (_test_flag(CTR_FLAG_DAEMON_SLEEP, rs->ctr_flags))
			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
			       mddev->bitmap_info.daemon_sleep);
		if (_test_flag(CTR_FLAG_DELTA_DISKS, rs->ctr_flags))
			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_DELTA_DISKS),
			       mddev->delta_disks);
		if (_test_flag(CTR_FLAG_STRIPE_CACHE, rs->ctr_flags))
			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_STRIPE_CACHE),
			       max_nr_stripes);
		rdev_for_each(rdev, mddev)
			if (test_bit(rdev->raid_disk, (void *) rs->rebuild_disks))
				DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD),
				       rdev->raid_disk);
		rdev_for_each(rdev, mddev)
			if (test_bit(WriteMostly, &rdev->flags))
				DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
				       rdev->raid_disk);
		if (_test_flag(CTR_FLAG_MAX_WRITE_BEHIND, rs->ctr_flags))
			DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_WRITE_BEHIND),
			       mddev->bitmap_info.max_write_behind);
		if (_test_flag(CTR_FLAG_MAX_RECOVERY_RATE, rs->ctr_flags))
			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
			       mddev->sync_speed_max);
		if (_test_flag(CTR_FLAG_MIN_RECOVERY_RATE, rs->ctr_flags))
			DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
			       mddev->sync_speed_min);
		DMEMIT(" %d", rs->raid_disks);
		rdev_for_each(rdev, mddev) {
			struct raid_dev *rd = container_of(rdev, struct raid_dev, rdev);

			DMEMIT(" %s %s", _get_dev_name(rd->meta_dev),
			       _get_dev_name(rd->data_dev));
		}
	}
}
static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	if (!strcasecmp(argv[0], "reshape")) {
		DMERR("Reshape not supported.");
		return -EINVAL;
	}

	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;

	if (!strcasecmp(argv[0], "frozen"))
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	else
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
		if (mddev->sync_thread) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
		}
	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return -EBUSY;
	else if (!strcasecmp(argv[0], "resync"))
		; /* MD_RECOVERY_NEEDED set below */
	else if (!strcasecmp(argv[0], "recover"))
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	else {
		if (!strcasecmp(argv[0], "check"))
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		else if (!!strcasecmp(argv[0], "repair")) /* not "repair" -> bail out */
			return -EINVAL;
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
	if (mddev->ro == 2) {
		/* A write to sync_action is enough to justify
		 * canceling read-auto mode
		 */
		mddev->ro = 0;
		if (!mddev->suspended && mddev->sync_thread)
			md_wakeup_thread(mddev->sync_thread);
	}
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	if (!mddev->suspended && mddev->thread)
		md_wakeup_thread(mddev->thread);

	return 0;
}
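
/*
 * These actions are driven from userspace via the message interface,
 * e.g. (example commands; "my_raid" is a hypothetical mapped device):
 *
 *	dmsetup message my_raid 0 check		# start a scrubbing pass
 *	dmsetup message my_raid 0 idle		# stop it again
 *
 * See Documentation/device-mapper/dm-raid.txt for the full list.
 */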
static int raid_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct raid_set *rs = ti->private;
	unsigned i;
	int r = 0;

	for (i = 0; !r && i < rs->md.raid_disks; i++)
		if (rs->dev[i].data_dev)
			r = fn(ti,
			       rs->dev[i].data_dev,
			       0, /* No offset on data devs */
			       rs->md.dev_sectors,
			       data);

	return r;
}
static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct raid_set *rs = ti->private;
	unsigned chunk_size = rs->md.chunk_sectors << 9;
	struct r5conf *conf = rs->md.private;

	blk_limits_io_min(limits, chunk_size);
	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
}
static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	md_stop_writes(&rs->md);
}

static void raid_postsuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	mddev_suspend(&rs->md);
}
static void attempt_restore_of_faulty_devices(struct raid_set *rs)
{
	int i;
	uint64_t failed_devices, cleared_failed_devices = 0;
	unsigned long flags;
	struct dm_raid_superblock *sb;
	struct md_rdev *r;

	for (i = 0; i < rs->md.raid_disks; i++) {
		r = &rs->dev[i].rdev;
		if (test_bit(Faulty, &r->flags) && r->sb_page &&
		    sync_page_io(r, 0, r->sb_size, r->sb_page, REQ_OP_READ, 0,
				 true)) {
			DMINFO("Faulty %s device #%d has readable super block."
			       " Attempting to revive it.",
			       rs->raid_type->name, i);

			/*
			 * Faulty bit may be set, but sometimes the array can
			 * be suspended before the personalities can respond
			 * by removing the device from the array (i.e. calling
			 * 'hot_remove_disk'). If they haven't yet removed
			 * the failed device, its 'raid_disk' number will be
			 * '>= 0' - meaning we must call this function
			 * ourselves.
			 */
			if ((r->raid_disk >= 0) &&
			    (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
				/* Failed to revive this device, try next */
				continue;

			r->raid_disk = i;
			r->saved_raid_disk = i;
			flags = r->flags;
			clear_bit(Faulty, &r->flags);
			clear_bit(WriteErrorSeen, &r->flags);
			clear_bit(In_sync, &r->flags);
			if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
				r->raid_disk = -1;
				r->saved_raid_disk = -1;
				r->flags = flags;
			} else {
				r->recovery_offset = 0;
				cleared_failed_devices |= 1 << i;
			}
		}
	}
	if (cleared_failed_devices) {
		rdev_for_each(r, &rs->md) {
			sb = page_address(r->sb_page);
			failed_devices = le64_to_cpu(sb->failed_devices);
			failed_devices &= ~cleared_failed_devices;
			sb->failed_devices = cpu_to_le64(failed_devices);
		}
	}
}
/* Load the dirty region bitmap */
static int _bitmap_load(struct raid_set *rs)
{
	int r = 0;

	/* Try loading the bitmap unless "raid0", which does not have one */
	if (!rs_is_raid0(rs) &&
	    !_test_and_set_flag(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
		r = bitmap_load(&rs->md);
		if (r)
			DMERR("Failed to load bitmap");
	}

	return r;
}
static int raid_preresume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	/* This is a resume after a suspend of the set -> it's already started */
	if (_test_and_set_flag(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
		return 0;

	/*
	 * The superblocks need to be updated on disk if the
	 * array is new or _bitmap_load will overwrite them
	 * in core with old data.
	 *
	 * In case the array got modified (takeover/reshape/resize)
	 * or the data offsets on the component devices changed, they
	 * have to be updated as well.
	 *
	 * Have to switch to readwrite and back in order to
	 * allow for the superblock updates.
	 */
	if (_test_and_clear_flag(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) {
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		mddev->ro = 0;
		md_update_sb(mddev, 1);
		mddev->ro = 1;
	}

	/*
	 * Disable/enable discard support on raid set after any
	 * conversion, because devices can have been added
	 */
	configure_discard_support(rs);

	/* Load the bitmap from disk unless raid0 */
	return _bitmap_load(rs);
}
static void raid_resume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	if (_test_and_set_flag(RT_FLAG_RS_RESUMED, &rs->runtime_flags)) {
		/*
		 * A secondary resume while the device is active.
		 * Take this opportunity to check whether any failed
		 * devices are reachable again.
		 */
		attempt_restore_of_faulty_devices(rs);
	}

	mddev->ro = 0;
	mddev->in_sync = 0;
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	if (mddev->suspended)
		mddev_resume(mddev);
}
static struct target_type raid_target = {
	.name = "raid",
	.version = {1, 9, 0},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
	.map = raid_map,
	.status = raid_status,
	.message = raid_message,
	.iterate_devices = raid_iterate_devices,
	.io_hints = raid_io_hints,
	.presuspend = raid_presuspend,
	.postsuspend = raid_postsuspend,
	.preresume = raid_preresume,
	.resume = raid_resume,
};
static int __init dm_raid_init(void)
{
	DMINFO("Loading target version %u.%u.%u",
	       raid_target.version[0],
	       raid_target.version[1],
	       raid_target.version[2]);
	return dm_register_target(&raid_target);
}

static void __exit dm_raid_exit(void)
{
	dm_unregister_target(&raid_target);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");

MODULE_DESCRIPTION(DM_NAME " raid0/1/10/4/5/6 target");
MODULE_ALIAS("dm-raid0");
MODULE_ALIAS("dm-raid1");
MODULE_ALIAS("dm-raid10");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");
MODULE_ALIAS("dm-raid6");
MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");