drivers/md/dm.c
1da177e4
LT
1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
784aae73 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
1da177e4
LT
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm.h"
51e5b2bd 9#include "dm-uevent.h"
1da177e4
LT
10
11#include <linux/init.h>
12#include <linux/module.h>
48c9c27b 13#include <linux/mutex.h>
1da177e4
LT
14#include <linux/moduleparam.h>
15#include <linux/blkpg.h>
16#include <linux/bio.h>
1da177e4
LT
17#include <linux/mempool.h>
18#include <linux/slab.h>
19#include <linux/idr.h>
3ac51e74 20#include <linux/hdreg.h>
3f77316d 21#include <linux/delay.h>
55782138
LZ
22
23#include <trace/events/block.h>
1da177e4 24
72d94861
AK
25#define DM_MSG_PREFIX "core"
26
71a16736
NK
27#ifdef CONFIG_PRINTK
28/*
29 * ratelimit state to be used in DMXXX_LIMIT().
30 */
31DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
32 DEFAULT_RATELIMIT_INTERVAL,
33 DEFAULT_RATELIMIT_BURST);
34EXPORT_SYMBOL(dm_ratelimit_state);
35#endif
36
60935eb2
MB
37/*
38 * Cookies are numeric values sent with CHANGE and REMOVE
39 * uevents while resuming, removing or renaming the device.
40 */
41#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
42#define DM_COOKIE_LENGTH 24
43
1da177e4
LT
44static const char *_name = DM_NAME;
45
46static unsigned int major = 0;
47static unsigned int _major = 0;
48
d15b774c
AK
49static DEFINE_IDR(_minor_idr);
50
f32c10b0 51static DEFINE_SPINLOCK(_minor_lock);
2c140a24
MP
52
53static void do_deferred_remove(struct work_struct *w);
54
55static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
56
1da177e4 57/*
8fbf26ad 58 * For bio-based dm.
1da177e4
LT
59 * One of these is allocated per bio.
60 */
61struct dm_io {
62 struct mapped_device *md;
63 int error;
1da177e4 64 atomic_t io_count;
6ae2fa67 65 struct bio *bio;
3eaf840e 66 unsigned long start_time;
f88fb981 67 spinlock_t endio_lock;
fd2ed4d2 68 struct dm_stats_aux stats_aux;
1da177e4
LT
69};
70
8fbf26ad
KU
71/*
72 * For request-based dm.
73 * One of these is allocated per request.
74 */
75struct dm_rq_target_io {
76 struct mapped_device *md;
77 struct dm_target *ti;
78 struct request *orig, clone;
79 int error;
80 union map_info info;
81};
82
83/*
94818742
KO
84 * For request-based dm - the bio clones we allocate are embedded in these
85 * structs.
86 *
87 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
88 * the bioset is created - this means the bio has to come at the end of the
89 * struct.
8fbf26ad
KU
90 */
91struct dm_rq_clone_bio_info {
92 struct bio *orig;
cec47e3d 93 struct dm_rq_target_io *tio;
94818742 94 struct bio clone;
8fbf26ad
KU
95};
96
1da177e4
LT
97union map_info *dm_get_mapinfo(struct bio *bio)
98{
17b2f66f 99 if (bio && bio->bi_private)
028867ac 100 return &((struct dm_target_io *)bio->bi_private)->info;
17b2f66f 101 return NULL;
1da177e4
LT
102}
103
cec47e3d
KU
104union map_info *dm_get_rq_mapinfo(struct request *rq)
105{
106 if (rq && rq->end_io_data)
107 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
108 return NULL;
109}
110EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
111
ba61fdd1
JM
112#define MINOR_ALLOCED ((void *)-1)
113
1da177e4
LT
114/*
115 * Bits for the md->flags field.
116 */
1eb787ec 117#define DMF_BLOCK_IO_FOR_SUSPEND 0
1da177e4 118#define DMF_SUSPENDED 1
aa8d7c2f 119#define DMF_FROZEN 2
fba9f90e 120#define DMF_FREEING 3
5c6bd75d 121#define DMF_DELETING 4
2e93ccc1 122#define DMF_NOFLUSH_SUSPENDING 5
d5b9dd04 123#define DMF_MERGE_IS_OPTIONAL 6
2c140a24 124#define DMF_DEFERRED_REMOVE 7
1da177e4 125
83d5e5b0
MP
126/*
127 * A dummy definition to make RCU happy.
128 * struct dm_table should never be dereferenced in this file.
129 */
130struct dm_table {
131 int undefined__;
132};
133
304f3f6a
MB
134/*
135 * Work processed by per-device workqueue.
136 */
1da177e4 137struct mapped_device {
83d5e5b0 138 struct srcu_struct io_barrier;
e61290a4 139 struct mutex suspend_lock;
1da177e4 140 atomic_t holders;
5c6bd75d 141 atomic_t open_count;
1da177e4 142
2a7faeb1
MP
143 /*
144 * The current mapping.
145 * Use dm_get_live_table{_fast} or take suspend_lock for
146 * dereference.
147 */
148 struct dm_table *map;
149
1da177e4
LT
150 unsigned long flags;
151
165125e1 152 struct request_queue *queue;
a5664dad 153 unsigned type;
4a0b4ddf 154 /* Protect queue and type against concurrent access. */
a5664dad
MS
155 struct mutex type_lock;
156
36a0456f
AK
157 struct target_type *immutable_target_type;
158
1da177e4 159 struct gendisk *disk;
7e51f257 160 char name[16];
1da177e4
LT
161
162 void *interface_ptr;
163
164 /*
165 * A list of ios that arrived while we were suspended.
166 */
316d315b 167 atomic_t pending[2];
1da177e4 168 wait_queue_head_t wait;
53d5914f 169 struct work_struct work;
74859364 170 struct bio_list deferred;
022c2611 171 spinlock_t deferred_lock;
1da177e4 172
af7e466a 173 /*
29e4013d 174 * Processing queue (flush)
304f3f6a
MB
175 */
176 struct workqueue_struct *wq;
177
1da177e4
LT
178 /*
179 * io objects are allocated from here.
180 */
181 mempool_t *io_pool;
1da177e4 182
9faf400f
SB
183 struct bio_set *bs;
184
1da177e4
LT
185 /*
186 * Event handling.
187 */
188 atomic_t event_nr;
189 wait_queue_head_t eventq;
7a8c3d3b
MA
190 atomic_t uevent_seq;
191 struct list_head uevent_list;
192 spinlock_t uevent_lock; /* Protect access to uevent_list */
1da177e4
LT
193
194 /*
195 * freeze/thaw support requires holding onto a super block
196 */
197 struct super_block *frozen_sb;
db8fef4f 198 struct block_device *bdev;
3ac51e74
DW
199
200 /* forced geometry settings */
201 struct hd_geometry geometry;
784aae73
MB
202
203 /* sysfs handle */
204 struct kobject kobj;
52b1fd5a 205
be35f486
MP
206 /* wait until the kobject is released */
207 struct completion kobj_completion;
208
d87f4c14
TH
209 /* zero-length flush that will be cloned and submitted to targets */
210 struct bio flush_bio;
fd2ed4d2
MP
211
212 struct dm_stats stats;
1da177e4
LT
213};
214
e6ee8c0b
KU
215/*
216 * For mempools pre-allocation at the table loading time.
217 */
218struct dm_md_mempools {
219 mempool_t *io_pool;
e6ee8c0b
KU
220 struct bio_set *bs;
221};
222
6cfa5857
MS
223#define RESERVED_BIO_BASED_IOS 16
224#define RESERVED_REQUEST_BASED_IOS 256
f4790826 225#define RESERVED_MAX_IOS 1024
e18b890b 226static struct kmem_cache *_io_cache;
8fbf26ad 227static struct kmem_cache *_rq_tio_cache;
94818742 228
e8603136
MS
229/*
230 * Bio-based DM's mempools' reserved IOs set by the user.
231 */
232static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
233
f4790826
MS
234/*
235 * Request-based DM's mempools' reserved IOs set by the user.
236 */
237static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
238
239static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
240 unsigned def, unsigned max)
241{
242 unsigned ios = ACCESS_ONCE(*reserved_ios);
243 unsigned modified_ios = 0;
244
245 if (!ios)
246 modified_ios = def;
247 else if (ios > max)
248 modified_ios = max;
249
250 if (modified_ios) {
251 (void)cmpxchg(reserved_ios, ios, modified_ios);
252 ios = modified_ios;
253 }
254
255 return ios;
256}
257
e8603136
MS
258unsigned dm_get_reserved_bio_based_ios(void)
259{
260 return __dm_get_reserved_ios(&reserved_bio_based_ios,
261 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
262}
263EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
264
f4790826
MS
265unsigned dm_get_reserved_rq_based_ios(void)
266{
267 return __dm_get_reserved_ios(&reserved_rq_based_ios,
268 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
269}
270EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
271
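/*
 * Illustrative sketch, not part of dm.c: how a pool-sizing helper might
 * consume the clamped reserve values above.  __dm_get_reserved_ios()
 * normalises whatever the user wrote to the module parameter: 0 falls back
 * to the default and anything above RESERVED_MAX_IOS is capped at 1024.
 * The helper name and its use of _io_cache are assumptions for this example.
 */
static mempool_t * __maybe_unused example_create_io_pool(void)
{
	unsigned pool_size = dm_get_reserved_bio_based_ios();

	/* e.g. reserved_bio_based_ios=0 yields 16, =5000 yields 1024 */
	return mempool_create_slab_pool(pool_size, _io_cache);
}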
1da177e4
LT
272static int __init local_init(void)
273{
51157b4a 274 int r = -ENOMEM;
1da177e4 275
1da177e4 276 /* allocate a slab for the dm_ios */
028867ac 277 _io_cache = KMEM_CACHE(dm_io, 0);
1da177e4 278 if (!_io_cache)
51157b4a 279 return r;
1da177e4 280
8fbf26ad
KU
281 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
282 if (!_rq_tio_cache)
dba14160 283 goto out_free_io_cache;
8fbf26ad 284
51e5b2bd 285 r = dm_uevent_init();
51157b4a 286 if (r)
23e5083b 287 goto out_free_rq_tio_cache;
51e5b2bd 288
1da177e4
LT
289 _major = major;
290 r = register_blkdev(_major, _name);
51157b4a
KU
291 if (r < 0)
292 goto out_uevent_exit;
1da177e4
LT
293
294 if (!_major)
295 _major = r;
296
297 return 0;
51157b4a
KU
298
299out_uevent_exit:
300 dm_uevent_exit();
8fbf26ad
KU
301out_free_rq_tio_cache:
302 kmem_cache_destroy(_rq_tio_cache);
51157b4a
KU
303out_free_io_cache:
304 kmem_cache_destroy(_io_cache);
305
306 return r;
1da177e4
LT
307}
308
309static void local_exit(void)
310{
2c140a24
MP
311 flush_scheduled_work();
312
8fbf26ad 313 kmem_cache_destroy(_rq_tio_cache);
1da177e4 314 kmem_cache_destroy(_io_cache);
00d59405 315 unregister_blkdev(_major, _name);
51e5b2bd 316 dm_uevent_exit();
1da177e4
LT
317
318 _major = 0;
319
320 DMINFO("cleaned up");
321}
322
b9249e55 323static int (*_inits[])(void) __initdata = {
1da177e4
LT
324 local_init,
325 dm_target_init,
326 dm_linear_init,
327 dm_stripe_init,
952b3557 328 dm_io_init,
945fa4d2 329 dm_kcopyd_init,
1da177e4 330 dm_interface_init,
fd2ed4d2 331 dm_statistics_init,
1da177e4
LT
332};
333
b9249e55 334static void (*_exits[])(void) = {
1da177e4
LT
335 local_exit,
336 dm_target_exit,
337 dm_linear_exit,
338 dm_stripe_exit,
952b3557 339 dm_io_exit,
945fa4d2 340 dm_kcopyd_exit,
1da177e4 341 dm_interface_exit,
fd2ed4d2 342 dm_statistics_exit,
1da177e4
LT
343};
344
345static int __init dm_init(void)
346{
347 const int count = ARRAY_SIZE(_inits);
348
349 int r, i;
350
351 for (i = 0; i < count; i++) {
352 r = _inits[i]();
353 if (r)
354 goto bad;
355 }
356
357 return 0;
358
359 bad:
360 while (i--)
361 _exits[i]();
362
363 return r;
364}
365
366static void __exit dm_exit(void)
367{
368 int i = ARRAY_SIZE(_exits);
369
370 while (i--)
371 _exits[i]();
d15b774c
AK
372
373 /*
374 * Should be empty by this point.
375 */
d15b774c 376 idr_destroy(&_minor_idr);
1da177e4
LT
377}
378
379/*
380 * Block device functions
381 */
432a212c
MA
382int dm_deleting_md(struct mapped_device *md)
383{
384 return test_bit(DMF_DELETING, &md->flags);
385}
386
fe5f9f2c 387static int dm_blk_open(struct block_device *bdev, fmode_t mode)
1da177e4
LT
388{
389 struct mapped_device *md;
390
fba9f90e
JM
391 spin_lock(&_minor_lock);
392
fe5f9f2c 393 md = bdev->bd_disk->private_data;
fba9f90e
JM
394 if (!md)
395 goto out;
396
5c6bd75d 397 if (test_bit(DMF_FREEING, &md->flags) ||
432a212c 398 dm_deleting_md(md)) {
fba9f90e
JM
399 md = NULL;
400 goto out;
401 }
402
1da177e4 403 dm_get(md);
5c6bd75d 404 atomic_inc(&md->open_count);
fba9f90e
JM
405
406out:
407 spin_unlock(&_minor_lock);
408
409 return md ? 0 : -ENXIO;
1da177e4
LT
410}
411
db2a144b 412static void dm_blk_close(struct gendisk *disk, fmode_t mode)
1da177e4 413{
fe5f9f2c 414 struct mapped_device *md = disk->private_data;
6e9624b8 415
4a1aeb98
MB
416 spin_lock(&_minor_lock);
417
2c140a24
MP
418 if (atomic_dec_and_test(&md->open_count) &&
419 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
420 schedule_work(&deferred_remove_work);
421
1da177e4 422 dm_put(md);
4a1aeb98
MB
423
424 spin_unlock(&_minor_lock);
1da177e4
LT
425}
426
5c6bd75d
AK
427int dm_open_count(struct mapped_device *md)
428{
429 return atomic_read(&md->open_count);
430}
431
432/*
433 * Guarantees nothing is using the device before it's deleted.
434 */
2c140a24 435int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
5c6bd75d
AK
436{
437 int r = 0;
438
439 spin_lock(&_minor_lock);
440
2c140a24 441 if (dm_open_count(md)) {
5c6bd75d 442 r = -EBUSY;
2c140a24
MP
443 if (mark_deferred)
444 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
445 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
446 r = -EEXIST;
5c6bd75d
AK
447 else
448 set_bit(DMF_DELETING, &md->flags);
449
450 spin_unlock(&_minor_lock);
451
452 return r;
453}
454
2c140a24
MP
455int dm_cancel_deferred_remove(struct mapped_device *md)
456{
457 int r = 0;
458
459 spin_lock(&_minor_lock);
460
461 if (test_bit(DMF_DELETING, &md->flags))
462 r = -EBUSY;
463 else
464 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
465
466 spin_unlock(&_minor_lock);
467
468 return r;
469}
470
471static void do_deferred_remove(struct work_struct *w)
472{
473 dm_deferred_remove();
474}
475
fd2ed4d2
MP
476sector_t dm_get_size(struct mapped_device *md)
477{
478 return get_capacity(md->disk);
479}
480
481struct dm_stats *dm_get_stats(struct mapped_device *md)
482{
483 return &md->stats;
484}
485
3ac51e74
DW
486static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
487{
488 struct mapped_device *md = bdev->bd_disk->private_data;
489
490 return dm_get_geometry(md, geo);
491}
492
fe5f9f2c 493static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
aa129a22
MB
494 unsigned int cmd, unsigned long arg)
495{
fe5f9f2c 496 struct mapped_device *md = bdev->bd_disk->private_data;
83d5e5b0 497 int srcu_idx;
6c182cd8 498 struct dm_table *map;
aa129a22
MB
499 struct dm_target *tgt;
500 int r = -ENOTTY;
501
6c182cd8 502retry:
83d5e5b0
MP
503 map = dm_get_live_table(md, &srcu_idx);
504
aa129a22
MB
505 if (!map || !dm_table_get_size(map))
506 goto out;
507
508 /* We only support devices that have a single target */
509 if (dm_table_get_num_targets(map) != 1)
510 goto out;
511
512 tgt = dm_table_get_target(map, 0);
513
4f186f8b 514 if (dm_suspended_md(md)) {
aa129a22
MB
515 r = -EAGAIN;
516 goto out;
517 }
518
519 if (tgt->type->ioctl)
647b3d00 520 r = tgt->type->ioctl(tgt, cmd, arg);
aa129a22
MB
521
522out:
83d5e5b0 523 dm_put_live_table(md, srcu_idx);
aa129a22 524
6c182cd8
HR
525 if (r == -ENOTCONN) {
526 msleep(10);
527 goto retry;
528 }
529
aa129a22
MB
530 return r;
531}
532
028867ac 533static struct dm_io *alloc_io(struct mapped_device *md)
1da177e4
LT
534{
535 return mempool_alloc(md->io_pool, GFP_NOIO);
536}
537
028867ac 538static void free_io(struct mapped_device *md, struct dm_io *io)
1da177e4
LT
539{
540 mempool_free(io, md->io_pool);
541}
542
028867ac 543static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
1da177e4 544{
dba14160 545 bio_put(&tio->clone);
1da177e4
LT
546}
547
08885643
KU
548static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
549 gfp_t gfp_mask)
cec47e3d 550{
5f015204 551 return mempool_alloc(md->io_pool, gfp_mask);
cec47e3d
KU
552}
553
554static void free_rq_tio(struct dm_rq_target_io *tio)
555{
5f015204 556 mempool_free(tio, tio->md->io_pool);
cec47e3d
KU
557}
558
90abb8c4
KU
559static int md_in_flight(struct mapped_device *md)
560{
561 return atomic_read(&md->pending[READ]) +
562 atomic_read(&md->pending[WRITE]);
563}
564
3eaf840e
JNN
565static void start_io_acct(struct dm_io *io)
566{
567 struct mapped_device *md = io->md;
fd2ed4d2 568 struct bio *bio = io->bio;
c9959059 569 int cpu;
fd2ed4d2 570 int rw = bio_data_dir(bio);
3eaf840e
JNN
571
572 io->start_time = jiffies;
573
074a7aca
TH
574 cpu = part_stat_lock();
575 part_round_stats(cpu, &dm_disk(md)->part0);
576 part_stat_unlock();
1e9bb880
SL
577 atomic_set(&dm_disk(md)->part0.in_flight[rw],
578 atomic_inc_return(&md->pending[rw]));
fd2ed4d2
MP
579
580 if (unlikely(dm_stats_used(&md->stats)))
581 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
582 bio_sectors(bio), false, 0, &io->stats_aux);
3eaf840e
JNN
583}
584
d221d2e7 585static void end_io_acct(struct dm_io *io)
3eaf840e
JNN
586{
587 struct mapped_device *md = io->md;
588 struct bio *bio = io->bio;
589 unsigned long duration = jiffies - io->start_time;
c9959059 590 int pending, cpu;
3eaf840e
JNN
591 int rw = bio_data_dir(bio);
592
074a7aca
TH
593 cpu = part_stat_lock();
594 part_round_stats(cpu, &dm_disk(md)->part0);
595 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
596 part_stat_unlock();
3eaf840e 597
fd2ed4d2
MP
598 if (unlikely(dm_stats_used(&md->stats)))
599 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
600 bio_sectors(bio), true, duration, &io->stats_aux);
601
af7e466a
MP
602 /*
603 * After this is decremented the bio must not be touched if it is
d87f4c14 604 * a flush.
af7e466a 605 */
1e9bb880
SL
606 pending = atomic_dec_return(&md->pending[rw]);
607 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
316d315b 608 pending += atomic_read(&md->pending[rw^0x1]);
3eaf840e 609
d221d2e7
MP
610 /* nudge anyone waiting on suspend queue */
611 if (!pending)
612 wake_up(&md->wait);
3eaf840e
JNN
613}
614
1da177e4
LT
615/*
616 * Add the bio to the list of deferred io.
617 */
92c63902 618static void queue_io(struct mapped_device *md, struct bio *bio)
1da177e4 619{
05447420 620 unsigned long flags;
1da177e4 621
05447420 622 spin_lock_irqsave(&md->deferred_lock, flags);
1da177e4 623 bio_list_add(&md->deferred, bio);
05447420 624 spin_unlock_irqrestore(&md->deferred_lock, flags);
6a8736d1 625 queue_work(md->wq, &md->work);
1da177e4
LT
626}
627
628/*
629 * Everyone (including functions in this file) should use this
630 * function to access the md->map field, and make sure they call
83d5e5b0 631 * dm_put_live_table() when finished.
1da177e4 632 */
83d5e5b0 633struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
1da177e4 634{
83d5e5b0
MP
635 *srcu_idx = srcu_read_lock(&md->io_barrier);
636
637 return srcu_dereference(md->map, &md->io_barrier);
638}
1da177e4 639
83d5e5b0
MP
640void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
641{
642 srcu_read_unlock(&md->io_barrier, srcu_idx);
643}
644
645void dm_sync_table(struct mapped_device *md)
646{
647 synchronize_srcu(&md->io_barrier);
648 synchronize_rcu_expedited();
649}
650
651/*
652 * A fast alternative to dm_get_live_table/dm_put_live_table.
653 * The caller must not block between these two functions.
654 */
655static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
656{
657 rcu_read_lock();
658 return rcu_dereference(md->map);
659}
1da177e4 660
83d5e5b0
MP
661static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
662{
663 rcu_read_unlock();
1da177e4
LT
664}
665
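/*
 * Illustrative sketch, not part of dm.c: the access pattern prescribed by
 * the comment above.  The table reference is only valid between
 * dm_get_live_table() and dm_put_live_table(); callers that never block may
 * use the _fast variants instead.  The function name is an assumption for
 * this example.
 */
static sector_t __maybe_unused example_live_table_size(struct mapped_device *md)
{
	int srcu_idx;
	sector_t size = 0;
	struct dm_table *map = dm_get_live_table(md, &srcu_idx);

	if (map)
		size = dm_table_get_size(map);

	dm_put_live_table(md, srcu_idx);

	return size;
}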
3ac51e74
DW
666/*
667 * Get the geometry associated with a dm device
668 */
669int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
670{
671 *geo = md->geometry;
672
673 return 0;
674}
675
676/*
677 * Set the geometry of a device.
678 */
679int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
680{
681 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
682
683 if (geo->start > sz) {
684 DMWARN("Start sector is beyond the geometry limits.");
685 return -EINVAL;
686 }
687
688 md->geometry = *geo;
689
690 return 0;
691}
692
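/*
 * Illustrative sketch, not part of dm.c: dm_set_geometry() rejects a
 * geometry whose start sector lies beyond cylinders * heads * sectors.
 * The values below are assumptions chosen for this example.
 */
static int __maybe_unused example_force_geometry(struct mapped_device *md)
{
	struct hd_geometry geo = {
		.heads     = 255,
		.sectors   = 63,
		.cylinders = 1024,
		.start     = 0,	/* anything above 255 * 63 * 1024 would fail */
	};

	return dm_set_geometry(md, &geo);
}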
1da177e4
LT
693/*-----------------------------------------------------------------
694 * CRUD START:
695 * A more elegant solution is in the works that uses the queue
696 * merge fn, unfortunately there are a couple of changes to
697 * the block layer that I want to make for this. So in the
698 * interests of getting something for people to use I give
699 * you this clearly demarcated crap.
700 *---------------------------------------------------------------*/
701
2e93ccc1
KU
702static int __noflush_suspending(struct mapped_device *md)
703{
704 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
705}
706
1da177e4
LT
707/*
708 * Decrements the number of outstanding ios that a bio has been
709 * cloned into, completing the original io if necessary.
710 */
858119e1 711static void dec_pending(struct dm_io *io, int error)
1da177e4 712{
2e93ccc1 713 unsigned long flags;
b35f8caa
MB
714 int io_error;
715 struct bio *bio;
716 struct mapped_device *md = io->md;
2e93ccc1
KU
717
718 /* Push-back supersedes any I/O errors */
f88fb981
KU
719 if (unlikely(error)) {
720 spin_lock_irqsave(&io->endio_lock, flags);
721 if (!(io->error > 0 && __noflush_suspending(md)))
722 io->error = error;
723 spin_unlock_irqrestore(&io->endio_lock, flags);
724 }
1da177e4
LT
725
726 if (atomic_dec_and_test(&io->io_count)) {
2e93ccc1
KU
727 if (io->error == DM_ENDIO_REQUEUE) {
728 /*
729 * Target requested pushing back the I/O.
2e93ccc1 730 */
022c2611 731 spin_lock_irqsave(&md->deferred_lock, flags);
6a8736d1
TH
732 if (__noflush_suspending(md))
733 bio_list_add_head(&md->deferred, io->bio);
734 else
2e93ccc1
KU
735 /* noflush suspend was interrupted. */
736 io->error = -EIO;
022c2611 737 spin_unlock_irqrestore(&md->deferred_lock, flags);
2e93ccc1
KU
738 }
739
b35f8caa
MB
740 io_error = io->error;
741 bio = io->bio;
6a8736d1
TH
742 end_io_acct(io);
743 free_io(md, io);
744
745 if (io_error == DM_ENDIO_REQUEUE)
746 return;
2e93ccc1 747
b372d360 748 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
af7e466a 749 /*
6a8736d1
TH
750 * Preflush done for flush with data, reissue
751 * without REQ_FLUSH.
af7e466a 752 */
6a8736d1
TH
753 bio->bi_rw &= ~REQ_FLUSH;
754 queue_io(md, bio);
af7e466a 755 } else {
b372d360 756 /* done with normal IO or empty flush */
0a82a8d1 757 trace_block_bio_complete(md->queue, bio, io_error);
b372d360 758 bio_endio(bio, io_error);
b35f8caa 759 }
1da177e4
LT
760 }
761}
762
6712ecf8 763static void clone_endio(struct bio *bio, int error)
1da177e4
LT
764{
765 int r = 0;
028867ac 766 struct dm_target_io *tio = bio->bi_private;
b35f8caa 767 struct dm_io *io = tio->io;
9faf400f 768 struct mapped_device *md = tio->io->md;
1da177e4
LT
769 dm_endio_fn endio = tio->ti->type->end_io;
770
1da177e4
LT
771 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
772 error = -EIO;
773
774 if (endio) {
7de3ee57 775 r = endio(tio->ti, bio, error);
2e93ccc1
KU
776 if (r < 0 || r == DM_ENDIO_REQUEUE)
777 /*
778 * error and requeue request are handled
779 * in dec_pending().
780 */
1da177e4 781 error = r;
45cbcd79
KU
782 else if (r == DM_ENDIO_INCOMPLETE)
783 /* The target will handle the io */
6712ecf8 784 return;
45cbcd79
KU
785 else if (r) {
786 DMWARN("unimplemented target endio return value: %d", r);
787 BUG();
788 }
1da177e4
LT
789 }
790
9faf400f 791 free_tio(md, tio);
b35f8caa 792 dec_pending(io, error);
1da177e4
LT
793}
794
cec47e3d
KU
795/*
796 * Partial completion handling for request-based dm
797 */
798static void end_clone_bio(struct bio *clone, int error)
799{
800 struct dm_rq_clone_bio_info *info = clone->bi_private;
801 struct dm_rq_target_io *tio = info->tio;
802 struct bio *bio = info->orig;
803 unsigned int nr_bytes = info->orig->bi_size;
804
805 bio_put(clone);
806
807 if (tio->error)
808 /*
809 * An error has already been detected on the request.
810 * Once an error has occurred, just let clone->end_io() handle
811 * the remainder.
812 */
813 return;
814 else if (error) {
815 /*
816 * Don't notify the upper layer of the error yet.
817 * The error handling decision is made by the target driver
818 * when the request is completed.
819 */
820 tio->error = error;
821 return;
822 }
823
824 /*
825 * I/O for the bio completed successfully.
826 * Notify the upper layer of the data completion.
827 */
828
829 /*
830 * bios are processed from the head of the list.
831 * So the completing bio should always be rq->bio.
832 * If it's not, something is wrong.
833 */
834 if (tio->orig->bio != bio)
835 DMERR("bio completion is going in the middle of the request");
836
837 /*
838 * Update the original request.
839 * Do not use blk_end_request() here, because it may complete
840 * the original request before the clone, and break the ordering.
841 */
842 blk_update_request(tio->orig, 0, nr_bytes);
843}
844
845/*
846 * Don't touch any member of the md after calling this function because
847 * the md may be freed in dm_put() at the end of this function.
848 * Or do dm_get() before calling this function and dm_put() later.
849 */
b4324fee 850static void rq_completed(struct mapped_device *md, int rw, int run_queue)
cec47e3d 851{
b4324fee 852 atomic_dec(&md->pending[rw]);
cec47e3d
KU
853
854 /* nudge anyone waiting on suspend queue */
b4324fee 855 if (!md_in_flight(md))
cec47e3d
KU
856 wake_up(&md->wait);
857
a8c32a5c
JA
858 /*
859 * Run this off this callpath, as drivers could invoke end_io while
860 * inside their request_fn (and holding the queue lock). Calling
861 * back into ->request_fn() could deadlock attempting to grab the
862 * queue lock again.
863 */
cec47e3d 864 if (run_queue)
a8c32a5c 865 blk_run_queue_async(md->queue);
cec47e3d
KU
866
867 /*
868 * dm_put() must be at the end of this function. See the comment above
869 */
870 dm_put(md);
871}
872
a77e28c7
KU
873static void free_rq_clone(struct request *clone)
874{
875 struct dm_rq_target_io *tio = clone->end_io_data;
876
877 blk_rq_unprep_clone(clone);
878 free_rq_tio(tio);
879}
880
980691e5
KU
881/*
882 * Complete the clone and the original request.
883 * Must be called without queue lock.
884 */
885static void dm_end_request(struct request *clone, int error)
886{
887 int rw = rq_data_dir(clone);
888 struct dm_rq_target_io *tio = clone->end_io_data;
889 struct mapped_device *md = tio->md;
890 struct request *rq = tio->orig;
891
29e4013d 892 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
980691e5
KU
893 rq->errors = clone->errors;
894 rq->resid_len = clone->resid_len;
895
896 if (rq->sense)
897 /*
898 * We are using the sense buffer of the original
899 * request.
900 * So setting the length of the sense data is enough.
901 */
902 rq->sense_len = clone->sense_len;
903 }
904
905 free_rq_clone(clone);
29e4013d
TH
906 blk_end_request_all(rq, error);
907 rq_completed(md, rw, true);
980691e5
KU
908}
909
cec47e3d
KU
910static void dm_unprep_request(struct request *rq)
911{
912 struct request *clone = rq->special;
cec47e3d
KU
913
914 rq->special = NULL;
915 rq->cmd_flags &= ~REQ_DONTPREP;
916
a77e28c7 917 free_rq_clone(clone);
cec47e3d
KU
918}
919
920/*
921 * Requeue the original request of a clone.
922 */
923void dm_requeue_unmapped_request(struct request *clone)
924{
b4324fee 925 int rw = rq_data_dir(clone);
cec47e3d
KU
926 struct dm_rq_target_io *tio = clone->end_io_data;
927 struct mapped_device *md = tio->md;
928 struct request *rq = tio->orig;
929 struct request_queue *q = rq->q;
930 unsigned long flags;
931
932 dm_unprep_request(rq);
933
934 spin_lock_irqsave(q->queue_lock, flags);
cec47e3d
KU
935 blk_requeue_request(q, rq);
936 spin_unlock_irqrestore(q->queue_lock, flags);
937
b4324fee 938 rq_completed(md, rw, 0);
cec47e3d
KU
939}
940EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
941
942static void __stop_queue(struct request_queue *q)
943{
944 blk_stop_queue(q);
945}
946
947static void stop_queue(struct request_queue *q)
948{
949 unsigned long flags;
950
951 spin_lock_irqsave(q->queue_lock, flags);
952 __stop_queue(q);
953 spin_unlock_irqrestore(q->queue_lock, flags);
954}
955
956static void __start_queue(struct request_queue *q)
957{
958 if (blk_queue_stopped(q))
959 blk_start_queue(q);
960}
961
962static void start_queue(struct request_queue *q)
963{
964 unsigned long flags;
965
966 spin_lock_irqsave(q->queue_lock, flags);
967 __start_queue(q);
968 spin_unlock_irqrestore(q->queue_lock, flags);
969}
970
11a68244 971static void dm_done(struct request *clone, int error, bool mapped)
cec47e3d 972{
11a68244 973 int r = error;
cec47e3d 974 struct dm_rq_target_io *tio = clone->end_io_data;
ba1cbad9 975 dm_request_endio_fn rq_end_io = NULL;
cec47e3d 976
ba1cbad9
MS
977 if (tio->ti) {
978 rq_end_io = tio->ti->type->rq_end_io;
979
980 if (mapped && rq_end_io)
981 r = rq_end_io(tio->ti, clone, error, &tio->info);
982 }
cec47e3d 983
11a68244 984 if (r <= 0)
cec47e3d 985 /* The target wants to complete the I/O */
11a68244
KU
986 dm_end_request(clone, r);
987 else if (r == DM_ENDIO_INCOMPLETE)
cec47e3d
KU
988 /* The target will handle the I/O */
989 return;
11a68244 990 else if (r == DM_ENDIO_REQUEUE)
cec47e3d
KU
991 /* The target wants to requeue the I/O */
992 dm_requeue_unmapped_request(clone);
993 else {
11a68244 994 DMWARN("unimplemented target endio return value: %d", r);
cec47e3d
KU
995 BUG();
996 }
997}
998
11a68244
KU
999/*
1000 * Request completion handler for request-based dm
1001 */
1002static void dm_softirq_done(struct request *rq)
1003{
1004 bool mapped = true;
1005 struct request *clone = rq->completion_data;
1006 struct dm_rq_target_io *tio = clone->end_io_data;
1007
1008 if (rq->cmd_flags & REQ_FAILED)
1009 mapped = false;
1010
1011 dm_done(clone, tio->error, mapped);
1012}
1013
cec47e3d
KU
1014/*
1015 * Complete the clone and the original request with the error status
1016 * through softirq context.
1017 */
1018static void dm_complete_request(struct request *clone, int error)
1019{
1020 struct dm_rq_target_io *tio = clone->end_io_data;
1021 struct request *rq = tio->orig;
1022
1023 tio->error = error;
1024 rq->completion_data = clone;
1025 blk_complete_request(rq);
1026}
1027
1028/*
1029 * Complete the not-mapped clone and the original request with the error status
1030 * through softirq context.
1031 * Target's rq_end_io() function isn't called.
1032 * This may be used when the target's map_rq() function fails.
1033 */
1034void dm_kill_unmapped_request(struct request *clone, int error)
1035{
1036 struct dm_rq_target_io *tio = clone->end_io_data;
1037 struct request *rq = tio->orig;
1038
1039 rq->cmd_flags |= REQ_FAILED;
1040 dm_complete_request(clone, error);
1041}
1042EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
1043
1044/*
1045 * Called with the queue lock held
1046 */
1047static void end_clone_request(struct request *clone, int error)
1048{
1049 /*
1050 * This is just for cleaning up the information of the queue in which
1051 * the clone was dispatched.
1052 * The clone is *NOT* actually freed here because it is allocated from
1053 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
1054 */
1055 __blk_put_request(clone->q, clone);
1056
1057 /*
1058 * Actual request completion is done in a softirq context which doesn't
1059 * hold the queue lock. Otherwise, deadlock could occur because:
1060 * - another request may be submitted by the upper level driver
1061 * of the stacking during the completion
1062 * - the submission which requires queue lock may be done
1063 * against this queue
1064 */
1065 dm_complete_request(clone, error);
1066}
1067
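/*
 * Illustrative summary, not part of dm.c: the request-based completion path
 * implemented above, assuming dm_softirq_done() has been registered as the
 * queue's softirq completion callback when the request-based queue was set
 * up.  For a clone that completes normally:
 *
 *   end_clone_request(clone, error)          queue lock held
 *     -> dm_complete_request(clone, error)   stash clone + error in tio
 *       -> blk_complete_request(rq)          defer to softirq context
 *         -> dm_softirq_done(rq)             no queue lock held
 *           -> dm_done(clone, error, mapped)
 *             -> dm_end_request() or dm_requeue_unmapped_request()
 */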
56a67df7
MS
1068/*
1069 * Return maximum size of I/O possible at the supplied sector up to the current
1070 * target boundary.
1071 */
1072static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1073{
1074 sector_t target_offset = dm_target_offset(ti, sector);
1075
1076 return ti->len - target_offset;
1077}
1078
1079static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1da177e4 1080{
56a67df7 1081 sector_t len = max_io_len_target_boundary(sector, ti);
542f9038 1082 sector_t offset, max_len;
1da177e4
LT
1083
1084 /*
542f9038 1085 * Does the target need to split even further?
1da177e4 1086 */
542f9038
MS
1087 if (ti->max_io_len) {
1088 offset = dm_target_offset(ti, sector);
1089 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1090 max_len = sector_div(offset, ti->max_io_len);
1091 else
1092 max_len = offset & (ti->max_io_len - 1);
1093 max_len = ti->max_io_len - max_len;
1094
1095 if (len > max_len)
1096 len = max_len;
1da177e4
LT
1097 }
1098
1099 return len;
1100}
1101
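/*
 * Worked example, not part of dm.c: for a target that starts at sector 0,
 * is 1000 sectors long and set max_io_len = 64, an I/O arriving at sector 70
 * is first limited to 930 sectors by the target boundary and then trimmed to
 * 58 sectors so that it ends on the next 64-sector boundary (sector 128).
 */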
542f9038
MS
1102int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1103{
1104 if (len > UINT_MAX) {
1105 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1106 (unsigned long long)len, UINT_MAX);
1107 ti->error = "Maximum size of target IO is too large";
1108 return -EINVAL;
1109 }
1110
1111 ti->max_io_len = (uint32_t) len;
1112
1113 return 0;
1114}
1115EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1116
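/*
 * Illustrative sketch, not part of dm.c: a target constructor would
 * typically call dm_set_target_max_io_len() with its chunk size so that
 * max_io_len() above splits I/O on chunk boundaries.  The function name and
 * the chunk size are assumptions for this example.
 */
static int __maybe_unused example_target_ctr(struct dm_target *ti)
{
	sector_t chunk_size = 8;	/* 4KiB chunks, purely for illustration */

	/* fails with -EINVAL if the length does not fit in 32 bits */
	return dm_set_target_max_io_len(ti, chunk_size);
}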
bd2a49b8 1117static void __map_bio(struct dm_target_io *tio)
1da177e4
LT
1118{
1119 int r;
2056a782 1120 sector_t sector;
9faf400f 1121 struct mapped_device *md;
dba14160 1122 struct bio *clone = &tio->clone;
bd2a49b8 1123 struct dm_target *ti = tio->ti;
1da177e4 1124
1da177e4
LT
1125 clone->bi_end_io = clone_endio;
1126 clone->bi_private = tio;
1127
1128 /*
1129 * Map the clone. If r == 0 we don't need to do
1130 * anything, the target has assumed ownership of
1131 * this io.
1132 */
1133 atomic_inc(&tio->io->io_count);
2056a782 1134 sector = clone->bi_sector;
7de3ee57 1135 r = ti->type->map(ti, clone);
45cbcd79 1136 if (r == DM_MAPIO_REMAPPED) {
1da177e4 1137 /* the bio has been remapped so dispatch it */
2056a782 1138
d07335e5
MS
1139 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1140 tio->io->bio->bi_bdev->bd_dev, sector);
2056a782 1141
1da177e4 1142 generic_make_request(clone);
2e93ccc1
KU
1143 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1144 /* error the io and bail out, or requeue it if needed */
9faf400f
SB
1145 md = tio->io->md;
1146 dec_pending(tio->io, r);
9faf400f 1147 free_tio(md, tio);
45cbcd79
KU
1148 } else if (r) {
1149 DMWARN("unimplemented target map return value: %d", r);
1150 BUG();
1da177e4
LT
1151 }
1152}
1153
1154struct clone_info {
1155 struct mapped_device *md;
1156 struct dm_table *map;
1157 struct bio *bio;
1158 struct dm_io *io;
1159 sector_t sector;
1160 sector_t sector_count;
1161 unsigned short idx;
1162};
1163
bd2a49b8
AK
1164static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1165{
1166 bio->bi_sector = sector;
1167 bio->bi_size = to_bytes(len);
1168}
1169
1170static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1171{
1172 bio->bi_idx = idx;
1173 bio->bi_vcnt = idx + bv_count;
1174 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1175}
1176
1177static void clone_bio_integrity(struct bio *bio, struct bio *clone,
1178 unsigned short idx, unsigned len, unsigned offset,
1179 unsigned trim)
1180{
1181 if (!bio_integrity(bio))
1182 return;
1183
1184 bio_integrity_clone(clone, bio, GFP_NOIO);
1185
1186 if (trim)
1187 bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1188}
1189
1da177e4 1190/*
d87f4c14 1191 * Creates a little bio that just does part of a bvec.
1da177e4 1192 */
14fe594d
AK
1193static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1194 sector_t sector, unsigned short idx,
1195 unsigned offset, unsigned len)
1da177e4 1196{
dba14160 1197 struct bio *clone = &tio->clone;
1da177e4
LT
1198 struct bio_vec *bv = bio->bi_io_vec + idx;
1199
1da177e4
LT
1200 *clone->bi_io_vec = *bv;
1201
bd2a49b8
AK
1202 bio_setup_sector(clone, sector, len);
1203
1da177e4 1204 clone->bi_bdev = bio->bi_bdev;
d87f4c14 1205 clone->bi_rw = bio->bi_rw;
1da177e4 1206 clone->bi_vcnt = 1;
1da177e4
LT
1207 clone->bi_io_vec->bv_offset = offset;
1208 clone->bi_io_vec->bv_len = clone->bi_size;
f3e1d26e 1209 clone->bi_flags |= 1 << BIO_CLONED;
1da177e4 1210
bd2a49b8 1211 clone_bio_integrity(bio, clone, idx, len, offset, 1);
1da177e4
LT
1212}
1213
1214/*
1215 * Creates a bio that consists of range of complete bvecs.
1216 */
dba14160
MP
1217static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1218 sector_t sector, unsigned short idx,
e4c93811 1219 unsigned short bv_count, unsigned len)
1da177e4 1220{
dba14160 1221 struct bio *clone = &tio->clone;
bd2a49b8 1222 unsigned trim = 0;
1da177e4 1223
9faf400f 1224 __bio_clone(clone, bio);
bd2a49b8
AK
1225 bio_setup_sector(clone, sector, len);
1226 bio_setup_bv(clone, idx, bv_count);
1227
1228 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1229 trim = 1;
1230 clone_bio_integrity(bio, clone, idx, len, 0, trim);
1da177e4
LT
1231}
1232
9015df24 1233static struct dm_target_io *alloc_tio(struct clone_info *ci,
bd2a49b8 1234 struct dm_target *ti, int nr_iovecs,
55a62eef 1235 unsigned target_bio_nr)
f9ab94ce 1236{
dba14160
MP
1237 struct dm_target_io *tio;
1238 struct bio *clone;
1239
1240 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
1241 tio = container_of(clone, struct dm_target_io, clone);
f9ab94ce
MP
1242
1243 tio->io = ci->io;
1244 tio->ti = ti;
f9ab94ce 1245 memset(&tio->info, 0, sizeof(tio->info));
55a62eef 1246 tio->target_bio_nr = target_bio_nr;
9015df24
AK
1247
1248 return tio;
1249}
1250
14fe594d
AK
1251static void __clone_and_map_simple_bio(struct clone_info *ci,
1252 struct dm_target *ti,
1253 unsigned target_bio_nr, sector_t len)
9015df24 1254{
55a62eef 1255 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
dba14160 1256 struct bio *clone = &tio->clone;
9015df24 1257
06a426ce
MS
1258 /*
1259 * Discard requests require the bio's inline iovecs be initialized.
1260 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1261 * and discard, so no need for concern about wasted bvec allocations.
1262 */
dba14160 1263 __bio_clone(clone, ci->bio);
bd2a49b8
AK
1264 if (len)
1265 bio_setup_sector(clone, ci->sector, len);
f9ab94ce 1266
bd2a49b8 1267 __map_bio(tio);
f9ab94ce
MP
1268}
1269
14fe594d
AK
1270static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1271 unsigned num_bios, sector_t len)
06a426ce 1272{
55a62eef 1273 unsigned target_bio_nr;
06a426ce 1274
55a62eef 1275 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
14fe594d 1276 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
06a426ce
MS
1277}
1278
14fe594d 1279static int __send_empty_flush(struct clone_info *ci)
f9ab94ce 1280{
06a426ce 1281 unsigned target_nr = 0;
f9ab94ce
MP
1282 struct dm_target *ti;
1283
b372d360 1284 BUG_ON(bio_has_data(ci->bio));
f9ab94ce 1285 while ((ti = dm_table_get_target(ci->map, target_nr++)))
14fe594d 1286 __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
f9ab94ce 1287
f9ab94ce
MP
1288 return 0;
1289}
1290
e4c93811
AK
1291static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1292 sector_t sector, int nr_iovecs,
1293 unsigned short idx, unsigned short bv_count,
1294 unsigned offset, unsigned len,
1295 unsigned split_bvec)
5ae89a87 1296{
dba14160 1297 struct bio *bio = ci->bio;
5ae89a87 1298 struct dm_target_io *tio;
b0d8ed4d
AK
1299 unsigned target_bio_nr;
1300 unsigned num_target_bios = 1;
5ae89a87 1301
b0d8ed4d
AK
1302 /*
1303 * Does the target want to receive duplicate copies of the bio?
1304 */
1305 if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1306 num_target_bios = ti->num_write_bios(ti, bio);
e4c93811 1307
b0d8ed4d
AK
1308 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1309 tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
1310 if (split_bvec)
1311 clone_split_bio(tio, bio, sector, idx, offset, len);
1312 else
1313 clone_bio(tio, bio, sector, idx, bv_count, len);
1314 __map_bio(tio);
1315 }
5ae89a87
MS
1316}
1317
55a62eef 1318typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
23508a96 1319
55a62eef 1320static unsigned get_num_discard_bios(struct dm_target *ti)
23508a96 1321{
55a62eef 1322 return ti->num_discard_bios;
23508a96
MS
1323}
1324
55a62eef 1325static unsigned get_num_write_same_bios(struct dm_target *ti)
23508a96 1326{
55a62eef 1327 return ti->num_write_same_bios;
23508a96
MS
1328}
1329
1330typedef bool (*is_split_required_fn)(struct dm_target *ti);
1331
1332static bool is_split_required_for_discard(struct dm_target *ti)
1333{
55a62eef 1334 return ti->split_discard_bios;
23508a96
MS
1335}
1336
14fe594d
AK
1337static int __send_changing_extent_only(struct clone_info *ci,
1338 get_num_bios_fn get_num_bios,
1339 is_split_required_fn is_split_required)
5ae89a87
MS
1340{
1341 struct dm_target *ti;
a79245b3 1342 sector_t len;
55a62eef 1343 unsigned num_bios;
5ae89a87 1344
a79245b3
MS
1345 do {
1346 ti = dm_table_find_target(ci->map, ci->sector);
1347 if (!dm_target_is_valid(ti))
1348 return -EIO;
5ae89a87 1349
5ae89a87 1350 /*
23508a96
MS
1351 * Even though the device advertised support for this type of
1352 * request, that does not mean every target supports it, and
936688d7 1353 * reconfiguration might also have changed that since the
a79245b3 1354 * check was performed.
5ae89a87 1355 */
55a62eef
AK
1356 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1357 if (!num_bios)
a79245b3 1358 return -EOPNOTSUPP;
5ae89a87 1359
23508a96 1360 if (is_split_required && !is_split_required(ti))
7acf0277
MP
1361 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1362 else
1363 len = min(ci->sector_count, max_io_len(ci->sector, ti));
06a426ce 1364
14fe594d 1365 __send_duplicate_bios(ci, ti, num_bios, len);
a79245b3
MS
1366
1367 ci->sector += len;
1368 } while (ci->sector_count -= len);
5ae89a87
MS
1369
1370 return 0;
1371}
1372
14fe594d 1373static int __send_discard(struct clone_info *ci)
23508a96 1374{
14fe594d
AK
1375 return __send_changing_extent_only(ci, get_num_discard_bios,
1376 is_split_required_for_discard);
23508a96
MS
1377}
1378
14fe594d 1379static int __send_write_same(struct clone_info *ci)
23508a96 1380{
14fe594d 1381 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
23508a96
MS
1382}
1383
e4c93811
AK
1384/*
1385 * Find maximum number of sectors / bvecs we can process with a single bio.
1386 */
1387static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1388{
1389 struct bio *bio = ci->bio;
1390 sector_t bv_len, total_len = 0;
1391
1392 for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1393 bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1394
1395 if (bv_len > max)
1396 break;
1397
1398 max -= bv_len;
1399 total_len += bv_len;
1400 }
1401
1402 return total_len;
1403}
1404
1405static int __split_bvec_across_targets(struct clone_info *ci,
1406 struct dm_target *ti, sector_t max)
1407{
1408 struct bio *bio = ci->bio;
1409 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1410 sector_t remaining = to_sector(bv->bv_len);
1411 unsigned offset = 0;
1412 sector_t len;
1413
1414 do {
1415 if (offset) {
1416 ti = dm_table_find_target(ci->map, ci->sector);
1417 if (!dm_target_is_valid(ti))
1418 return -EIO;
1419
1420 max = max_io_len(ci->sector, ti);
1421 }
1422
1423 len = min(remaining, max);
1424
1425 __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1426 bv->bv_offset + offset, len, 1);
1427
1428 ci->sector += len;
1429 ci->sector_count -= len;
1430 offset += to_bytes(len);
1431 } while (remaining -= len);
1432
1433 ci->idx++;
1434
1435 return 0;
1436}
1437
1438/*
1439 * Select the correct strategy for processing a non-flush bio.
1440 */
14fe594d 1441static int __split_and_process_non_flush(struct clone_info *ci)
1da177e4 1442{
dba14160 1443 struct bio *bio = ci->bio;
512875bd 1444 struct dm_target *ti;
e4c93811
AK
1445 sector_t len, max;
1446 int idx;
1da177e4 1447
5ae89a87 1448 if (unlikely(bio->bi_rw & REQ_DISCARD))
14fe594d 1449 return __send_discard(ci);
23508a96 1450 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
14fe594d 1451 return __send_write_same(ci);
5ae89a87 1452
512875bd
JN
1453 ti = dm_table_find_target(ci->map, ci->sector);
1454 if (!dm_target_is_valid(ti))
1455 return -EIO;
1456
56a67df7 1457 max = max_io_len(ci->sector, ti);
512875bd 1458
e4c93811
AK
1459 /*
1460 * Optimise for the simple case where we can do all of
1461 * the remaining io with a single clone.
1462 */
1da177e4 1463 if (ci->sector_count <= max) {
e4c93811
AK
1464 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1465 ci->idx, bio->bi_vcnt - ci->idx, 0,
1466 ci->sector_count, 0);
1467 ci->sector_count = 0;
1468 return 0;
1469 }
1da177e4 1470
e4c93811
AK
1471 /*
1472 * There are some bvecs that don't span targets.
1473 * Do as many of these as possible.
1474 */
1475 if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1476 len = __len_within_target(ci, max, &idx);
1da177e4 1477
e4c93811
AK
1478 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1479 ci->idx, idx - ci->idx, 0, len, 0);
1da177e4
LT
1480
1481 ci->sector += len;
1482 ci->sector_count -= len;
e4c93811 1483 ci->idx = idx;
d2044a94 1484
e4c93811 1485 return 0;
1da177e4 1486 }
512875bd 1487
e4c93811
AK
1488 /*
1489 * Handle a bvec that must be split between two or more targets.
1490 */
1491 return __split_bvec_across_targets(ci, ti, max);
1da177e4
LT
1492}
1493
1494/*
14fe594d 1495 * Entry point to split a bio into clones and submit them to the targets.
1da177e4 1496 */
83d5e5b0
MP
1497static void __split_and_process_bio(struct mapped_device *md,
1498 struct dm_table *map, struct bio *bio)
1da177e4
LT
1499{
1500 struct clone_info ci;
512875bd 1501 int error = 0;
1da177e4 1502
83d5e5b0 1503 if (unlikely(!map)) {
6a8736d1 1504 bio_io_error(bio);
f0b9a450
MP
1505 return;
1506 }
692d0eb9 1507
83d5e5b0 1508 ci.map = map;
1da177e4 1509 ci.md = md;
1da177e4
LT
1510 ci.io = alloc_io(md);
1511 ci.io->error = 0;
1512 atomic_set(&ci.io->io_count, 1);
1513 ci.io->bio = bio;
1514 ci.io->md = md;
f88fb981 1515 spin_lock_init(&ci.io->endio_lock);
1da177e4 1516 ci.sector = bio->bi_sector;
1da177e4
LT
1517 ci.idx = bio->bi_idx;
1518
3eaf840e 1519 start_io_acct(ci.io);
bd2a49b8 1520
b372d360
MS
1521 if (bio->bi_rw & REQ_FLUSH) {
1522 ci.bio = &ci.md->flush_bio;
1523 ci.sector_count = 0;
14fe594d 1524 error = __send_empty_flush(&ci);
b372d360
MS
1525 /* dec_pending submits any data associated with flush */
1526 } else {
6a8736d1 1527 ci.bio = bio;
d87f4c14 1528 ci.sector_count = bio_sectors(bio);
b372d360 1529 while (ci.sector_count && !error)
14fe594d 1530 error = __split_and_process_non_flush(&ci);
d87f4c14 1531 }
1da177e4
LT
1532
1533 /* drop the extra reference count */
512875bd 1534 dec_pending(ci.io, error);
1da177e4
LT
1535}
1536/*-----------------------------------------------------------------
1537 * CRUD END
1538 *---------------------------------------------------------------*/
1539
f6fccb12
MB
1540static int dm_merge_bvec(struct request_queue *q,
1541 struct bvec_merge_data *bvm,
1542 struct bio_vec *biovec)
1543{
1544 struct mapped_device *md = q->queuedata;
83d5e5b0 1545 struct dm_table *map = dm_get_live_table_fast(md);
f6fccb12
MB
1546 struct dm_target *ti;
1547 sector_t max_sectors;
5037108a 1548 int max_size = 0;
f6fccb12
MB
1549
1550 if (unlikely(!map))
5037108a 1551 goto out;
f6fccb12
MB
1552
1553 ti = dm_table_find_target(map, bvm->bi_sector);
b01cd5ac 1554 if (!dm_target_is_valid(ti))
83d5e5b0 1555 goto out;
f6fccb12
MB
1556
1557 /*
1558 * Find maximum amount of I/O that won't need splitting
1559 */
56a67df7 1560 max_sectors = min(max_io_len(bvm->bi_sector, ti),
f6fccb12
MB
1561 (sector_t) BIO_MAX_SECTORS);
1562 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1563 if (max_size < 0)
1564 max_size = 0;
1565
1566 /*
1567 * merge_bvec_fn() returns the number of bytes
1568 * it can accept at this offset;
1569 * max_size is the precomputed maximal I/O size.
1570 */
1571 if (max_size && ti->type->merge)
1572 max_size = ti->type->merge(ti, bvm, biovec, max_size);
8cbeb67a
MP
1573 /*
1574 * If the target doesn't support the merge method and some of the devices
1575 * provided their own merge_bvec method (we know this by looking at
1576 * queue_max_hw_sectors), then we can't allow bios with multiple vector
1577 * entries. So always set max_size to 0, and the code below allows
1578 * just one page.
1579 */
1580 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1581
1582 max_size = 0;
f6fccb12 1583
5037108a 1584out:
83d5e5b0 1585 dm_put_live_table_fast(md);
f6fccb12
MB
1586 /*
1587 * Always allow an entire first page
1588 */
1589 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1590 max_size = biovec->bv_len;
1591
f6fccb12
MB
1592 return max_size;
1593}
1594
1da177e4
LT
1595/*
1596 * The request function that just remaps the bio built up by
1597 * dm_merge_bvec.
1598 */
5a7bbad2 1599static void _dm_request(struct request_queue *q, struct bio *bio)
1da177e4 1600{
12f03a49 1601 int rw = bio_data_dir(bio);
1da177e4 1602 struct mapped_device *md = q->queuedata;
c9959059 1603 int cpu;
83d5e5b0
MP
1604 int srcu_idx;
1605 struct dm_table *map;
1da177e4 1606
83d5e5b0 1607 map = dm_get_live_table(md, &srcu_idx);
1da177e4 1608
074a7aca
TH
1609 cpu = part_stat_lock();
1610 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1611 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1612 part_stat_unlock();
12f03a49 1613
6a8736d1
TH
1614 /* if we're suspended, we have to queue this io for later */
1615 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
83d5e5b0 1616 dm_put_live_table(md, srcu_idx);
1da177e4 1617
6a8736d1
TH
1618 if (bio_rw(bio) != READA)
1619 queue_io(md, bio);
1620 else
54d9a1b4 1621 bio_io_error(bio);
5a7bbad2 1622 return;
1da177e4
LT
1623 }
1624
83d5e5b0
MP
1625 __split_and_process_bio(md, map, bio);
1626 dm_put_live_table(md, srcu_idx);
5a7bbad2 1627 return;
cec47e3d
KU
1628}
1629
fd2ed4d2 1630int dm_request_based(struct mapped_device *md)
cec47e3d
KU
1631{
1632 return blk_queue_stackable(md->queue);
1633}
1634
5a7bbad2 1635static void dm_request(struct request_queue *q, struct bio *bio)
cec47e3d
KU
1636{
1637 struct mapped_device *md = q->queuedata;
1638
1639 if (dm_request_based(md))
5a7bbad2
CH
1640 blk_queue_bio(q, bio);
1641 else
1642 _dm_request(q, bio);
cec47e3d
KU
1643}
1644
1645void dm_dispatch_request(struct request *rq)
1646{
1647 int r;
1648
1649 if (blk_queue_io_stat(rq->q))
1650 rq->cmd_flags |= REQ_IO_STAT;
1651
1652 rq->start_time = jiffies;
1653 r = blk_insert_cloned_request(rq->q, rq);
1654 if (r)
1655 dm_complete_request(rq, r);
1656}
1657EXPORT_SYMBOL_GPL(dm_dispatch_request);
1658
cec47e3d
KU
1659static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1660 void *data)
1661{
1662 struct dm_rq_target_io *tio = data;
94818742
KO
1663 struct dm_rq_clone_bio_info *info =
1664 container_of(bio, struct dm_rq_clone_bio_info, clone);
cec47e3d
KU
1665
1666 info->orig = bio_orig;
1667 info->tio = tio;
1668 bio->bi_end_io = end_clone_bio;
1669 bio->bi_private = info;
cec47e3d
KU
1670
1671 return 0;
1672}
1673
1674static int setup_clone(struct request *clone, struct request *rq,
1675 struct dm_rq_target_io *tio)
1676{
d0bcb878 1677 int r;
cec47e3d 1678
29e4013d
TH
1679 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1680 dm_rq_bio_constructor, tio);
1681 if (r)
1682 return r;
cec47e3d 1683
29e4013d
TH
1684 clone->cmd = rq->cmd;
1685 clone->cmd_len = rq->cmd_len;
1686 clone->sense = rq->sense;
1687 clone->buffer = rq->buffer;
cec47e3d
KU
1688 clone->end_io = end_clone_request;
1689 clone->end_io_data = tio;
1690
1691 return 0;
1692}
1693
6facdaff
KU
1694static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1695 gfp_t gfp_mask)
1696{
1697 struct request *clone;
1698 struct dm_rq_target_io *tio;
1699
1700 tio = alloc_rq_tio(md, gfp_mask);
1701 if (!tio)
1702 return NULL;
1703
1704 tio->md = md;
1705 tio->ti = NULL;
1706 tio->orig = rq;
1707 tio->error = 0;
1708 memset(&tio->info, 0, sizeof(tio->info));
1709
1710 clone = &tio->clone;
1711 if (setup_clone(clone, rq, tio)) {
1712 /* -ENOMEM */
1713 free_rq_tio(tio);
1714 return NULL;
1715 }
1716
1717 return clone;
1718}
1719
cec47e3d
KU
1720/*
1721 * Called with the queue lock held.
1722 */
1723static int dm_prep_fn(struct request_queue *q, struct request *rq)
1724{
1725 struct mapped_device *md = q->queuedata;
cec47e3d
KU
1726 struct request *clone;
1727
cec47e3d
KU
1728 if (unlikely(rq->special)) {
1729 DMWARN("Already has something in rq->special.");
1730 return BLKPREP_KILL;
1731 }
1732
6facdaff
KU
1733 clone = clone_rq(rq, md, GFP_ATOMIC);
1734 if (!clone)
cec47e3d 1735 return BLKPREP_DEFER;
cec47e3d
KU
1736
1737 rq->special = clone;
1738 rq->cmd_flags |= REQ_DONTPREP;
1739
1740 return BLKPREP_OK;
1741}
1742
9eef87da
KU
1743/*
1744 * Returns:
1745 * 0 : the request has been processed (not requeued)
1746 * !0 : the request has been requeued
1747 */
1748static int map_request(struct dm_target *ti, struct request *clone,
1749 struct mapped_device *md)
cec47e3d 1750{
9eef87da 1751 int r, requeued = 0;
cec47e3d
KU
1752 struct dm_rq_target_io *tio = clone->end_io_data;
1753
cec47e3d
KU
1754 tio->ti = ti;
1755 r = ti->type->map_rq(ti, clone, &tio->info);
1756 switch (r) {
1757 case DM_MAPIO_SUBMITTED:
1758 /* The target has taken the I/O to submit by itself later */
1759 break;
1760 case DM_MAPIO_REMAPPED:
1761 /* The target has remapped the I/O so dispatch it */
6db4ccd6
JN
1762 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1763 blk_rq_pos(tio->orig));
cec47e3d
KU
1764 dm_dispatch_request(clone);
1765 break;
1766 case DM_MAPIO_REQUEUE:
1767 /* The target wants to requeue the I/O */
1768 dm_requeue_unmapped_request(clone);
9eef87da 1769 requeued = 1;
cec47e3d
KU
1770 break;
1771 default:
1772 if (r > 0) {
1773 DMWARN("unimplemented target map return value: %d", r);
1774 BUG();
1775 }
1776
1777 /* The target wants to complete the I/O */
1778 dm_kill_unmapped_request(clone, r);
1779 break;
1780 }
9eef87da
KU
1781
1782 return requeued;
cec47e3d
KU
1783}
1784
ba1cbad9
MS
1785static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
1786{
1787 struct request *clone;
1788
1789 blk_start_request(orig);
1790 clone = orig->special;
1791 atomic_inc(&md->pending[rq_data_dir(clone)]);
1792
1793 /*
1794 * Hold the md reference here for the in-flight I/O.
1795 * We can't rely on the reference count held by the device opener,
1796 * because the device may be closed during request completion,
1797 * when all bios are completed.
1798 * See the comment in rq_completed() too.
1799 */
1800 dm_get(md);
1801
1802 return clone;
1803}
1804
cec47e3d
KU
1805/*
1806 * q->request_fn for request-based dm.
1807 * Called with the queue lock held.
1808 */
1809static void dm_request_fn(struct request_queue *q)
1810{
1811 struct mapped_device *md = q->queuedata;
83d5e5b0
MP
1812 int srcu_idx;
1813 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
cec47e3d 1814 struct dm_target *ti;
b4324fee 1815 struct request *rq, *clone;
29e4013d 1816 sector_t pos;
cec47e3d
KU
1817
1818 /*
b4324fee
KU
1819 * For suspend, check blk_queue_stopped() and increment
1820 * ->pending within a single queue_lock so as not to increment the
1821 * number of in-flight I/Os after the queue is stopped in
1822 * dm_suspend().
cec47e3d 1823 */
7eaceacc 1824 while (!blk_queue_stopped(q)) {
cec47e3d
KU
1825 rq = blk_peek_request(q);
1826 if (!rq)
7eaceacc 1827 goto delay_and_out;
cec47e3d 1828
29e4013d
TH
1829 /* always use block 0 to find the target for flushes for now */
1830 pos = 0;
1831 if (!(rq->cmd_flags & REQ_FLUSH))
1832 pos = blk_rq_pos(rq);
1833
1834 ti = dm_table_find_target(map, pos);
ba1cbad9
MS
1835 if (!dm_target_is_valid(ti)) {
1836 /*
1837 * Must perform the setup that dm_done() requires
1838 * before calling dm_kill_unmapped_request().
1839 */
1840 DMERR_LIMIT("request attempted access beyond the end of device");
1841 clone = dm_start_request(md, rq);
1842 dm_kill_unmapped_request(clone, -EIO);
1843 continue;
1844 }
d0bcb878 1845
cec47e3d 1846 if (ti->type->busy && ti->type->busy(ti))
7eaceacc 1847 goto delay_and_out;
cec47e3d 1848
ba1cbad9 1849 clone = dm_start_request(md, rq);
b4324fee 1850
cec47e3d 1851 spin_unlock(q->queue_lock);
9eef87da
KU
1852 if (map_request(ti, clone, md))
1853 goto requeued;
1854
052189a2
KU
1855 BUG_ON(!irqs_disabled());
1856 spin_lock(q->queue_lock);
cec47e3d
KU
1857 }
1858
1859 goto out;
1860
9eef87da 1861requeued:
052189a2
KU
1862 BUG_ON(!irqs_disabled());
1863 spin_lock(q->queue_lock);
9eef87da 1864
7eaceacc
JA
1865delay_and_out:
1866 blk_delay_queue(q, HZ / 10);
cec47e3d 1867out:
83d5e5b0 1868 dm_put_live_table(md, srcu_idx);
cec47e3d
KU
1869}
1870
1871int dm_underlying_device_busy(struct request_queue *q)
1872{
1873 return blk_lld_busy(q);
1874}
1875EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1876
1877static int dm_lld_busy(struct request_queue *q)
1878{
1879 int r;
1880 struct mapped_device *md = q->queuedata;
83d5e5b0 1881 struct dm_table *map = dm_get_live_table_fast(md);
cec47e3d
KU
1882
1883 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1884 r = 1;
1885 else
1886 r = dm_table_any_busy_target(map);
1887
83d5e5b0 1888 dm_put_live_table_fast(md);
cec47e3d
KU
1889
1890 return r;
1891}
1892
1da177e4
LT
1893static int dm_any_congested(void *congested_data, int bdi_bits)
1894{
8a57dfc6
CS
1895 int r = bdi_bits;
1896 struct mapped_device *md = congested_data;
1897 struct dm_table *map;
1da177e4 1898
1eb787ec 1899 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
83d5e5b0 1900 map = dm_get_live_table_fast(md);
8a57dfc6 1901 if (map) {
cec47e3d
KU
1902 /*
1903 * Request-based dm cares about only own queue for
1904 * the query about congestion status of request_queue
1905 */
1906 if (dm_request_based(md))
1907 r = md->queue->backing_dev_info.state &
1908 bdi_bits;
1909 else
1910 r = dm_table_any_congested(map, bdi_bits);
8a57dfc6 1911 }
83d5e5b0 1912 dm_put_live_table_fast(md);
8a57dfc6
CS
1913 }
1914
1da177e4
LT
1915 return r;
1916}
1917
1918/*-----------------------------------------------------------------
1919 * An IDR is used to keep track of allocated minor numbers.
1920 *---------------------------------------------------------------*/
2b06cfff 1921static void free_minor(int minor)
1da177e4 1922{
f32c10b0 1923 spin_lock(&_minor_lock);
1da177e4 1924 idr_remove(&_minor_idr, minor);
f32c10b0 1925 spin_unlock(&_minor_lock);
1da177e4
LT
1926}
1927
1928/*
1929 * See if the device with a specific minor # is free.
1930 */
cf13ab8e 1931static int specific_minor(int minor)
1da177e4 1932{
c9d76be6 1933 int r;
1da177e4
LT
1934
1935 if (minor >= (1 << MINORBITS))
1936 return -EINVAL;
1937
c9d76be6 1938 idr_preload(GFP_KERNEL);
f32c10b0 1939 spin_lock(&_minor_lock);
1da177e4 1940
c9d76be6 1941 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1da177e4 1942
f32c10b0 1943 spin_unlock(&_minor_lock);
c9d76be6
TH
1944 idr_preload_end();
1945 if (r < 0)
1946 return r == -ENOSPC ? -EBUSY : r;
1947 return 0;
1da177e4
LT
1948}
1949
cf13ab8e 1950static int next_free_minor(int *minor)
1da177e4 1951{
c9d76be6 1952 int r;
62f75c2f 1953
c9d76be6 1954 idr_preload(GFP_KERNEL);
f32c10b0 1955 spin_lock(&_minor_lock);
1da177e4 1956
c9d76be6 1957 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1da177e4 1958
f32c10b0 1959 spin_unlock(&_minor_lock);
c9d76be6
TH
1960 idr_preload_end();
1961 if (r < 0)
1962 return r;
1963 *minor = r;
1964 return 0;
1da177e4
LT
1965}
1966
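
specific_minor() and next_free_minor() share the preload-then-allocate IDR idiom: preallocate with idr_preload() while sleeping is still allowed, then call idr_alloc() with GFP_NOWAIT under the spinlock. A minimal sketch of that idiom follows; example_idr, example_lock and the two functions are hypothetical stand-ins for _minor_idr and _minor_lock, not part of dm.c.

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(example_idr);		/* hypothetical, stands in for _minor_idr  */
static DEFINE_SPINLOCK(example_lock);	/* hypothetical, stands in for _minor_lock */

/* Allocate any free id in [0, max), storing 'ptr' against it. */
static int example_alloc_id(void *ptr, int max)
{
	int r;

	idr_preload(GFP_KERNEL);	/* may sleep: preallocate outside the lock */
	spin_lock(&example_lock);

	/* GFP_NOWAIT: only the preloaded memory may be used under the lock */
	r = idr_alloc(&example_idr, ptr, 0, max, GFP_NOWAIT);

	spin_unlock(&example_lock);
	idr_preload_end();

	return r;			/* >= 0: the allocated id, < 0: error */
}

static void example_free_id(int id)
{
	spin_lock(&example_lock);
	idr_remove(&example_idr, id);	/* mirrors free_minor() above */
	spin_unlock(&example_lock);
}
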
83d5cde4 1967static const struct block_device_operations dm_blk_dops;
1da177e4 1968
53d5914f
MP
1969static void dm_wq_work(struct work_struct *work);
1970
4a0b4ddf
MS
1971static void dm_init_md_queue(struct mapped_device *md)
1972{
1973 /*
1974 * Request-based dm devices cannot be stacked on top of bio-based dm
1975 * devices. The type of this dm device has not been decided yet.
1976 * The type is decided at the first table loading time.
1977 * To prevent problematic device stacking, clear the queue flag
1978 * for request stacking support until then.
1979 *
1980 * This queue is new, so no concurrency on the queue_flags.
1981 */
1982 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1983
1984 md->queue->queuedata = md;
1985 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1986 md->queue->backing_dev_info.congested_data = md;
1987 blk_queue_make_request(md->queue, dm_request);
1988 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
4a0b4ddf
MS
1989 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1990}
1991
1da177e4
LT
1992/*
1993 * Allocate and initialise a blank device with a given minor.
1994 */
2b06cfff 1995static struct mapped_device *alloc_dev(int minor)
1da177e4
LT
1996{
1997 int r;
cf13ab8e 1998 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
ba61fdd1 1999 void *old_md;
1da177e4
LT
2000
2001 if (!md) {
2002 DMWARN("unable to allocate device, out of memory.");
2003 return NULL;
2004 }
2005
10da4f79 2006 if (!try_module_get(THIS_MODULE))
6ed7ade8 2007 goto bad_module_get;
10da4f79 2008
1da177e4 2009 /* get a minor number for the dev */
2b06cfff 2010 if (minor == DM_ANY_MINOR)
cf13ab8e 2011 r = next_free_minor(&minor);
2b06cfff 2012 else
cf13ab8e 2013 r = specific_minor(minor);
1da177e4 2014 if (r < 0)
6ed7ade8 2015 goto bad_minor;
1da177e4 2016
83d5e5b0
MP
2017 r = init_srcu_struct(&md->io_barrier);
2018 if (r < 0)
2019 goto bad_io_barrier;
2020
a5664dad 2021 md->type = DM_TYPE_NONE;
e61290a4 2022 mutex_init(&md->suspend_lock);
a5664dad 2023 mutex_init(&md->type_lock);
022c2611 2024 spin_lock_init(&md->deferred_lock);
1da177e4 2025 atomic_set(&md->holders, 1);
5c6bd75d 2026 atomic_set(&md->open_count, 0);
1da177e4 2027 atomic_set(&md->event_nr, 0);
7a8c3d3b
MA
2028 atomic_set(&md->uevent_seq, 0);
2029 INIT_LIST_HEAD(&md->uevent_list);
2030 spin_lock_init(&md->uevent_lock);
1da177e4 2031
4a0b4ddf 2032 md->queue = blk_alloc_queue(GFP_KERNEL);
1da177e4 2033 if (!md->queue)
6ed7ade8 2034 goto bad_queue;
1da177e4 2035
4a0b4ddf 2036 dm_init_md_queue(md);
9faf400f 2037
1da177e4
LT
2038 md->disk = alloc_disk(1);
2039 if (!md->disk)
6ed7ade8 2040 goto bad_disk;
1da177e4 2041
316d315b
NK
2042 atomic_set(&md->pending[0], 0);
2043 atomic_set(&md->pending[1], 0);
f0b04115 2044 init_waitqueue_head(&md->wait);
53d5914f 2045 INIT_WORK(&md->work, dm_wq_work);
f0b04115 2046 init_waitqueue_head(&md->eventq);
be35f486 2047 init_completion(&md->kobj_completion);
f0b04115 2048
1da177e4
LT
2049 md->disk->major = _major;
2050 md->disk->first_minor = minor;
2051 md->disk->fops = &dm_blk_dops;
2052 md->disk->queue = md->queue;
2053 md->disk->private_data = md;
2054 sprintf(md->disk->disk_name, "dm-%d", minor);
2055 add_disk(md->disk);
7e51f257 2056 format_dev_t(md->name, MKDEV(_major, minor));
1da177e4 2057
670368a8 2058 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
304f3f6a
MB
2059 if (!md->wq)
2060 goto bad_thread;
2061
32a926da
MP
2062 md->bdev = bdget_disk(md->disk, 0);
2063 if (!md->bdev)
2064 goto bad_bdev;
2065
6a8736d1
TH
2066 bio_init(&md->flush_bio);
2067 md->flush_bio.bi_bdev = md->bdev;
2068 md->flush_bio.bi_rw = WRITE_FLUSH;
2069
fd2ed4d2
MP
2070 dm_stats_init(&md->stats);
2071
ba61fdd1 2072 /* Populate the mapping, nobody knows we exist yet */
f32c10b0 2073 spin_lock(&_minor_lock);
ba61fdd1 2074 old_md = idr_replace(&_minor_idr, md, minor);
f32c10b0 2075 spin_unlock(&_minor_lock);
ba61fdd1
JM
2076
2077 BUG_ON(old_md != MINOR_ALLOCED);
2078
1da177e4
LT
2079 return md;
2080
32a926da
MP
2081bad_bdev:
2082 destroy_workqueue(md->wq);
304f3f6a 2083bad_thread:
03022c54 2084 del_gendisk(md->disk);
304f3f6a 2085 put_disk(md->disk);
6ed7ade8 2086bad_disk:
1312f40e 2087 blk_cleanup_queue(md->queue);
6ed7ade8 2088bad_queue:
83d5e5b0
MP
2089 cleanup_srcu_struct(&md->io_barrier);
2090bad_io_barrier:
1da177e4 2091 free_minor(minor);
6ed7ade8 2092bad_minor:
10da4f79 2093 module_put(THIS_MODULE);
6ed7ade8 2094bad_module_get:
1da177e4
LT
2095 kfree(md);
2096 return NULL;
2097}
2098
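
alloc_dev() uses the kernel's conventional goto-unwind error handling: each failure path jumps to a label that releases everything acquired up to that point, in reverse order. A compact sketch of the same pattern with two hypothetical allocations (example_dev and example_alloc are illustrative names, not from dm.c):

#include <linux/slab.h>

struct example_dev {
	void *buf_a;
	void *buf_b;
};

static struct example_dev *example_alloc(void)
{
	struct example_dev *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return NULL;

	d->buf_a = kmalloc(64, GFP_KERNEL);
	if (!d->buf_a)
		goto bad_buf_a;

	d->buf_b = kmalloc(64, GFP_KERNEL);
	if (!d->buf_b)
		goto bad_buf_b;

	return d;

bad_buf_b:
	kfree(d->buf_a);	/* undo in reverse order of acquisition */
bad_buf_a:
	kfree(d);
	return NULL;
}
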
ae9da83f
JN
2099static void unlock_fs(struct mapped_device *md);
2100
1da177e4
LT
2101static void free_dev(struct mapped_device *md)
2102{
f331c029 2103 int minor = MINOR(disk_devt(md->disk));
63d94e48 2104
32a926da
MP
2105 unlock_fs(md);
2106 bdput(md->bdev);
304f3f6a 2107 destroy_workqueue(md->wq);
e6ee8c0b
KU
2108 if (md->io_pool)
2109 mempool_destroy(md->io_pool);
2110 if (md->bs)
2111 bioset_free(md->bs);
9c47008d 2112 blk_integrity_unregister(md->disk);
1da177e4 2113 del_gendisk(md->disk);
83d5e5b0 2114 cleanup_srcu_struct(&md->io_barrier);
63d94e48 2115 free_minor(minor);
fba9f90e
JM
2116
2117 spin_lock(&_minor_lock);
2118 md->disk->private_data = NULL;
2119 spin_unlock(&_minor_lock);
2120
1da177e4 2121 put_disk(md->disk);
1312f40e 2122 blk_cleanup_queue(md->queue);
fd2ed4d2 2123 dm_stats_cleanup(&md->stats);
10da4f79 2124 module_put(THIS_MODULE);
1da177e4
LT
2125 kfree(md);
2126}
2127
e6ee8c0b
KU
2128static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2129{
c0820cf5 2130 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
e6ee8c0b 2131
5f015204 2132 if (md->io_pool && md->bs) {
16245bdc
JN
 2133 /* The md already has the necessary mempools. */
2134 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2135 /*
2136 * Reload bioset because front_pad may have changed
2137 * because a different table was loaded.
2138 */
2139 bioset_free(md->bs);
2140 md->bs = p->bs;
2141 p->bs = NULL;
2142 } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
16245bdc
JN
2143 /*
2144 * There's no need to reload with request-based dm
2145 * because the size of front_pad doesn't change.
 2146 * Note for the future: if you ever reload the bioset,
 2147 * prep-ed requests in the queue may still refer
 2148 * to bios from the old bioset, so you must walk
 2149 * through the queue and unprep them.
2150 */
2151 }
e6ee8c0b 2152 goto out;
c0820cf5 2153 }
e6ee8c0b 2154
5f015204 2155 BUG_ON(!p || md->io_pool || md->bs);
e6ee8c0b
KU
2156
2157 md->io_pool = p->io_pool;
2158 p->io_pool = NULL;
e6ee8c0b
KU
2159 md->bs = p->bs;
2160 p->bs = NULL;
2161
2162out:
 2163 /* mempool bind completed, the table no longer needs any mempools */
2164 dm_table_free_md_mempools(t);
2165}
2166
1da177e4
LT
2167/*
2168 * Bind a table to the device.
2169 */
2170static void event_callback(void *context)
2171{
7a8c3d3b
MA
2172 unsigned long flags;
2173 LIST_HEAD(uevents);
1da177e4
LT
2174 struct mapped_device *md = (struct mapped_device *) context;
2175
7a8c3d3b
MA
2176 spin_lock_irqsave(&md->uevent_lock, flags);
2177 list_splice_init(&md->uevent_list, &uevents);
2178 spin_unlock_irqrestore(&md->uevent_lock, flags);
2179
ed9e1982 2180 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
7a8c3d3b 2181
1da177e4
LT
2182 atomic_inc(&md->event_nr);
2183 wake_up(&md->eventq);
2184}
2185
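
event_callback() drains md->uevent_list with the splice-under-lock idiom: move the whole list onto a private head while holding the spinlock, then walk it with the lock dropped. A small sketch of that idiom with a hypothetical item type and list (example_item, example_list and example_drain are illustrative names, not from dm.c):

#include <linux/list.h>
#include <linux/spinlock.h>

static LIST_HEAD(example_list);			/* hypothetical producer-filled list */
static DEFINE_SPINLOCK(example_list_lock);

struct example_item {
	struct list_head node;
};

static void example_drain(void (*handle)(struct example_item *))
{
	struct example_item *item, *tmp;
	unsigned long flags;
	LIST_HEAD(local);

	/* Move everything onto a private list while holding the lock... */
	spin_lock_irqsave(&example_list_lock, flags);
	list_splice_init(&example_list, &local);
	spin_unlock_irqrestore(&example_list_lock, flags);

	/* ...then process it without the lock. */
	list_for_each_entry_safe(item, tmp, &local, node)
		handle(item);
}
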
c217649b
MS
2186/*
2187 * Protected by md->suspend_lock obtained by dm_swap_table().
2188 */
4e90188b 2189static void __set_size(struct mapped_device *md, sector_t size)
1da177e4 2190{
4e90188b 2191 set_capacity(md->disk, size);
1da177e4 2192
db8fef4f 2193 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1da177e4
LT
2194}
2195
d5b9dd04
MP
2196/*
2197 * Return 1 if the queue has a compulsory merge_bvec_fn function.
2198 *
2199 * If this function returns 0, then the device is either a non-dm
2200 * device without a merge_bvec_fn, or it is a dm device that is
2201 * able to split any bios it receives that are too big.
2202 */
2203int dm_queue_merge_is_compulsory(struct request_queue *q)
2204{
2205 struct mapped_device *dev_md;
2206
2207 if (!q->merge_bvec_fn)
2208 return 0;
2209
2210 if (q->make_request_fn == dm_request) {
2211 dev_md = q->queuedata;
2212 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2213 return 0;
2214 }
2215
2216 return 1;
2217}
2218
2219static int dm_device_merge_is_compulsory(struct dm_target *ti,
2220 struct dm_dev *dev, sector_t start,
2221 sector_t len, void *data)
2222{
2223 struct block_device *bdev = dev->bdev;
2224 struct request_queue *q = bdev_get_queue(bdev);
2225
2226 return dm_queue_merge_is_compulsory(q);
2227}
2228
2229/*
2230 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2231 * on the properties of the underlying devices.
2232 */
2233static int dm_table_merge_is_optional(struct dm_table *table)
2234{
2235 unsigned i = 0;
2236 struct dm_target *ti;
2237
2238 while (i < dm_table_get_num_targets(table)) {
2239 ti = dm_table_get_target(table, i++);
2240
2241 if (ti->type->iterate_devices &&
2242 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2243 return 0;
2244 }
2245
2246 return 1;
2247}
2248
042d2a9b
AK
2249/*
2250 * Returns old map, which caller must destroy.
2251 */
2252static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2253 struct queue_limits *limits)
1da177e4 2254{
042d2a9b 2255 struct dm_table *old_map;
165125e1 2256 struct request_queue *q = md->queue;
1da177e4 2257 sector_t size;
d5b9dd04 2258 int merge_is_optional;
1da177e4
LT
2259
2260 size = dm_table_get_size(t);
3ac51e74
DW
2261
2262 /*
2263 * Wipe any geometry if the size of the table changed.
2264 */
fd2ed4d2 2265 if (size != dm_get_size(md))
3ac51e74
DW
2266 memset(&md->geometry, 0, sizeof(md->geometry));
2267
32a926da 2268 __set_size(md, size);
d5816876 2269
2ca3310e
AK
2270 dm_table_event_callback(t, event_callback, md);
2271
e6ee8c0b
KU
2272 /*
 2273 * The queue hasn't been stopped yet if the old table type wasn't
 2274 * request-based during suspension, so stop it now to prevent
 2275 * I/O from being mapped before resume.
 2276 * This must be done before setting the queue restrictions,
 2277 * because request-based dm may run immediately after they are set.
2278 */
2279 if (dm_table_request_based(t) && !blk_queue_stopped(q))
2280 stop_queue(q);
2281
2282 __bind_mempools(md, t);
2283
d5b9dd04
MP
2284 merge_is_optional = dm_table_merge_is_optional(t);
2285
042d2a9b 2286 old_map = md->map;
83d5e5b0 2287 rcu_assign_pointer(md->map, t);
36a0456f
AK
2288 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2289
754c5fc7 2290 dm_table_set_restrictions(t, q, limits);
d5b9dd04
MP
2291 if (merge_is_optional)
2292 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2293 else
2294 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
83d5e5b0 2295 dm_sync_table(md);
1da177e4 2296
042d2a9b 2297 return old_map;
1da177e4
LT
2298}
2299
a7940155
AK
2300/*
2301 * Returns unbound table for the caller to free.
2302 */
2303static struct dm_table *__unbind(struct mapped_device *md)
1da177e4
LT
2304{
2305 struct dm_table *map = md->map;
2306
2307 if (!map)
a7940155 2308 return NULL;
1da177e4
LT
2309
2310 dm_table_event_callback(map, NULL, NULL);
83d5e5b0
MP
2311 rcu_assign_pointer(md->map, NULL);
2312 dm_sync_table(md);
a7940155
AK
2313
2314 return map;
1da177e4
LT
2315}
2316
2317/*
2318 * Constructor for a new device.
2319 */
2b06cfff 2320int dm_create(int minor, struct mapped_device **result)
1da177e4
LT
2321{
2322 struct mapped_device *md;
2323
2b06cfff 2324 md = alloc_dev(minor);
1da177e4
LT
2325 if (!md)
2326 return -ENXIO;
2327
784aae73
MB
2328 dm_sysfs_init(md);
2329
1da177e4
LT
2330 *result = md;
2331 return 0;
2332}
2333
a5664dad
MS
2334/*
2335 * Functions to manage md->type.
2336 * All are required to hold md->type_lock.
2337 */
2338void dm_lock_md_type(struct mapped_device *md)
2339{
2340 mutex_lock(&md->type_lock);
2341}
2342
2343void dm_unlock_md_type(struct mapped_device *md)
2344{
2345 mutex_unlock(&md->type_lock);
2346}
2347
2348void dm_set_md_type(struct mapped_device *md, unsigned type)
2349{
00c4fc3b 2350 BUG_ON(!mutex_is_locked(&md->type_lock));
a5664dad
MS
2351 md->type = type;
2352}
2353
2354unsigned dm_get_md_type(struct mapped_device *md)
2355{
00c4fc3b 2356 BUG_ON(!mutex_is_locked(&md->type_lock));
a5664dad
MS
2357 return md->type;
2358}
2359
36a0456f
AK
2360struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2361{
2362 return md->immutable_target_type;
2363}
2364
f84cb8a4
MS
2365/*
2366 * The queue_limits are only valid as long as you have a reference
2367 * count on 'md'.
2368 */
2369struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2370{
2371 BUG_ON(!atomic_read(&md->holders));
2372 return &md->queue->limits;
2373}
2374EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2375
4a0b4ddf
MS
2376/*
2377 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2378 */
2379static int dm_init_request_based_queue(struct mapped_device *md)
2380{
2381 struct request_queue *q = NULL;
2382
2383 if (md->queue->elevator)
2384 return 1;
2385
2386 /* Fully initialize the queue */
2387 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2388 if (!q)
2389 return 0;
2390
2391 md->queue = q;
4a0b4ddf
MS
2392 dm_init_md_queue(md);
2393 blk_queue_softirq_done(md->queue, dm_softirq_done);
2394 blk_queue_prep_rq(md->queue, dm_prep_fn);
2395 blk_queue_lld_busy(md->queue, dm_lld_busy);
4a0b4ddf
MS
2396
2397 elv_register_queue(md->queue);
2398
2399 return 1;
2400}
2401
2402/*
2403 * Setup the DM device's queue based on md's type
2404 */
2405int dm_setup_md_queue(struct mapped_device *md)
2406{
2407 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2408 !dm_init_request_based_queue(md)) {
2409 DMWARN("Cannot initialize queue for request-based mapped device");
2410 return -EINVAL;
2411 }
2412
2413 return 0;
2414}
2415
637842cf 2416static struct mapped_device *dm_find_md(dev_t dev)
1da177e4
LT
2417{
2418 struct mapped_device *md;
1da177e4
LT
2419 unsigned minor = MINOR(dev);
2420
2421 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2422 return NULL;
2423
f32c10b0 2424 spin_lock(&_minor_lock);
1da177e4
LT
2425
2426 md = idr_find(&_minor_idr, minor);
fba9f90e 2427 if (md && (md == MINOR_ALLOCED ||
f331c029 2428 (MINOR(disk_devt(dm_disk(md))) != minor) ||
abdc568b 2429 dm_deleting_md(md) ||
17b2f66f 2430 test_bit(DMF_FREEING, &md->flags))) {
637842cf 2431 md = NULL;
fba9f90e
JM
2432 goto out;
2433 }
1da177e4 2434
fba9f90e 2435out:
f32c10b0 2436 spin_unlock(&_minor_lock);
1da177e4 2437
637842cf
DT
2438 return md;
2439}
2440
d229a958
DT
2441struct mapped_device *dm_get_md(dev_t dev)
2442{
2443 struct mapped_device *md = dm_find_md(dev);
2444
2445 if (md)
2446 dm_get(md);
2447
2448 return md;
2449}
3cf2e4ba 2450EXPORT_SYMBOL_GPL(dm_get_md);
d229a958 2451
9ade92a9 2452void *dm_get_mdptr(struct mapped_device *md)
637842cf 2453{
9ade92a9 2454 return md->interface_ptr;
1da177e4
LT
2455}
2456
2457void dm_set_mdptr(struct mapped_device *md, void *ptr)
2458{
2459 md->interface_ptr = ptr;
2460}
2461
2462void dm_get(struct mapped_device *md)
2463{
2464 atomic_inc(&md->holders);
3f77316d 2465 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1da177e4
LT
2466}
2467
72d94861
AK
2468const char *dm_device_name(struct mapped_device *md)
2469{
2470 return md->name;
2471}
2472EXPORT_SYMBOL_GPL(dm_device_name);
2473
3f77316d 2474static void __dm_destroy(struct mapped_device *md, bool wait)
1da177e4 2475{
1134e5ae 2476 struct dm_table *map;
83d5e5b0 2477 int srcu_idx;
1da177e4 2478
3f77316d 2479 might_sleep();
fba9f90e 2480
3f77316d 2481 spin_lock(&_minor_lock);
83d5e5b0 2482 map = dm_get_live_table(md, &srcu_idx);
3f77316d
KU
2483 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2484 set_bit(DMF_FREEING, &md->flags);
2485 spin_unlock(&_minor_lock);
2486
2487 if (!dm_suspended_md(md)) {
2488 dm_table_presuspend_targets(map);
2489 dm_table_postsuspend_targets(map);
1da177e4 2490 }
3f77316d 2491
83d5e5b0
MP
2492 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2493 dm_put_live_table(md, srcu_idx);
2494
3f77316d
KU
2495 /*
 2496 * Rare, but there may still be I/O requests completing,
 2497 * for example. Wait for all references to disappear.
 2498 * No one should increment the reference count of the mapped_device
 2499 * after the mapped_device state becomes DMF_FREEING.
2500 */
2501 if (wait)
2502 while (atomic_read(&md->holders))
2503 msleep(1);
2504 else if (atomic_read(&md->holders))
2505 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2506 dm_device_name(md), atomic_read(&md->holders));
2507
2508 dm_sysfs_exit(md);
3f77316d
KU
2509 dm_table_destroy(__unbind(md));
2510 free_dev(md);
2511}
2512
2513void dm_destroy(struct mapped_device *md)
2514{
2515 __dm_destroy(md, true);
2516}
2517
2518void dm_destroy_immediate(struct mapped_device *md)
2519{
2520 __dm_destroy(md, false);
2521}
2522
2523void dm_put(struct mapped_device *md)
2524{
2525 atomic_dec(&md->holders);
1da177e4 2526}
79eb885c 2527EXPORT_SYMBOL_GPL(dm_put);
1da177e4 2528
401600df 2529static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
46125c1c
MB
2530{
2531 int r = 0;
b44ebeb0
MP
2532 DECLARE_WAITQUEUE(wait, current);
2533
b44ebeb0 2534 add_wait_queue(&md->wait, &wait);
46125c1c
MB
2535
2536 while (1) {
401600df 2537 set_current_state(interruptible);
46125c1c 2538
b4324fee 2539 if (!md_in_flight(md))
46125c1c
MB
2540 break;
2541
401600df
MP
2542 if (interruptible == TASK_INTERRUPTIBLE &&
2543 signal_pending(current)) {
46125c1c
MB
2544 r = -EINTR;
2545 break;
2546 }
2547
2548 io_schedule();
2549 }
2550 set_current_state(TASK_RUNNING);
2551
b44ebeb0
MP
2552 remove_wait_queue(&md->wait, &wait);
2553
46125c1c
MB
2554 return r;
2555}
2556
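
dm_wait_for_completion() open-codes the classic add_wait_queue()/set_current_state()/io_schedule() sleep loop. A minimal sketch of the same loop, waiting for a hypothetical atomic counter to reach zero (example_wait_for_drain and its parameters are illustrative, not from dm.c):

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

/* Sleep until 'pending' drops to zero; interruptible by signals. */
static int example_wait_for_drain(wait_queue_head_t *wq, atomic_t *pending)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(wq, &wait);
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!atomic_read(pending))
			break;
		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}
		io_schedule();		/* account the sleep as I/O wait */
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(wq, &wait);

	return r;
}
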
1da177e4
LT
2557/*
2558 * Process the deferred bios
2559 */
ef208587 2560static void dm_wq_work(struct work_struct *work)
1da177e4 2561{
ef208587
MP
2562 struct mapped_device *md = container_of(work, struct mapped_device,
2563 work);
6d6f10df 2564 struct bio *c;
83d5e5b0
MP
2565 int srcu_idx;
2566 struct dm_table *map;
1da177e4 2567
83d5e5b0 2568 map = dm_get_live_table(md, &srcu_idx);
ef208587 2569
3b00b203 2570 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
df12ee99
AK
2571 spin_lock_irq(&md->deferred_lock);
2572 c = bio_list_pop(&md->deferred);
2573 spin_unlock_irq(&md->deferred_lock);
2574
6a8736d1 2575 if (!c)
df12ee99 2576 break;
022c2611 2577
e6ee8c0b
KU
2578 if (dm_request_based(md))
2579 generic_make_request(c);
6a8736d1 2580 else
83d5e5b0 2581 __split_and_process_bio(md, map, c);
022c2611 2582 }
73d410c0 2583
83d5e5b0 2584 dm_put_live_table(md, srcu_idx);
1da177e4
LT
2585}
2586
9a1fb464 2587static void dm_queue_flush(struct mapped_device *md)
304f3f6a 2588{
3b00b203
MP
2589 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2590 smp_mb__after_clear_bit();
53d5914f 2591 queue_work(md->wq, &md->work);
304f3f6a
MB
2592}
2593
1da177e4 2594/*
042d2a9b 2595 * Swap in a new table, returning the old one for the caller to destroy.
1da177e4 2596 */
042d2a9b 2597struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
1da177e4 2598{
87eb5b21 2599 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
754c5fc7 2600 struct queue_limits limits;
042d2a9b 2601 int r;
1da177e4 2602
e61290a4 2603 mutex_lock(&md->suspend_lock);
1da177e4
LT
2604
2605 /* device must be suspended */
4f186f8b 2606 if (!dm_suspended_md(md))
93c534ae 2607 goto out;
1da177e4 2608
3ae70656
MS
2609 /*
2610 * If the new table has no data devices, retain the existing limits.
 2611 * This helps multipath with queue_if_no_path: if all paths disappear,
 2612 * new I/O is queued based on these limits until some paths
 2613 * reappear.
2614 */
2615 if (dm_table_has_no_data_devices(table)) {
83d5e5b0 2616 live_map = dm_get_live_table_fast(md);
3ae70656
MS
2617 if (live_map)
2618 limits = md->queue->limits;
83d5e5b0 2619 dm_put_live_table_fast(md);
3ae70656
MS
2620 }
2621
87eb5b21
MC
2622 if (!live_map) {
2623 r = dm_calculate_queue_limits(table, &limits);
2624 if (r) {
2625 map = ERR_PTR(r);
2626 goto out;
2627 }
042d2a9b 2628 }
754c5fc7 2629
042d2a9b 2630 map = __bind(md, table, &limits);
1da177e4 2631
93c534ae 2632out:
e61290a4 2633 mutex_unlock(&md->suspend_lock);
042d2a9b 2634 return map;
1da177e4
LT
2635}
2636
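
dm_swap_table() requires the device to already be suspended and hands the old map back (or an ERR_PTR) for the caller to destroy. A hedged sketch of the calling sequence a user of these exports might follow; the wrapper function is hypothetical, while dm_suspend(), dm_resume(), dm_table_destroy() and DM_SUSPEND_LOCKFS_FLAG are the existing in-tree helpers, assuming an in-tree build that can include drivers/md/dm.h:

#include <linux/device-mapper.h>
#include <linux/err.h>
#include "dm.h"		/* DM_SUSPEND_LOCKFS_FLAG, assuming in-tree context */

static int example_replace_table(struct mapped_device *md, struct dm_table *t)
{
	struct dm_table *old;
	int r;

	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);	/* flush and quiesce first */
	if (r)
		return r;

	old = dm_swap_table(md, t);			/* old map or ERR_PTR */
	if (IS_ERR(old)) {
		dm_resume(md);
		return PTR_ERR(old);
	}

	if (old)
		dm_table_destroy(old);			/* caller owns the old map */

	return dm_resume(md);
}
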
2637/*
2638 * Functions to lock and unlock any filesystem running on the
2639 * device.
2640 */
2ca3310e 2641static int lock_fs(struct mapped_device *md)
1da177e4 2642{
e39e2e95 2643 int r;
1da177e4
LT
2644
2645 WARN_ON(md->frozen_sb);
dfbe03f6 2646
db8fef4f 2647 md->frozen_sb = freeze_bdev(md->bdev);
dfbe03f6 2648 if (IS_ERR(md->frozen_sb)) {
cf222b37 2649 r = PTR_ERR(md->frozen_sb);
e39e2e95
AK
2650 md->frozen_sb = NULL;
2651 return r;
dfbe03f6
AK
2652 }
2653
aa8d7c2f
AK
2654 set_bit(DMF_FROZEN, &md->flags);
2655
1da177e4
LT
2656 return 0;
2657}
2658
2ca3310e 2659static void unlock_fs(struct mapped_device *md)
1da177e4 2660{
aa8d7c2f
AK
2661 if (!test_bit(DMF_FROZEN, &md->flags))
2662 return;
2663
db8fef4f 2664 thaw_bdev(md->bdev, md->frozen_sb);
1da177e4 2665 md->frozen_sb = NULL;
aa8d7c2f 2666 clear_bit(DMF_FROZEN, &md->flags);
1da177e4
LT
2667}
2668
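
lock_fs() and unlock_fs() are thin wrappers around the block layer's filesystem freeze API: freeze_bdev() returns the frozen superblock (or an ERR_PTR), and thaw_bdev() needs that superblock handed back. A minimal sketch of that API, with a hypothetical callback run while the filesystem is frozen (example_with_frozen_fs and work() are illustrative, not from dm.c):

#include <linux/err.h>
#include <linux/fs.h>		/* freeze_bdev(), thaw_bdev() */

/* Freeze whatever filesystem sits on bdev, run 'work', then thaw it again. */
static int example_with_frozen_fs(struct block_device *bdev,
				  void (*work)(struct block_device *))
{
	struct super_block *sb = freeze_bdev(bdev);

	if (IS_ERR(sb))
		return PTR_ERR(sb);

	work(bdev);			/* fs writes are flushed and held off here */

	return thaw_bdev(bdev, sb);	/* sb may be NULL if nothing was mounted */
}
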
2669/*
2670 * We need to be able to change a mapping table under a mounted
2671 * filesystem. For example we might want to move some data in
2672 * the background. Before the table can be swapped with
2673 * dm_bind_table, dm_suspend must be called to flush any in
2674 * flight bios and ensure that any further io gets deferred.
2675 */
cec47e3d
KU
2676/*
2677 * Suspend mechanism in request-based dm.
2678 *
9f518b27
KU
2679 * 1. Flush all I/Os by lock_fs() if needed.
2680 * 2. Stop dispatching any I/O by stopping the request_queue.
2681 * 3. Wait for all in-flight I/Os to be completed or requeued.
cec47e3d 2682 *
9f518b27 2683 * To abort suspend, start the request_queue.
cec47e3d 2684 */
a3d77d35 2685int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1da177e4 2686{
2ca3310e 2687 struct dm_table *map = NULL;
46125c1c 2688 int r = 0;
a3d77d35 2689 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2e93ccc1 2690 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
1da177e4 2691
e61290a4 2692 mutex_lock(&md->suspend_lock);
2ca3310e 2693
4f186f8b 2694 if (dm_suspended_md(md)) {
73d410c0 2695 r = -EINVAL;
d287483d 2696 goto out_unlock;
73d410c0 2697 }
1da177e4 2698
83d5e5b0 2699 map = md->map;
1da177e4 2700
2e93ccc1
KU
2701 /*
2702 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2703 * This flag is cleared before dm_suspend returns.
2704 */
2705 if (noflush)
2706 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2707
cf222b37
AK
2708 /* This does not get reverted if there's an error later. */
2709 dm_table_presuspend_targets(map);
2710
32a926da 2711 /*
9f518b27
KU
2712 * Flush I/O to the device.
2713 * Any I/O submitted after lock_fs() may not be flushed.
2714 * noflush takes precedence over do_lockfs.
2715 * (lock_fs() flushes I/Os and waits for them to complete.)
32a926da
MP
2716 */
2717 if (!noflush && do_lockfs) {
2718 r = lock_fs(md);
2719 if (r)
83d5e5b0 2720 goto out_unlock;
aa8d7c2f 2721 }
1da177e4
LT
2722
2723 /*
3b00b203
MP
2724 * Here we must make sure that no processes are submitting requests
2725 * to target drivers i.e. no one may be executing
2726 * __split_and_process_bio. This is called from dm_request and
2727 * dm_wq_work.
2728 *
2729 * To get all processes out of __split_and_process_bio in dm_request,
2730 * we take the write lock. To prevent any process from reentering
6a8736d1
TH
2731 * __split_and_process_bio from dm_request and quiesce the thread
 2732 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2733 * flush_workqueue(md->wq).
1da177e4 2734 */
1eb787ec 2735 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
83d5e5b0 2736 synchronize_srcu(&md->io_barrier);
1da177e4 2737
d0bcb878 2738 /*
29e4013d
TH
2739 * Stop md->queue before flushing md->wq in case request-based
2740 * dm defers requests to md->wq from md->queue.
d0bcb878 2741 */
cec47e3d 2742 if (dm_request_based(md))
9f518b27 2743 stop_queue(md->queue);
cec47e3d 2744
d0bcb878
KU
2745 flush_workqueue(md->wq);
2746
1da177e4 2747 /*
3b00b203
MP
2748 * At this point no more requests are entering target request routines.
2749 * We call dm_wait_for_completion to wait for all existing requests
2750 * to finish.
1da177e4 2751 */
401600df 2752 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
1da177e4 2753
6d6f10df 2754 if (noflush)
022c2611 2755 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
83d5e5b0 2756 synchronize_srcu(&md->io_barrier);
2e93ccc1 2757
1da177e4 2758 /* were we interrupted ? */
46125c1c 2759 if (r < 0) {
9a1fb464 2760 dm_queue_flush(md);
73d410c0 2761
cec47e3d 2762 if (dm_request_based(md))
9f518b27 2763 start_queue(md->queue);
cec47e3d 2764
2ca3310e 2765 unlock_fs(md);
83d5e5b0 2766 goto out_unlock; /* pushback list is already flushed, so skip flush */
2ca3310e 2767 }
1da177e4 2768
3b00b203
MP
2769 /*
2770 * If dm_wait_for_completion returned 0, the device is completely
2771 * quiescent now. There is no request-processing activity. All new
2772 * requests are being added to md->deferred list.
2773 */
2774
2ca3310e 2775 set_bit(DMF_SUSPENDED, &md->flags);
b84b0287 2776
4d4471cb
KU
2777 dm_table_postsuspend_targets(map);
2778
d287483d 2779out_unlock:
e61290a4 2780 mutex_unlock(&md->suspend_lock);
cf222b37 2781 return r;
1da177e4
LT
2782}
2783
2784int dm_resume(struct mapped_device *md)
2785{
cf222b37 2786 int r = -EINVAL;
cf222b37 2787 struct dm_table *map = NULL;
1da177e4 2788
e61290a4 2789 mutex_lock(&md->suspend_lock);
4f186f8b 2790 if (!dm_suspended_md(md))
cf222b37 2791 goto out;
cf222b37 2792
83d5e5b0 2793 map = md->map;
2ca3310e 2794 if (!map || !dm_table_get_size(map))
cf222b37 2795 goto out;
1da177e4 2796
8757b776
MB
2797 r = dm_table_resume_targets(map);
2798 if (r)
2799 goto out;
2ca3310e 2800
9a1fb464 2801 dm_queue_flush(md);
2ca3310e 2802
cec47e3d
KU
2803 /*
2804 * Flushing deferred I/Os must be done after targets are resumed
2805 * so that mapping of targets can work correctly.
2806 * Request-based dm is queueing the deferred I/Os in its request_queue.
2807 */
2808 if (dm_request_based(md))
2809 start_queue(md->queue);
2810
2ca3310e
AK
2811 unlock_fs(md);
2812
2813 clear_bit(DMF_SUSPENDED, &md->flags);
2814
cf222b37
AK
2815 r = 0;
2816out:
e61290a4 2817 mutex_unlock(&md->suspend_lock);
2ca3310e 2818
cf222b37 2819 return r;
1da177e4
LT
2820}
2821
fd2ed4d2
MP
2822/*
2823 * Internal suspend/resume works like userspace-driven suspend. It waits
2824 * until all bios finish and prevents issuing new bios to the target drivers.
2825 * It may be used only from the kernel.
2826 *
2827 * Internal suspend holds md->suspend_lock, which prevents interaction with
2828 * userspace-driven suspend.
2829 */
2830
2831void dm_internal_suspend(struct mapped_device *md)
2832{
2833 mutex_lock(&md->suspend_lock);
2834 if (dm_suspended_md(md))
2835 return;
2836
2837 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2838 synchronize_srcu(&md->io_barrier);
2839 flush_workqueue(md->wq);
2840 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2841}
2842
2843void dm_internal_resume(struct mapped_device *md)
2844{
2845 if (dm_suspended_md(md))
2846 goto done;
2847
2848 dm_queue_flush(md);
2849
2850done:
2851 mutex_unlock(&md->suspend_lock);
2852}
2853
1da177e4
LT
2854/*-----------------------------------------------------------------
2855 * Event notification.
2856 *---------------------------------------------------------------*/
3abf85b5 2857int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
60935eb2 2858 unsigned cookie)
69267a30 2859{
60935eb2
MB
2860 char udev_cookie[DM_COOKIE_LENGTH];
2861 char *envp[] = { udev_cookie, NULL };
2862
2863 if (!cookie)
3abf85b5 2864 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
60935eb2
MB
2865 else {
2866 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2867 DM_COOKIE_ENV_VAR_NAME, cookie);
3abf85b5
PR
2868 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2869 action, envp);
60935eb2 2870 }
69267a30
AK
2871}
2872
7a8c3d3b
MA
2873uint32_t dm_next_uevent_seq(struct mapped_device *md)
2874{
2875 return atomic_add_return(1, &md->uevent_seq);
2876}
2877
1da177e4
LT
2878uint32_t dm_get_event_nr(struct mapped_device *md)
2879{
2880 return atomic_read(&md->event_nr);
2881}
2882
2883int dm_wait_event(struct mapped_device *md, int event_nr)
2884{
2885 return wait_event_interruptible(md->eventq,
2886 (event_nr != atomic_read(&md->event_nr)));
2887}
2888
7a8c3d3b
MA
2889void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2890{
2891 unsigned long flags;
2892
2893 spin_lock_irqsave(&md->uevent_lock, flags);
2894 list_add(elist, &md->uevent_list);
2895 spin_unlock_irqrestore(&md->uevent_lock, flags);
2896}
2897
1da177e4
LT
2898/*
2899 * The gendisk is only valid as long as you have a reference
2900 * count on 'md'.
2901 */
2902struct gendisk *dm_disk(struct mapped_device *md)
2903{
2904 return md->disk;
2905}
2906
784aae73
MB
2907struct kobject *dm_kobject(struct mapped_device *md)
2908{
2909 return &md->kobj;
2910}
2911
784aae73
MB
2912struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2913{
2914 struct mapped_device *md;
2915
2916 md = container_of(kobj, struct mapped_device, kobj);
784aae73 2917
4d89b7b4 2918 if (test_bit(DMF_FREEING, &md->flags) ||
432a212c 2919 dm_deleting_md(md))
4d89b7b4
MB
2920 return NULL;
2921
784aae73
MB
2922 dm_get(md);
2923 return md;
2924}
2925
be35f486
MP
2926struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
2927{
2928 struct mapped_device *md = container_of(kobj, struct mapped_device, kobj);
2929
2930 return &md->kobj_completion;
2931}
2932
4f186f8b 2933int dm_suspended_md(struct mapped_device *md)
1da177e4
LT
2934{
2935 return test_bit(DMF_SUSPENDED, &md->flags);
2936}
2937
2c140a24
MP
2938int dm_test_deferred_remove_flag(struct mapped_device *md)
2939{
2940 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2941}
2942
64dbce58
KU
2943int dm_suspended(struct dm_target *ti)
2944{
ecdb2e25 2945 return dm_suspended_md(dm_table_get_md(ti->table));
64dbce58
KU
2946}
2947EXPORT_SYMBOL_GPL(dm_suspended);
2948
2e93ccc1
KU
2949int dm_noflush_suspending(struct dm_target *ti)
2950{
ecdb2e25 2951 return __noflush_suspending(dm_table_get_md(ti->table));
2e93ccc1
KU
2952}
2953EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2954
c0820cf5 2955struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
e6ee8c0b 2956{
5f015204
JN
2957 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
2958 struct kmem_cache *cachep;
2959 unsigned int pool_size;
2960 unsigned int front_pad;
e6ee8c0b
KU
2961
2962 if (!pools)
2963 return NULL;
2964
23e5083b 2965 if (type == DM_TYPE_BIO_BASED) {
5f015204 2966 cachep = _io_cache;
e8603136 2967 pool_size = dm_get_reserved_bio_based_ios();
5f015204
JN
2968 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2969 } else if (type == DM_TYPE_REQUEST_BASED) {
2970 cachep = _rq_tio_cache;
f4790826 2971 pool_size = dm_get_reserved_rq_based_ios();
5f015204
JN
2972 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2973 /* per_bio_data_size is not used. See __bind_mempools(). */
2974 WARN_ON(per_bio_data_size != 0);
2975 } else
2976 goto out;
e6ee8c0b 2977
6cfa5857 2978 pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
5f015204
JN
2979 if (!pools->io_pool)
2980 goto out;
e6ee8c0b 2981
5f015204 2982 pools->bs = bioset_create(pool_size, front_pad);
e6ee8c0b 2983 if (!pools->bs)
5f015204 2984 goto out;
e6ee8c0b 2985
a91a2785 2986 if (integrity && bioset_integrity_create(pools->bs, pool_size))
5f015204 2987 goto out;
a91a2785 2988
e6ee8c0b
KU
2989 return pools;
2990
5f015204
JN
2991out:
2992 dm_free_md_mempools(pools);
e6ee8c0b
KU
2993
2994 return NULL;
2995}
2996
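
The front_pad values computed above (offsetof of the embedded clone bio) rely on the bioset embedding trick: per-bio bookkeeping lives in the bytes that bio_alloc_bioset() reserves in front of the returned struct bio, and container_of() walks back to the enclosing wrapper. A minimal sketch of that pattern with a hypothetical wrapper struct (example_bio_info, example_bs and the helpers are illustrative, not dm's own types):

#include <linux/bio.h>
#include <linux/kernel.h>	/* container_of(), offsetof() */

/* Hypothetical per-bio wrapper; the embedded bio must come last. */
struct example_bio_info {
	void *private;
	struct bio bio;
};

static struct bio_set *example_bs;

static int example_init(void)
{
	/* front_pad = bytes that precede the embedded bio in the wrapper */
	example_bs = bioset_create(256, offsetof(struct example_bio_info, bio));
	return example_bs ? 0 : -ENOMEM;
}

static struct example_bio_info *example_alloc(void)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 0, example_bs);

	if (!bio)
		return NULL;
	/* Walk back from the embedded bio to the enclosing wrapper. */
	return container_of(bio, struct example_bio_info, bio);
}
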
2997void dm_free_md_mempools(struct dm_md_mempools *pools)
2998{
2999 if (!pools)
3000 return;
3001
3002 if (pools->io_pool)
3003 mempool_destroy(pools->io_pool);
3004
e6ee8c0b
KU
3005 if (pools->bs)
3006 bioset_free(pools->bs);
3007
3008 kfree(pools);
3009}
3010
83d5cde4 3011static const struct block_device_operations dm_blk_dops = {
1da177e4
LT
3012 .open = dm_blk_open,
3013 .release = dm_blk_close,
aa129a22 3014 .ioctl = dm_blk_ioctl,
3ac51e74 3015 .getgeo = dm_blk_getgeo,
1da177e4
LT
3016 .owner = THIS_MODULE
3017};
3018
3019EXPORT_SYMBOL(dm_get_mapinfo);
3020
3021/*
3022 * module hooks
3023 */
3024module_init(dm_init);
3025module_exit(dm_exit);
3026
3027module_param(major, uint, 0);
3028MODULE_PARM_DESC(major, "The major number of the device mapper");
f4790826 3029
e8603136
MS
3030module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3031MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3032
f4790826
MS
3033module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
3034MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
3035
1da177e4
LT
3036MODULE_DESCRIPTION(DM_NAME " driver");
3037MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3038MODULE_LICENSE("GPL");