/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1
struct btrfs_raid_bio {
        struct btrfs_fs_info *fs_info;
        struct btrfs_bio *bbio;

        /*
         * logical block numbers for the start of each stripe
         * The last one or two are p/q.  These are sorted,
         * so raid_map[0] is the start of our full stripe
         */
        u64 *raid_map;

        /* while we're doing rmw on a stripe
         * we put it into a hash table so we can
         * lock the stripe and merge more rbios
         * into it.
         */
        struct list_head hash_list;

        /*
         * for scheduling work in the helper threads
         */
        struct btrfs_work work;

        /*
         * bio list and bio_list_lock are used
         * to add more bios into the stripe
         * in hopes of avoiding the full rmw
         */
        struct bio_list bio_list;
        spinlock_t bio_list_lock;

        /*
         * also protected by the bio_list_lock, the
         * stripe locking code uses plug_list to hand off
         * the stripe lock to the next pending IO
         */
        struct list_head plug_list;

        /*
         * flags that tell us if it is safe to
         * merge with this bio
         */
        unsigned long flags;

        /* size of each individual stripe on disk */
        int stripe_len;

        /* number of data stripes (no p/q) */
        int nr_data;

        /*
         * set if we're doing a parity rebuild
         * for a read from higher up, which is handled
         * differently from a parity rebuild as part of
         * rmw
         */
        int read_rebuild;

        /* first bad stripe */
        int faila;

        /* second bad stripe (for raid6 use) */
        int failb;

        /*
         * number of pages needed to represent the full
         * stripe
         */
        int nr_pages;

        /*
         * size of all the bios in the bio_list.  This
         * helps us decide if the rbio maps to a full
         * stripe or not
         */
        int bio_list_bytes;

        atomic_t refs;

        /*
         * these are two arrays of pointers.  We allocate the
         * rbio big enough to hold them both and setup their
         * locations when the rbio is allocated
         */

        /* pointers to pages that we allocated for
         * reading/writing stripes directly from the disk (including P/Q)
         */
        struct page **stripe_pages;

        /*
         * pointers to the pages in the bio_list.  Stored
         * here for faster lookup
         */
        struct page **bio_pages;
};
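/*
 * Layout note: alloc_rbio() below makes one allocation that holds the
 * struct itself followed by both pointer arrays, so stripe_pages and
 * bio_pages point just past the end of the rbio.  stripe_pages is
 * indexed stripe by stripe: all pages of data stripe 0, then data
 * stripe 1, and so on, with P (and Q for raid6) at the tail.
 */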
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
        struct btrfs_stripe_hash_table *table;
        struct btrfs_stripe_hash_table *x;
        struct btrfs_stripe_hash *cur;
        struct btrfs_stripe_hash *h;
        int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
        int i;

        if (info->stripe_hash_table)
                return 0;

        table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
        if (!table)
                return -ENOMEM;

        table->table = (void *)(table + 1);
        h = table->table;

        for (i = 0; i < num_entries; i++) {
                cur = h + i;
                INIT_LIST_HEAD(&cur->hash_list);
                spin_lock_init(&cur->lock);
                init_waitqueue_head(&cur->wait);
        }

        /* if someone raced in and installed a table first, use theirs */
        x = cmpxchg(&info->stripe_hash_table, NULL, table);
        if (x)
                kfree(x);
        return 0;
}
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
        u64 num = rbio->raid_map[0];

        /*
         * we shift down quite a bit.  We're using byte
         * addressing, and most of the lower bits are zeros.
         * This tends to upset hash_64, and it consistently
         * returns just one or two different values.
         *
         * shifting off the lower bits fixes things.
         */
        return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
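/*
 * Example, assuming 64K stripes (illustrative value): every raid_map[0]
 * is then 64K aligned, so the low 16 bits are always zero and feeding
 * the raw byte address to hash_64 lands almost everything in a handful
 * of buckets.  Shifting by 16 first hands it the stripe number instead,
 * which spreads entries across the whole table.
 */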
/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
                       struct btrfs_raid_bio *victim)
{
        bio_list_merge(&dest->bio_list, &victim->bio_list);
        dest->bio_list_bytes += victim->bio_list_bytes;
        bio_list_init(&victim->bio_list);
}
/*
 * free the hash table used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
        if (!info->stripe_hash_table)
                return;
        kfree(info->stripe_hash_table);
        info->stripe_hash_table = NULL;
}
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
        int src_off = 0;
        int xor_src_cnt = 0;
        void *dest = pages[src_cnt];

        while (src_cnt > 0) {
                xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
                xor_blocks(xor_src_cnt, len, dest, pages + src_off);

                src_cnt -= xor_src_cnt;
                src_off += xor_src_cnt;
        }
}
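/*
 * Example: MAX_XOR_BLOCKS is 4, so xoring 6 sources into pages[6] takes
 * two trips through the loop above: xor_blocks(4, ...) covers pages 0-3,
 * then xor_blocks(2, ...) covers pages 4-5, both accumulating into dest.
 */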
/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
        unsigned long size = rbio->bio_list_bytes;
        int ret = 1;

        if (size != rbio->nr_data * rbio->stripe_len)
                ret = 0;

        BUG_ON(size > rbio->nr_data * rbio->stripe_len);
        return ret;
}
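/*
 * Example, assuming two data stripes of 64K each (illustrative values):
 * the rbio is full only when bio_list_bytes is exactly 128K, meaning
 * the queued writes cover every data byte and parity can be computed
 * without reading anything back from disk.
 */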
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
        unsigned long flags;
        int ret;

        spin_lock_irqsave(&rbio->bio_list_lock, flags);
        ret = __rbio_is_full(rbio);
        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
        return ret;
}
/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
                          struct btrfs_raid_bio *cur)
{
        if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
            test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
                return 0;

        if (last->raid_map[0] !=
            cur->raid_map[0])
                return 0;

        /* reads can't merge with writes */
        if (last->read_rebuild !=
            cur->read_rebuild) {
                return 0;
        }

        return 1;
}
/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
        index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
        return rbio->stripe_pages[index];
}
/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
        if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
                return NULL;

        index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
                PAGE_CACHE_SHIFT;
        return rbio->stripe_pages[index];
}
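/*
 * Example of the indexing above, assuming 4K pages and 64K stripes on a
 * 4-disk raid6 (nr_data == 2, illustrative values): each stripe owns 16
 * slots in stripe_pages, so page 0 of P is slot 32 and page 0 of Q is
 * slot 48.
 */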
/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned.
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
        int bucket = rbio_bucket(rbio);
        struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
        struct btrfs_raid_bio *cur;
        struct btrfs_raid_bio *pending;
        unsigned long flags;
        struct btrfs_raid_bio *freeit = NULL;
        int ret = 0;

        spin_lock_irqsave(&h->lock, flags);
        list_for_each_entry(cur, &h->hash_list, hash_list) {
                if (cur->raid_map[0] == rbio->raid_map[0]) {
                        spin_lock(&cur->bio_list_lock);

                        /* can we merge into the lock owner? */
                        if (rbio_can_merge(cur, rbio)) {
                                merge_rbio(cur, rbio);
                                spin_unlock(&cur->bio_list_lock);
                                freeit = rbio;
                                ret = 1;
                                goto out;
                        }

                        /*
                         * we couldn't merge with the running
                         * rbio, see if we can merge with the
                         * pending ones.  We don't have to
                         * check for rmw_locked because there
                         * is no way they are inside finish_rmw
                         * right now
                         */
                        list_for_each_entry(pending, &cur->plug_list,
                                            plug_list) {
                                if (rbio_can_merge(pending, rbio)) {
                                        merge_rbio(pending, rbio);
                                        spin_unlock(&cur->bio_list_lock);
                                        freeit = rbio;
                                        ret = 1;
                                        goto out;
                                }
                        }

                        /* no merging, put us on the tail of the plug list,
                         * our rbio will be started with the currently
                         * running rbio unlocks
                         */
                        list_add_tail(&rbio->plug_list, &cur->plug_list);
                        spin_unlock(&cur->bio_list_lock);
                        ret = 1;
                        goto out;
                }
        }

        atomic_inc(&rbio->refs);
        list_add(&rbio->hash_list, &h->hash_list);
out:
        spin_unlock_irqrestore(&h->lock, flags);
        /* free the merged-away rbio only after h->lock is dropped */
        if (freeit)
                __free_raid_bio(freeit);
        return ret;
}
/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
        int bucket;
        struct btrfs_stripe_hash *h;
        unsigned long flags;

        bucket = rbio_bucket(rbio);
        h = rbio->fs_info->stripe_hash_table->table + bucket;

        spin_lock_irqsave(&h->lock, flags);
        spin_lock(&rbio->bio_list_lock);

        if (!list_empty(&rbio->hash_list)) {

                list_del_init(&rbio->hash_list);
                atomic_dec(&rbio->refs);

                /*
                 * we use the plug list to hold all the rbios
                 * waiting for the chance to lock this stripe.
                 * hand the lock over to one of them.
                 */
                if (!list_empty(&rbio->plug_list)) {
                        struct btrfs_raid_bio *next;
                        struct list_head *head = rbio->plug_list.next;

                        next = list_entry(head, struct btrfs_raid_bio,
                                          plug_list);

                        list_del_init(&rbio->plug_list);

                        list_add(&next->hash_list, &h->hash_list);
                        atomic_inc(&next->refs);
                        spin_unlock(&rbio->bio_list_lock);
                        spin_unlock_irqrestore(&h->lock, flags);

                        if (next->read_rebuild)
                                async_read_rebuild(next);
                        else
                                async_rmw_stripe(next);

                        return;
                } else if (waitqueue_active(&h->wait)) {
                        spin_unlock(&rbio->bio_list_lock);
                        spin_unlock_irqrestore(&h->lock, flags);
                        wake_up(&h->wait);
                        return;
                }
        }

        spin_unlock(&rbio->bio_list_lock);
        spin_unlock_irqrestore(&h->lock, flags);
}
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
        int i;

        WARN_ON(atomic_read(&rbio->refs) < 0);
        if (!atomic_dec_and_test(&rbio->refs))
                return;

        WARN_ON(!list_empty(&rbio->hash_list));
        WARN_ON(!bio_list_empty(&rbio->bio_list));

        for (i = 0; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i]) {
                        __free_page(rbio->stripe_pages[i]);
                        rbio->stripe_pages[i] = NULL;
                }
        }
        kfree(rbio->raid_map);
        kfree(rbio->bbio);
        kfree(rbio);
}
static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
        unlock_stripe(rbio);
        __free_raid_bio(rbio);
}
/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
        struct bio *cur = bio_list_get(&rbio->bio_list);
        struct bio *next;
        free_raid_bio(rbio);

        while (cur) {
                next = cur->bi_next;
                cur->bi_next = NULL;
                if (uptodate)
                        set_bit(BIO_UPTODATE, &cur->bi_flags);
                bio_endio(cur, err);
                cur = next;
        }
}
/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio, int err)
{
        struct btrfs_raid_bio *rbio = bio->bi_private;

        if (err)
                fail_bio_stripe(rbio, bio);

        bio_put(bio);

        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
                return;

        err = 0;

        /* OK, we have read all the stripes we need to. */
        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
                err = -EIO;

        rbio_orig_end_io(rbio, err, 0);
}
/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
                                 int index, int pagenr, int bio_list_only)
{
        int chunk_page;
        struct page *p = NULL;

        chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

        spin_lock_irq(&rbio->bio_list_lock);
        p = rbio->bio_pages[chunk_page];
        spin_unlock_irq(&rbio->bio_list_lock);

        if (p || bio_list_only)
                return p;

        return rbio->stripe_pages[chunk_page];
}
/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
        unsigned long nr = stripe_len * nr_stripes;
        return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}
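/*
 * Example: 64K stripes across 3 drives is 192K total, which needs 48
 * pages when PAGE_CACHE_SIZE is 4K; the round-up only matters when
 * stripe_len is not page aligned.
 */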
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
                          struct btrfs_bio *bbio, u64 *raid_map,
                          u64 stripe_len)
{
        struct btrfs_raid_bio *rbio;
        int nr_data = 0;
        int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
        void *p;

        rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
                        GFP_NOFS);
        if (!rbio) {
                kfree(raid_map);
                kfree(bbio);
                return ERR_PTR(-ENOMEM);
        }

        bio_list_init(&rbio->bio_list);
        INIT_LIST_HEAD(&rbio->plug_list);
        spin_lock_init(&rbio->bio_list_lock);
        INIT_LIST_HEAD(&rbio->hash_list);
        rbio->bbio = bbio;
        rbio->raid_map = raid_map;
        rbio->fs_info = root->fs_info;
        rbio->stripe_len = stripe_len;
        rbio->nr_pages = num_pages;
        rbio->faila = -1;
        rbio->failb = -1;
        atomic_set(&rbio->refs, 1);

        /*
         * the stripe_pages and bio_pages array point to the extra
         * memory we allocated past the end of the rbio
         */
        p = rbio + 1;
        rbio->stripe_pages = p;
        rbio->bio_pages = p + sizeof(struct page *) * num_pages;

        if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
                nr_data = bbio->num_stripes - 2;
        else
                nr_data = bbio->num_stripes - 1;

        rbio->nr_data = nr_data;
        return rbio;
}
/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
        int i;
        struct page *page;

        for (i = 0; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i])
                        continue;
                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
                if (!page)
                        return -ENOMEM;
                rbio->stripe_pages[i] = page;
                ClearPageUptodate(page);
        }
        return 0;
}
/* allocate pages for just the p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
        int i;
        struct page *page;

        i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;

        for (; i < rbio->nr_pages; i++) {
                if (rbio->stripe_pages[i])
                        continue;
                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
                if (!page)
                        return -ENOMEM;
                rbio->stripe_pages[i] = page;
        }
        return 0;
}
/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
                            struct bio_list *bio_list,
                            struct page *page,
                            int stripe_nr,
                            unsigned long page_index,
                            unsigned long bio_max_len)
{
        struct bio *last = bio_list->tail;
        u64 last_end = 0;
        int ret;
        struct bio *bio;
        struct btrfs_bio_stripe *stripe;
        u64 disk_start;

        stripe = &rbio->bbio->stripes[stripe_nr];
        disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

        /* if the device is missing, just fail this stripe */
        if (!stripe->dev->bdev)
                return fail_rbio_index(rbio, stripe_nr);

        /* see if we can add this page onto our existing bio */
        if (last) {
                last_end = (u64)last->bi_sector << 9;
                last_end += last->bi_size;

                /*
                 * we can't merge these if they are from different
                 * devices or if they are not contiguous
                 */
                if (last_end == disk_start && stripe->dev->bdev &&
                    test_bit(BIO_UPTODATE, &last->bi_flags) &&
                    last->bi_bdev == stripe->dev->bdev) {
                        ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
                        if (ret == PAGE_CACHE_SIZE)
                                return 0;
                }
        }

        /* put a new bio on the list */
        bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
        if (!bio)
                return -ENOMEM;

        bio->bi_size = 0;
        bio->bi_bdev = stripe->dev->bdev;
        bio->bi_sector = disk_start >> 9;
        set_bit(BIO_UPTODATE, &bio->bi_flags);

        bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
        bio_list_add(bio_list, bio);
        return 0;
}
/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
        if (rbio->faila >= 0 || rbio->failb >= 0) {
                BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
                __raid56_parity_recover(rbio);
        } else {
                finish_rmw(rbio);
        }
}
/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
{
        int index;
        index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
        index += page;
        return rbio->stripe_pages[index];
}
/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
        struct bio *bio;
        u64 start;
        unsigned long stripe_offset;
        unsigned long page_index;
        struct page *p;
        int i;

        spin_lock_irq(&rbio->bio_list_lock);
        bio_list_for_each(bio, &rbio->bio_list) {
                start = (u64)bio->bi_sector << 9;
                stripe_offset = start - rbio->raid_map[0];
                page_index = stripe_offset >> PAGE_CACHE_SHIFT;

                for (i = 0; i < bio->bi_vcnt; i++) {
                        p = bio->bi_io_vec[i].bv_page;
                        rbio->bio_pages[page_index + i] = p;
                }
        }
        spin_unlock_irq(&rbio->bio_list_lock);
}
/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
        struct btrfs_bio *bbio = rbio->bbio;
        void *pointers[bbio->num_stripes];
        int stripe_len = rbio->stripe_len;
        int nr_data = rbio->nr_data;
        int stripe;
        int pagenr;
        int p_stripe = -1;
        int q_stripe = -1;
        struct bio_list bio_list;
        struct bio *bio;
        int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
        int ret;

        bio_list_init(&bio_list);

        if (bbio->num_stripes - rbio->nr_data == 1) {
                p_stripe = bbio->num_stripes - 1;
        } else if (bbio->num_stripes - rbio->nr_data == 2) {
                p_stripe = bbio->num_stripes - 2;
                q_stripe = bbio->num_stripes - 1;
        } else {
                BUG();
        }

        /* at this point we either have a full stripe,
         * or we've read the full stripe from the drive.
         * recalculate the parity and write the new results.
         *
         * We're not allowed to add any new bios to the
         * bio list here, anyone else that wants to
         * change this stripe needs to do their own rmw.
         */
        spin_lock_irq(&rbio->bio_list_lock);
        set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
        spin_unlock_irq(&rbio->bio_list_lock);

        atomic_set(&rbio->bbio->error, 0);

        /*
         * now that we've set rmw_locked, run through the
         * bio list one last time and map the page pointers
         */
        index_rbio_pages(rbio);

        for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
                struct page *p;

                /* first collect one page from each data stripe */
                for (stripe = 0; stripe < nr_data; stripe++) {
                        p = page_in_rbio(rbio, stripe, pagenr, 0);
                        pointers[stripe] = kmap(p);
                }

                /* then add the parity stripe */
                p = rbio_pstripe_page(rbio, pagenr);
                SetPageUptodate(p);
                pointers[stripe++] = kmap(p);

                if (q_stripe != -1) {

                        /*
                         * raid6, add the qstripe and call the
                         * library function to fill in our p/q
                         */
                        p = rbio_qstripe_page(rbio, pagenr);
                        SetPageUptodate(p);
                        pointers[stripe++] = kmap(p);

                        raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
                                                pointers);
                } else {
                        /* raid5 */
                        memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
                        run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
                }

                for (stripe = 0; stripe < bbio->num_stripes; stripe++)
                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
        }

        /*
         * time to start writing.  Make bios for everything from the
         * higher layers (the bio_list in our rbio) and our p/q.  Ignore
         * everything else.
         */
        for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
                for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
                        struct page *page;

                        if (stripe < rbio->nr_data) {
                                page = page_in_rbio(rbio, stripe, pagenr, 1);
                                if (!page)
                                        continue;
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }

                        ret = rbio_add_io_page(rbio, &bio_list,
                                       page, stripe, pagenr, rbio->stripe_len);
                        if (ret)
                                goto cleanup;
                }
        }

        atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
        BUG_ON(atomic_read(&bbio->stripes_pending) == 0);

        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
                        break;

                bio->bi_private = rbio;
                bio->bi_end_io = raid_write_end_io;
                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
                submit_bio(WRITE, bio);
        }
        return;

cleanup:
        rbio_orig_end_io(rbio, -EIO, 0);
}
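/*
 * Parity math used above: for raid5 the memcpy seeds the P page with
 * data stripe 0 and run_xor folds in the remaining nr_data - 1 stripes,
 * so P = D0 ^ D1 ^ ... ^ Dn.  For raid6 the pointers array is handed to
 * raid6_call.gen_syndrome, which fills both the P and Q pages from the
 * data pages in one pass.
 */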
/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
                           struct bio *bio)
{
        u64 physical = bio->bi_sector;
        u64 stripe_start;
        int i;
        struct btrfs_bio_stripe *stripe;

        physical <<= 9;

        for (i = 0; i < rbio->bbio->num_stripes; i++) {
                stripe = &rbio->bbio->stripes[i];
                stripe_start = stripe->physical;
                if (physical >= stripe_start &&
                    physical < stripe_start + rbio->stripe_len) {
                        return i;
                }
        }
        return -1;
}
/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
                                   struct bio *bio)
{
        u64 logical = bio->bi_sector;
        u64 stripe_start;
        int i;

        logical <<= 9;

        for (i = 0; i < rbio->nr_data; i++) {
                stripe_start = rbio->raid_map[i];
                if (logical >= stripe_start &&
                    logical < stripe_start + rbio->stripe_len) {
                        return i;
                }
        }
        return -1;
}
/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
        unsigned long flags;
        int ret = 0;

        spin_lock_irqsave(&rbio->bio_list_lock, flags);

        /* we already know this stripe is bad, move on */
        if (rbio->faila == failed || rbio->failb == failed)
                goto out;

        if (rbio->faila == -1) {
                /* first failure on this rbio */
                rbio->faila = failed;
                atomic_inc(&rbio->bbio->error);
        } else if (rbio->failb == -1) {
                /* second failure on this rbio */
                rbio->failb = failed;
                atomic_inc(&rbio->bbio->error);
        } else {
                ret = -EIO;
        }
out:
        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

        return ret;
}
/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
                           struct bio *bio)
{
        int failed = find_bio_stripe(rbio, bio);

        if (failed < 0)
                return -EIO;

        return fail_rbio_index(rbio, failed);
}
/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
        int i;
        struct page *p;

        for (i = 0; i < bio->bi_vcnt; i++) {
                p = bio->bi_io_vec[i].bv_page;
                SetPageUptodate(p);
        }
}
/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio, int err)
{
        struct btrfs_raid_bio *rbio = bio->bi_private;

        if (err)
                fail_bio_stripe(rbio, bio);
        else
                set_bio_pages_uptodate(bio);

        bio_put(bio);

        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
                return;

        err = 0;
        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
                goto cleanup;

        /*
         * this will normally call finish_rmw to start our write
         * but if there are any failed stripes we'll reconstruct
         * from parity first
         */
        validate_rbio_for_rmw(rbio);
        return;

cleanup:
        rbio_orig_end_io(rbio, -EIO, 0);
}
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
        rbio->work.flags = 0;
        rbio->work.func = rmw_work;

        btrfs_queue_worker(&rbio->fs_info->rmw_workers,
                           &rbio->work);
}

static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
        rbio->work.flags = 0;
        rbio->work.func = read_rebuild_work;

        btrfs_queue_worker(&rbio->fs_info->rmw_workers,
                           &rbio->work);
}
/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
        int bios_to_read = 0;
        struct btrfs_bio *bbio = rbio->bbio;
        struct bio_list bio_list;
        int ret;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        int pagenr;
        int stripe;
        struct bio *bio;

        bio_list_init(&bio_list);

        ret = alloc_rbio_pages(rbio);
        if (ret)
                goto cleanup;

        index_rbio_pages(rbio);

        atomic_set(&rbio->bbio->error, 0);
        /*
         * build a list of bios to read all the missing parts of this
         * stripe
         */
        for (stripe = 0; stripe < rbio->nr_data; stripe++) {
                for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                        struct page *page;
                        /*
                         * we want to find all the pages missing from
                         * the rbio and read them from the disk.  If
                         * page_in_rbio finds a page in the bio list
                         * we don't need to read it off the stripe.
                         */
                        page = page_in_rbio(rbio, stripe, pagenr, 1);
                        if (page)
                                continue;

                        page = rbio_stripe_page(rbio, stripe, pagenr);
                        ret = rbio_add_io_page(rbio, &bio_list, page,
                                       stripe, pagenr, rbio->stripe_len);
                        if (ret)
                                goto cleanup;
                }
        }

        bios_to_read = bio_list_size(&bio_list);
        if (!bios_to_read) {
                /*
                 * this can happen if others have merged with
                 * us, it means there is nothing left to read.
                 * But if there are missing devices it may not be
                 * safe to do the full stripe write yet.
                 */
                goto finish;
        }

        /*
         * the bbio may be freed once we submit the last bio.  Make sure
         * not to touch it after that
         */
        atomic_set(&bbio->stripes_pending, bios_to_read);
        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
                        break;

                bio->bi_private = rbio;
                bio->bi_end_io = raid_rmw_end_io;

                btrfs_bio_wq_end_io(rbio->fs_info, bio,
                                    BTRFS_WQ_ENDIO_RAID56);

                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
                submit_bio(READ, bio);
        }
        /* the actual write will happen once the reads are done */
        return 0;

cleanup:
        rbio_orig_end_io(rbio, -EIO, 0);
        return -EIO;

finish:
        validate_rbio_for_rmw(rbio);
        return 0;
}
/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
        int ret;

        ret = alloc_rbio_parity_pages(rbio);
        if (ret)
                return ret;

        ret = lock_stripe_add(rbio);
        if (ret == 0)
                finish_rmw(rbio);
        return 0;
}
/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
        int ret;

        ret = lock_stripe_add(rbio);
        if (ret == 0)
                async_rmw_stripe(rbio);
        return 0;
}
/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
        /* head off into rmw land if we don't have a full stripe */
        if (!rbio_is_full(rbio))
                return partial_stripe_write(rbio);
        return full_stripe_write(rbio);
}
/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
                        struct btrfs_bio *bbio, u64 *raid_map,
                        u64 stripe_len)
{
        struct btrfs_raid_bio *rbio;

        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
        if (IS_ERR(rbio))
                return PTR_ERR(rbio);

        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_size;
        return __raid56_parity_write(rbio);
}
/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
        int pagenr, stripe;
        void **pointers;
        int faila = -1, failb = -1;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        struct page *page;
        int err;
        int i;

        pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
                           GFP_NOFS);
        if (!pointers) {
                err = -ENOMEM;
                goto cleanup_io;
        }

        faila = rbio->faila;
        failb = rbio->failb;

        if (rbio->read_rebuild) {
                spin_lock_irq(&rbio->bio_list_lock);
                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
                spin_unlock_irq(&rbio->bio_list_lock);
        }

        index_rbio_pages(rbio);

        for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                /* setup our array of pointers with pages
                 * from each stripe
                 */
                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
                        if (rbio->read_rebuild &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }
                        pointers[stripe] = kmap(page);
                }

                /* all raid6 handling here */
                if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
                    RAID6_Q_STRIPE) {

                        /*
                         * single failure, rebuild from parity raid5
                         * style
                         */
                        if (failb < 0) {
                                if (faila == rbio->nr_data) {
                                        /*
                                         * Just the P stripe has failed, without
                                         * a bad data or Q stripe.
                                         * TODO, we should redo the xor here.
                                         */
                                        err = -EIO;
                                        goto cleanup;
                                }
                                /*
                                 * a single failure in raid6 is rebuilt
                                 * in the pstripe code below
                                 */
                                goto pstripe;
                        }

                        /* make sure our ps and qs are in order */
                        if (faila > failb) {
                                int tmp = failb;
                                failb = faila;
                                faila = tmp;
                        }

                        /* if the q stripe is failed, do a pstripe reconstruction
                         * from the xors.
                         * If both the q stripe and the P stripe are failed, we're
                         * here due to a crc mismatch and we can't give them the
                         * data they want
                         */
                        if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
                                if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
                                        err = -EIO;
                                        goto cleanup;
                                }
                                /*
                                 * otherwise we have one bad data stripe and
                                 * a good P stripe.  raid5!
                                 */
                                goto pstripe;
                        }

                        if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
                                raid6_datap_recov(rbio->bbio->num_stripes,
                                                  PAGE_SIZE, faila, pointers);
                        } else {
                                raid6_2data_recov(rbio->bbio->num_stripes,
                                                  PAGE_SIZE, faila, failb,
                                                  pointers);
                        }
                } else {
                        void *p;

                        /* rebuild from P stripe here (raid5 or raid6) */
                        BUG_ON(failb != -1);
pstripe:
                        /* Copy parity block into failed block to start with */
                        memcpy(pointers[faila],
                               pointers[rbio->nr_data],
                               PAGE_CACHE_SIZE);

                        /* rearrange the pointer array */
                        p = pointers[faila];
                        for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
                                pointers[stripe] = pointers[stripe + 1];
                        pointers[rbio->nr_data - 1] = p;

                        /* xor in the rest */
                        run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
                }
                /* if we're doing this rebuild as part of an rmw, go through
                 * and set all of our private rbio pages in the
                 * failed stripes as uptodate.  This way finish_rmw will
                 * know they can be trusted.  If this was a read reconstruction,
                 * other endio functions will fiddle the uptodate bits
                 */
                if (!rbio->read_rebuild) {
                        for (i = 0; i < nr_pages; i++) {
                                if (faila != -1) {
                                        page = rbio_stripe_page(rbio, faila, i);
                                        SetPageUptodate(page);
                                }
                                if (failb != -1) {
                                        page = rbio_stripe_page(rbio, failb, i);
                                        SetPageUptodate(page);
                                }
                        }
                }
                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
                        /*
                         * if we're rebuilding a read, we have to use
                         * pages from the bio list
                         */
                        if (rbio->read_rebuild &&
                            (stripe == faila || stripe == failb)) {
                                page = page_in_rbio(rbio, stripe, pagenr, 0);
                        } else {
                                page = rbio_stripe_page(rbio, stripe, pagenr);
                        }
                        kunmap(page);
                }
        }

        err = 0;
cleanup:
        kfree(pointers);

cleanup_io:
        if (rbio->read_rebuild) {
                rbio_orig_end_io(rbio, err, err == 0);
        } else if (err == 0) {
                rbio->faila = -1;
                rbio->failb = -1;
                finish_rmw(rbio);
        } else {
                rbio_orig_end_io(rbio, err, 0);
        }
}
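/*
 * Recovery math used above: a single lost data stripe (raid5, or raid6
 * with one failure) is rebuilt by xoring P with the surviving data
 * stripes.  For raid6 double failures, raid6_datap_recov handles one
 * data stripe plus P, and raid6_2data_recov handles two data stripes;
 * losing both P and Q leaves nothing to rebuild data from, hence -EIO.
 */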
/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
        struct btrfs_raid_bio *rbio = bio->bi_private;

        /*
         * we only read stripe pages off the disk, set them
         * up to date if there were no errors
         */
        if (err)
                fail_bio_stripe(rbio, bio);
        else
                set_bio_pages_uptodate(bio);
        bio_put(bio);

        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
                return;

        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
                rbio_orig_end_io(rbio, -EIO, 0);
        else
                __raid_recover_end_io(rbio);
}
/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
        int bios_to_read = 0;
        struct btrfs_bio *bbio = rbio->bbio;
        struct bio_list bio_list;
        int ret;
        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        int pagenr;
        int stripe;
        struct bio *bio;

        bio_list_init(&bio_list);

        ret = alloc_rbio_pages(rbio);
        if (ret)
                goto cleanup;

        atomic_set(&rbio->bbio->error, 0);

        /*
         * read everything that hasn't failed.
         */
        for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
                if (rbio->faila == stripe ||
                    rbio->failb == stripe)
                        continue;

                for (pagenr = 0; pagenr < nr_pages; pagenr++) {
                        struct page *p;

                        /*
                         * the rmw code may have already read this
                         * page in
                         */
                        p = rbio_stripe_page(rbio, stripe, pagenr);
                        if (PageUptodate(p))
                                continue;

                        ret = rbio_add_io_page(rbio, &bio_list,
                                       rbio_stripe_page(rbio, stripe, pagenr),
                                       stripe, pagenr, rbio->stripe_len);
                        if (ret < 0)
                                goto cleanup;
                }
        }

        bios_to_read = bio_list_size(&bio_list);
        if (!bios_to_read) {
                /*
                 * we might have no bios to read just because the pages
                 * were up to date, or we might have no bios to read because
                 * the devices were gone.
                 */
                if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
                        __raid_recover_end_io(rbio);
                        goto out;
                }
                goto cleanup;
        }

        /*
         * the bbio may be freed once we submit the last bio.  Make sure
         * not to touch it after that
         */
        atomic_set(&bbio->stripes_pending, bios_to_read);
        while (1) {
                bio = bio_list_pop(&bio_list);
                if (!bio)
                        break;

                bio->bi_private = rbio;
                bio->bi_end_io = raid_recover_end_io;

                btrfs_bio_wq_end_io(rbio->fs_info, bio,
                                    BTRFS_WQ_ENDIO_RAID56);

                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
                submit_bio(READ, bio);
        }
out:
        return 0;

cleanup:
        if (rbio->read_rebuild)
                rbio_orig_end_io(rbio, -EIO, 0);
        return -EIO;
}
/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                          struct btrfs_bio *bbio, u64 *raid_map,
                          u64 stripe_len, int mirror_num)
{
        struct btrfs_raid_bio *rbio;
        int ret;

        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
        if (IS_ERR(rbio))
                return PTR_ERR(rbio);

        rbio->read_rebuild = 1;
        bio_list_add(&rbio->bio_list, bio);
        rbio->bio_list_bytes = bio->bi_size;

        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
                BUG();
                kfree(rbio);
                return -EIO;
        }

        /*
         * reconstruct from the q stripe if they are
         * asking for mirror 3
         */
        if (mirror_num == 3)
                rbio->failb = bbio->num_stripes - 2;

        ret = lock_stripe_add(rbio);

        /*
         * __raid56_parity_recover will end the bio with
         * any errors it hits.  We don't want to return
         * its error value up the stack because our caller
         * will end up calling bio_endio with any nonzero
         * return
         */
        if (ret == 0)
                __raid56_parity_recover(rbio);
        /*
         * our rbio has been added to the list of
         * rbios that will be handled after the
         * current lock owner is done
         */
        return 0;
}
static void rmw_work(struct btrfs_work *work)
{
        struct btrfs_raid_bio *rbio;

        rbio = container_of(work, struct btrfs_raid_bio, work);
        raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
        struct btrfs_raid_bio *rbio;

        rbio = container_of(work, struct btrfs_raid_bio, work);
        __raid56_parity_recover(rbio);
}