fs/btrfs/raid56.c
1 /*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19 #include <linux/sched.h>
20 #include <linux/wait.h>
21 #include <linux/bio.h>
22 #include <linux/slab.h>
23 #include <linux/buffer_head.h>
24 #include <linux/blkdev.h>
25 #include <linux/random.h>
26 #include <linux/iocontext.h>
27 #include <linux/capability.h>
28 #include <linux/ratelimit.h>
29 #include <linux/kthread.h>
30 #include <linux/raid/pq.h>
31 #include <linux/hash.h>
32 #include <linux/list_sort.h>
33 #include <linux/raid/xor.h>
34 #include <linux/vmalloc.h>
35 #include <asm/div64.h>
36 #include "ctree.h"
37 #include "extent_map.h"
38 #include "disk-io.h"
39 #include "transaction.h"
40 #include "print-tree.h"
41 #include "volumes.h"
42 #include "raid56.h"
43 #include "async-thread.h"
44 #include "check-integrity.h"
45 #include "rcu-string.h"
46
47 /* set when additional merges to this rbio are not allowed */
48 #define RBIO_RMW_LOCKED_BIT 1
49
50 /*
51 * set when this rbio is sitting in the hash, but it is just a cache
52 * of past RMW
53 */
54 #define RBIO_CACHE_BIT 2
55
56 /*
57 * set when it is safe to trust the stripe_pages for caching
58 */
59 #define RBIO_CACHE_READY_BIT 3
60
61
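/*
 * upper bound on the number of cached rbios kept on the per-fs stripe
 * cache LRU; cache_rbio() evicts the least recently used entry once
 * this limit is exceeded
 */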
62 #define RBIO_CACHE_SIZE 1024
63
64 struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio;
67
68 /*
69 * logical block numbers for the start of each stripe
70 * The last one or two are p/q. These are sorted,
71 * so raid_map[0] is the start of our full stripe
72 */
73 u64 *raid_map;
74
75 /* while we're doing rmw on a stripe
76 * we put it into a hash table so we can
77 * lock the stripe and merge more rbios
78 * into it.
79 */
80 struct list_head hash_list;
81
82 /*
83 * LRU list for the stripe cache
84 */
85 struct list_head stripe_cache;
86
87 /*
88 * for scheduling work in the helper threads
89 */
90 struct btrfs_work work;
91
92 /*
93 * bio list and bio_list_lock are used
94 * to add more bios into the stripe
95 * in hopes of avoiding the full rmw
96 */
97 struct bio_list bio_list;
98 spinlock_t bio_list_lock;
99
100 /* also protected by the bio_list_lock, the
101 * plug list is used by the plugging code
102 * to collect partial bios while plugged. The
103 * stripe locking code also uses it to hand off
104 * the stripe lock to the next pending IO
105 */
106 struct list_head plug_list;
107
108 /*
109 * flags that tell us if it is safe to
110 * merge with this bio
111 */
112 unsigned long flags;
113
114 /* size of each individual stripe on disk */
115 int stripe_len;
116
117 /* number of data stripes (no p/q) */
118 int nr_data;
119
120 /*
121 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of
124 * rmw
125 */
126 int read_rebuild;
127
128 /* first bad stripe */
129 int faila;
130
131 /* second bad stripe (for raid6 use) */
132 int failb;
133
134 /*
135 * number of pages needed to represent the full
136 * stripe
137 */
138 int nr_pages;
139
140 /*
141 * size of all the bios in the bio_list. This
142 * helps us decide if the rbio maps to a full
143 * stripe or not
144 */
145 int bio_list_bytes;
146
147 atomic_t refs;
148
149 /*
150 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their
152 * locations when the rbio is allocated
153 */
154
155 /* pointers to pages that we allocated for
156 * reading/writing stripes directly from the disk (including P/Q)
157 */
158 struct page **stripe_pages;
159
160 /*
161 * pointers to the pages in the bio_list. Stored
162 * here for faster lookup
163 */
164 struct page **bio_pages;
165 };
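/*
 * lifetime note: refs starts at 1 in alloc_rbio().  The stripe hash list
 * (lock_stripe_add) and the stripe cache (cache_rbio) each hold one extra
 * reference, dropped again in unlock_stripe() and
 * __remove_rbio_from_cache() respectively.
 */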
166
167 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
168 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
169 static void rmw_work(struct btrfs_work *work);
170 static void read_rebuild_work(struct btrfs_work *work);
171 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
172 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
173 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
174 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
175 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178
179 /*
180 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe
182 */
183 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
184 {
185 struct btrfs_stripe_hash_table *table;
186 struct btrfs_stripe_hash_table *x;
187 struct btrfs_stripe_hash *cur;
188 struct btrfs_stripe_hash *h;
189 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
190 int i;
191 int table_size;
192
193 if (info->stripe_hash_table)
194 return 0;
195
196 /*
197 * The table is large, starting with order 4 and can go as high as
198 * order 7 in case lock debugging is turned on.
199 *
200 * Try harder to allocate and fallback to vmalloc to lower the chance
201 * of a failing mount.
202 */
203 table_size = sizeof(*table) + sizeof(*h) * num_entries;
204 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
205 if (!table) {
206 table = vzalloc(table_size);
207 if (!table)
208 return -ENOMEM;
209 }
210
211 spin_lock_init(&table->cache_lock);
212 INIT_LIST_HEAD(&table->stripe_cache);
213
214 h = table->table;
215
216 for (i = 0; i < num_entries; i++) {
217 cur = h + i;
218 INIT_LIST_HEAD(&cur->hash_list);
219 spin_lock_init(&cur->lock);
220 init_waitqueue_head(&cur->wait);
221 }
222
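/*
 * if someone else raced in and installed a table first, keep theirs and
 * free the one we just built
 */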
223 x = cmpxchg(&info->stripe_hash_table, NULL, table);
224 if (x) {
225 if (is_vmalloc_addr(table))
226 vfree(table);
227 else
228 kfree(table);
229 }
230 return 0;
231 }
232
233 /*
234 * caching an rbio means to copy anything from the
235 * bio_pages array into the stripe_pages array. We
236 * use the page uptodate bit in the stripe cache array
237 * to indicate if it has valid data
238 *
239 * once the caching is done, we set the cache ready
240 * bit.
241 */
242 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
243 {
244 int i;
245 char *s;
246 char *d;
247 int ret;
248
249 ret = alloc_rbio_pages(rbio);
250 if (ret)
251 return;
252
253 for (i = 0; i < rbio->nr_pages; i++) {
254 if (!rbio->bio_pages[i])
255 continue;
256
257 s = kmap(rbio->bio_pages[i]);
258 d = kmap(rbio->stripe_pages[i]);
259
260 memcpy(d, s, PAGE_CACHE_SIZE);
261
262 kunmap(rbio->bio_pages[i]);
263 kunmap(rbio->stripe_pages[i]);
264 SetPageUptodate(rbio->stripe_pages[i]);
265 }
266 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
267 }
268
269 /*
270 * we hash on the first logical address of the stripe
271 */
272 static int rbio_bucket(struct btrfs_raid_bio *rbio)
273 {
274 u64 num = rbio->raid_map[0];
275
276 /*
277 * we shift down quite a bit. We're using byte
278 * addressing, and most of the lower bits are zeros.
279 * This tends to upset hash_64, and it consistently
280 * returns just one or two different values.
281 *
282 * shifting off the lower bits fixes things.
283 */
284 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
285 }
286
287 /*
288 * stealing an rbio means taking all the uptodate pages from the stripe
289 * array in the source rbio and putting them into the destination rbio
290 */
291 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
292 {
293 int i;
294 struct page *s;
295 struct page *d;
296
297 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
298 return;
299
300 for (i = 0; i < dest->nr_pages; i++) {
301 s = src->stripe_pages[i];
302 if (!s || !PageUptodate(s)) {
303 continue;
304 }
305
306 d = dest->stripe_pages[i];
307 if (d)
308 __free_page(d);
309
310 dest->stripe_pages[i] = s;
311 src->stripe_pages[i] = NULL;
312 }
313 }
314
315 /*
316 * merging means we take the bio_list from the victim and
317 * splice it into the destination. The victim should
318 * be discarded afterwards.
319 *
320 * must be called with dest->bio_list_lock held
321 */
322 static void merge_rbio(struct btrfs_raid_bio *dest,
323 struct btrfs_raid_bio *victim)
324 {
325 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes;
327 bio_list_init(&victim->bio_list);
328 }
329
330 /*
331 * used to prune items that are in the cache. The caller
332 * must hold the hash table lock.
333 */
334 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335 {
336 int bucket = rbio_bucket(rbio);
337 struct btrfs_stripe_hash_table *table;
338 struct btrfs_stripe_hash *h;
339 int freeit = 0;
340
341 /*
342 * check the bit again under the hash table lock.
343 */
344 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 return;
346
347 table = rbio->fs_info->stripe_hash_table;
348 h = table->table + bucket;
349
350 /* hold the lock for the bucket because we may be
351 * removing it from the hash table
352 */
353 spin_lock(&h->lock);
354
355 /*
356 * hold the lock for the bio list because we need
357 * to make sure the bio list is empty
358 */
359 spin_lock(&rbio->bio_list_lock);
360
361 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 list_del_init(&rbio->stripe_cache);
363 table->cache_size -= 1;
364 freeit = 1;
365
366 /* if the bio list isn't empty, this rbio is
367 * still involved in an IO. We take it out
368 * of the cache list, and drop the ref that
369 * was held for the list.
370 *
371 * If the bio_list was empty, we also remove
372 * the rbio from the hash_table, and drop
373 * the corresponding ref
374 */
375 if (bio_list_empty(&rbio->bio_list)) {
376 if (!list_empty(&rbio->hash_list)) {
377 list_del_init(&rbio->hash_list);
378 atomic_dec(&rbio->refs);
379 BUG_ON(!list_empty(&rbio->plug_list));
380 }
381 }
382 }
383
384 spin_unlock(&rbio->bio_list_lock);
385 spin_unlock(&h->lock);
386
387 if (freeit)
388 __free_raid_bio(rbio);
389 }
390
391 /*
392 * prune a given rbio from the cache
393 */
394 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395 {
396 struct btrfs_stripe_hash_table *table;
397 unsigned long flags;
398
399 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 return;
401
402 table = rbio->fs_info->stripe_hash_table;
403
404 spin_lock_irqsave(&table->cache_lock, flags);
405 __remove_rbio_from_cache(rbio);
406 spin_unlock_irqrestore(&table->cache_lock, flags);
407 }
408
409 /*
410 * remove everything in the cache
411 */
412 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
413 {
414 struct btrfs_stripe_hash_table *table;
415 unsigned long flags;
416 struct btrfs_raid_bio *rbio;
417
418 table = info->stripe_hash_table;
419
420 spin_lock_irqsave(&table->cache_lock, flags);
421 while (!list_empty(&table->stripe_cache)) {
422 rbio = list_entry(table->stripe_cache.next,
423 struct btrfs_raid_bio,
424 stripe_cache);
425 __remove_rbio_from_cache(rbio);
426 }
427 spin_unlock_irqrestore(&table->cache_lock, flags);
428 }
429
430 /*
431 * remove all cached entries and free the hash table
432 * used by unmount
433 */
434 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435 {
436 if (!info->stripe_hash_table)
437 return;
438 btrfs_clear_rbio_cache(info);
439 if (is_vmalloc_addr(info->stripe_hash_table))
440 vfree(info->stripe_hash_table);
441 else
442 kfree(info->stripe_hash_table);
443 info->stripe_hash_table = NULL;
444 }
445
446 /*
447 * insert an rbio into the stripe cache. It
448 * must have already been prepared by calling
449 * cache_rbio_pages
450 *
451 * If this rbio was already cached, it gets
452 * moved to the front of the lru.
453 *
454 * If the size of the rbio cache is too big, we
455 * prune an item.
456 */
457 static void cache_rbio(struct btrfs_raid_bio *rbio)
458 {
459 struct btrfs_stripe_hash_table *table;
460 unsigned long flags;
461
462 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
463 return;
464
465 table = rbio->fs_info->stripe_hash_table;
466
467 spin_lock_irqsave(&table->cache_lock, flags);
468 spin_lock(&rbio->bio_list_lock);
469
470 /* bump our ref if we were not in the list before */
471 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
472 atomic_inc(&rbio->refs);
473
474 if (!list_empty(&rbio->stripe_cache)) {
475 list_move(&rbio->stripe_cache, &table->stripe_cache);
476 } else {
477 list_add(&rbio->stripe_cache, &table->stripe_cache);
478 table->cache_size += 1;
479 }
480
481 spin_unlock(&rbio->bio_list_lock);
482
483 if (table->cache_size > RBIO_CACHE_SIZE) {
484 struct btrfs_raid_bio *found;
485
486 found = list_entry(table->stripe_cache.prev,
487 struct btrfs_raid_bio,
488 stripe_cache);
489
490 if (found != rbio)
491 __remove_rbio_from_cache(found);
492 }
493
494 spin_unlock_irqrestore(&table->cache_lock, flags);
495 return;
496 }
497
498 /*
499 * helper function to run the xor_blocks api. It is only
500 * able to do MAX_XOR_BLOCKS at a time, so we need to
501 * loop through.
502 */
503 static void run_xor(void **pages, int src_cnt, ssize_t len)
504 {
505 int src_off = 0;
506 int xor_src_cnt = 0;
507 void *dest = pages[src_cnt];
508
509 while (src_cnt > 0) {
510 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
511 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
512
513 src_cnt -= xor_src_cnt;
514 src_off += xor_src_cnt;
515 }
516 }
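/*
 * note that run_xor() xors the sources into pages[src_cnt], so the
 * destination must already be seeded; finish_rmw() does this by copying
 * the first data page into the parity page before xoring in the rest
 */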
517
518 /*
519 * returns true if the bio list inside this rbio
520 * covers an entire stripe (no rmw required).
521 * Must be called with the bio list lock held, or
522 * at a time when you know it is impossible to add
523 * new bios into the list
524 */
525 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
526 {
527 unsigned long size = rbio->bio_list_bytes;
528 int ret = 1;
529
530 if (size != rbio->nr_data * rbio->stripe_len)
531 ret = 0;
532
533 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
534 return ret;
535 }
536
537 static int rbio_is_full(struct btrfs_raid_bio *rbio)
538 {
539 unsigned long flags;
540 int ret;
541
542 spin_lock_irqsave(&rbio->bio_list_lock, flags);
543 ret = __rbio_is_full(rbio);
544 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
545 return ret;
546 }
547
548 /*
549 * returns 1 if it is safe to merge two rbios together.
550 * The merging is safe if the two rbios correspond to
551 * the same stripe and if they are both going in the same
552 * direction (read vs write), and if neither one is
553 * locked for final IO
554 *
555 * The caller is responsible for locking such that
556 * rmw_locked is safe to test
557 */
558 static int rbio_can_merge(struct btrfs_raid_bio *last,
559 struct btrfs_raid_bio *cur)
560 {
561 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
562 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
563 return 0;
564
565 /*
566 * we can't merge with cached rbios, since the
567 * idea is that when we merge the destination
568 * rbio is going to run our IO for us. We can
569 * steal from cached rbio's though, other functions
570 * handle that.
571 */
572 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
573 test_bit(RBIO_CACHE_BIT, &cur->flags))
574 return 0;
575
576 if (last->raid_map[0] !=
577 cur->raid_map[0])
578 return 0;
579
580 /* reads can't merge with writes */
581 if (last->read_rebuild !=
582 cur->read_rebuild) {
583 return 0;
584 }
585
586 return 1;
587 }
588
589 /*
590 * helper to index into the pstripe
591 */
592 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
593 {
594 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
595 return rbio->stripe_pages[index];
596 }
597
598 /*
599 * helper to index into the qstripe, returns null
600 * if there is no qstripe
601 */
602 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603 {
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
605 return NULL;
606
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
608 PAGE_CACHE_SHIFT;
609 return rbio->stripe_pages[index];
610 }
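/*
 * the stripe_pages array is laid out stripe by stripe: all the pages of
 * data stripe 0, then data stripe 1, ..., then P, then (raid6 only) Q.
 * Each helper above just adds the right whole-stripe offset, in units of
 * stripe_len >> PAGE_CACHE_SHIFT pages, before indexing into the array.
 */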
611
612 /*
613 * The first stripe in the table for a logical address
614 * has the lock. rbios are added in one of three ways:
615 *
616 * 1) Nobody has the stripe locked yet. The rbio is given
617 * the lock and 0 is returned. The caller must start the IO
618 * themselves.
619 *
620 * 2) Someone has the stripe locked, but we're able to merge
621 * with the lock owner. The rbio is freed and the IO will
622 * start automatically along with the existing rbio. 1 is returned.
623 *
624 * 3) Someone has the stripe locked, but we're not able to merge.
625 * The rbio is added to the lock owner's plug list, or merged into
626 * an rbio already on the plug list. When the lock owner unlocks,
627 * the next rbio on the list is run and the IO is started automatically.
628 * 1 is returned
629 *
630 * If we return 0, the caller still owns the rbio and must continue with
631 * IO submission. If we return 1, the caller must assume the rbio has
632 * already been freed.
633 */
634 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
635 {
636 int bucket = rbio_bucket(rbio);
637 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
638 struct btrfs_raid_bio *cur;
639 struct btrfs_raid_bio *pending;
640 unsigned long flags;
641 DEFINE_WAIT(wait);
642 struct btrfs_raid_bio *freeit = NULL;
643 struct btrfs_raid_bio *cache_drop = NULL;
644 int ret = 0;
645 int walk = 0;
646
647 spin_lock_irqsave(&h->lock, flags);
648 list_for_each_entry(cur, &h->hash_list, hash_list) {
649 walk++;
650 if (cur->raid_map[0] == rbio->raid_map[0]) {
651 spin_lock(&cur->bio_list_lock);
652
653 /* can we steal this cached rbio's pages? */
654 if (bio_list_empty(&cur->bio_list) &&
655 list_empty(&cur->plug_list) &&
656 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
657 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
658 list_del_init(&cur->hash_list);
659 atomic_dec(&cur->refs);
660
661 steal_rbio(cur, rbio);
662 cache_drop = cur;
663 spin_unlock(&cur->bio_list_lock);
664
665 goto lockit;
666 }
667
668 /* can we merge into the lock owner? */
669 if (rbio_can_merge(cur, rbio)) {
670 merge_rbio(cur, rbio);
671 spin_unlock(&cur->bio_list_lock);
672 freeit = rbio;
673 ret = 1;
674 goto out;
675 }
676
677
678 /*
679 * we couldn't merge with the running
680 * rbio, see if we can merge with the
681 * pending ones. We don't have to
682 * check for rmw_locked because there
683 * is no way they are inside finish_rmw
684 * right now
685 */
686 list_for_each_entry(pending, &cur->plug_list,
687 plug_list) {
688 if (rbio_can_merge(pending, rbio)) {
689 merge_rbio(pending, rbio);
690 spin_unlock(&cur->bio_list_lock);
691 freeit = rbio;
692 ret = 1;
693 goto out;
694 }
695 }
696
697 /* no merging, put us on the tail of the plug list,
698 * our rbio will be started when the currently
699 * running rbio unlocks
700 */
701 list_add_tail(&rbio->plug_list, &cur->plug_list);
702 spin_unlock(&cur->bio_list_lock);
703 ret = 1;
704 goto out;
705 }
706 }
707 lockit:
708 atomic_inc(&rbio->refs);
709 list_add(&rbio->hash_list, &h->hash_list);
710 out:
711 spin_unlock_irqrestore(&h->lock, flags);
712 if (cache_drop)
713 remove_rbio_from_cache(cache_drop);
714 if (freeit)
715 __free_raid_bio(freeit);
716 return ret;
717 }
718
719 /*
720 * called as rmw or parity rebuild is completed. If the plug list has more
721 * rbios waiting for this stripe, the next one on the list will be started
722 */
723 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
724 {
725 int bucket;
726 struct btrfs_stripe_hash *h;
727 unsigned long flags;
728 int keep_cache = 0;
729
730 bucket = rbio_bucket(rbio);
731 h = rbio->fs_info->stripe_hash_table->table + bucket;
732
733 if (list_empty(&rbio->plug_list))
734 cache_rbio(rbio);
735
736 spin_lock_irqsave(&h->lock, flags);
737 spin_lock(&rbio->bio_list_lock);
738
739 if (!list_empty(&rbio->hash_list)) {
740 /*
741 * if we're still cached and there is no other IO
742 * to perform, just leave this rbio here for others
743 * to steal from later
744 */
745 if (list_empty(&rbio->plug_list) &&
746 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
747 keep_cache = 1;
748 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
749 BUG_ON(!bio_list_empty(&rbio->bio_list));
750 goto done;
751 }
752
753 list_del_init(&rbio->hash_list);
754 atomic_dec(&rbio->refs);
755
756 /*
757 * we use the plug list to hold all the rbios
758 * waiting for the chance to lock this stripe.
759 * hand the lock over to one of them.
760 */
761 if (!list_empty(&rbio->plug_list)) {
762 struct btrfs_raid_bio *next;
763 struct list_head *head = rbio->plug_list.next;
764
765 next = list_entry(head, struct btrfs_raid_bio,
766 plug_list);
767
768 list_del_init(&rbio->plug_list);
769
770 list_add(&next->hash_list, &h->hash_list);
771 atomic_inc(&next->refs);
772 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags);
774
775 if (next->read_rebuild)
776 async_read_rebuild(next);
777 else {
778 steal_rbio(rbio, next);
779 async_rmw_stripe(next);
780 }
781
782 goto done_nolock;
783 } else if (waitqueue_active(&h->wait)) {
784 spin_unlock(&rbio->bio_list_lock);
785 spin_unlock_irqrestore(&h->lock, flags);
786 wake_up(&h->wait);
787 goto done_nolock;
788 }
789 }
790 done:
791 spin_unlock(&rbio->bio_list_lock);
792 spin_unlock_irqrestore(&h->lock, flags);
793
794 done_nolock:
795 if (!keep_cache)
796 remove_rbio_from_cache(rbio);
797 }
798
799 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800 {
801 int i;
802
803 WARN_ON(atomic_read(&rbio->refs) < 0);
804 if (!atomic_dec_and_test(&rbio->refs))
805 return;
806
807 WARN_ON(!list_empty(&rbio->stripe_cache));
808 WARN_ON(!list_empty(&rbio->hash_list));
809 WARN_ON(!bio_list_empty(&rbio->bio_list));
810
811 for (i = 0; i < rbio->nr_pages; i++) {
812 if (rbio->stripe_pages[i]) {
813 __free_page(rbio->stripe_pages[i]);
814 rbio->stripe_pages[i] = NULL;
815 }
816 }
817 kfree(rbio->raid_map);
818 kfree(rbio->bbio);
819 kfree(rbio);
820 }
821
822 static void free_raid_bio(struct btrfs_raid_bio *rbio)
823 {
824 unlock_stripe(rbio);
825 __free_raid_bio(rbio);
826 }
827
828 /*
829 * this frees the rbio and runs through all the bios in the
830 * bio_list and calls end_io on them
831 */
832 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833 {
834 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next;
836 free_raid_bio(rbio);
837
838 while (cur) {
839 next = cur->bi_next;
840 cur->bi_next = NULL;
841 if (uptodate)
842 set_bit(BIO_UPTODATE, &cur->bi_flags);
843 bio_endio(cur, err);
844 cur = next;
845 }
846 }
847
848 /*
849 * end io function used by finish_rmw. When we finally
850 * get here, we've written a full stripe
851 */
852 static void raid_write_end_io(struct bio *bio, int err)
853 {
854 struct btrfs_raid_bio *rbio = bio->bi_private;
855
856 if (err)
857 fail_bio_stripe(rbio, bio);
858
859 bio_put(bio);
860
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
862 return;
863
864 err = 0;
865
866 /* OK, we have written all the stripes we need to. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
868 err = -EIO;
869
870 rbio_orig_end_io(rbio, err, 0);
871 return;
872 }
873
874 /*
875 * the read/modify/write code wants to use the original bio for
876 * any pages it included, and then use the rbio for everything
877 * else. This function decides if a given index (stripe number)
878 * and page number in that stripe fall inside the original bio
879 * or the rbio.
880 *
881 * if you set bio_list_only, you'll get a NULL back for any ranges
882 * that are outside the bio_list
883 *
884 * This doesn't take any refs on anything, you get a bare page pointer
885 * and the caller must bump refs as required.
886 *
887 * You must call index_rbio_pages once before you can trust
888 * the answers from this function.
889 */
890 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
891 int index, int pagenr, int bio_list_only)
892 {
893 int chunk_page;
894 struct page *p = NULL;
895
896 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
897
898 spin_lock_irq(&rbio->bio_list_lock);
899 p = rbio->bio_pages[chunk_page];
900 spin_unlock_irq(&rbio->bio_list_lock);
901
902 if (p || bio_list_only)
903 return p;
904
905 return rbio->stripe_pages[chunk_page];
906 }
907
908 /*
909 * number of pages we need for the entire stripe across all the
910 * drives
911 */
912 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913 {
914 unsigned long nr = stripe_len * nr_stripes;
915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
916 }
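/*
 * e.g. with 4K pages, a 64K stripe_len and 3 stripes (2 data + P) needs
 * 48 pages to cover the full stripe across all drives
 */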
917
918 /*
919 * allocation and initial setup for the btrfs_raid_bio. Note that
920 * this does not allocate any pages for rbio->stripe_pages.
921 */
922 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
923 struct btrfs_bio *bbio, u64 *raid_map,
924 u64 stripe_len)
925 {
926 struct btrfs_raid_bio *rbio;
927 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
929 void *p;
930
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
932 GFP_NOFS);
933 if (!rbio) {
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM);
937 }
938
939 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list);
941 spin_lock_init(&rbio->bio_list_lock);
942 INIT_LIST_HEAD(&rbio->stripe_cache);
943 INIT_LIST_HEAD(&rbio->hash_list);
944 rbio->bbio = bbio;
945 rbio->raid_map = raid_map;
946 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages;
949 rbio->faila = -1;
950 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1);
952
953 /*
954 * the stripe_pages and bio_pages array point to the extra
955 * memory we allocated past the end of the rbio
956 */
957 p = rbio + 1;
958 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
960
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2;
963 else
964 nr_data = bbio->num_stripes - 1;
965
966 rbio->nr_data = nr_data;
967 return rbio;
968 }
969
970 /* allocate pages for all the stripes in the bio, including parity */
971 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
972 {
973 int i;
974 struct page *page;
975
976 for (i = 0; i < rbio->nr_pages; i++) {
977 if (rbio->stripe_pages[i])
978 continue;
979 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
980 if (!page)
981 return -ENOMEM;
982 rbio->stripe_pages[i] = page;
983 ClearPageUptodate(page);
984 }
985 return 0;
986 }
987
988 /* allocate pages for just the p/q stripes */
989 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
990 {
991 int i;
992 struct page *page;
993
994 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
995
996 for (; i < rbio->nr_pages; i++) {
997 if (rbio->stripe_pages[i])
998 continue;
999 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1000 if (!page)
1001 return -ENOMEM;
1002 rbio->stripe_pages[i] = page;
1003 }
1004 return 0;
1005 }
1006
1007 /*
1008 * add a single page from a specific stripe into our list of bios for IO
1009 * this will try to merge into existing bios if possible, and returns
1010 * zero if all went well.
1011 */
1012 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1013 struct bio_list *bio_list,
1014 struct page *page,
1015 int stripe_nr,
1016 unsigned long page_index,
1017 unsigned long bio_max_len)
1018 {
1019 struct bio *last = bio_list->tail;
1020 u64 last_end = 0;
1021 int ret;
1022 struct bio *bio;
1023 struct btrfs_bio_stripe *stripe;
1024 u64 disk_start;
1025
1026 stripe = &rbio->bbio->stripes[stripe_nr];
1027 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1028
1029 /* if the device is missing, just fail this stripe */
1030 if (!stripe->dev->bdev)
1031 return fail_rbio_index(rbio, stripe_nr);
1032
1033 /* see if we can add this page onto our existing bio */
1034 if (last) {
1035 last_end = (u64)last->bi_iter.bi_sector << 9;
1036 last_end += last->bi_iter.bi_size;
1037
1038 /*
1039 * we can't merge these if they are from different
1040 * devices or if they are not contiguous
1041 */
1042 if (last_end == disk_start && stripe->dev->bdev &&
1043 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1044 last->bi_bdev == stripe->dev->bdev) {
1045 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1046 if (ret == PAGE_CACHE_SIZE)
1047 return 0;
1048 }
1049 }
1050
1051 /* put a new bio on the list */
1052 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1053 if (!bio)
1054 return -ENOMEM;
1055
1056 bio->bi_iter.bi_size = 0;
1057 bio->bi_bdev = stripe->dev->bdev;
1058 bio->bi_iter.bi_sector = disk_start >> 9;
1059 set_bit(BIO_UPTODATE, &bio->bi_flags);
1060
1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1062 bio_list_add(bio_list, bio);
1063 return 0;
1064 }
1065
1066 /*
1067 * while we're doing the read/modify/write cycle, we could
1068 * have errors in reading pages off the disk. This checks
1069 * for errors and if we're not able to read the page it'll
1070 * trigger parity reconstruction. The rmw will be finished
1071 * after we've reconstructed the failed stripes
1072 */
1073 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074 {
1075 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1077 __raid56_parity_recover(rbio);
1078 } else {
1079 finish_rmw(rbio);
1080 }
1081 }
1082
1083 /*
1084 * these are just the pages from the rbio array, not from anything
1085 * the FS sent down to us
1086 */
1087 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1088 {
1089 int index;
1090 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1091 index += page;
1092 return rbio->stripe_pages[index];
1093 }
1094
1095 /*
1096 * helper function to walk our bio list and populate the bio_pages array with
1097 * the result. This seems expensive, but it is faster than constantly
1098 * searching through the bio list as we setup the IO in finish_rmw or stripe
1099 * reconstruction.
1100 *
1101 * This must be called before you trust the answers from page_in_rbio
1102 */
1103 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1104 {
1105 struct bio *bio;
1106 u64 start;
1107 unsigned long stripe_offset;
1108 unsigned long page_index;
1109 struct page *p;
1110 int i;
1111
1112 spin_lock_irq(&rbio->bio_list_lock);
1113 bio_list_for_each(bio, &rbio->bio_list) {
1114 start = (u64)bio->bi_iter.bi_sector << 9;
1115 stripe_offset = start - rbio->raid_map[0];
1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1117
1118 for (i = 0; i < bio->bi_vcnt; i++) {
1119 p = bio->bi_io_vec[i].bv_page;
1120 rbio->bio_pages[page_index + i] = p;
1121 }
1122 }
1123 spin_unlock_irq(&rbio->bio_list_lock);
1124 }
1125
1126 /*
1127 * this is called from one of two situations. We either
1128 * have a full stripe from the higher layers, or we've read all
1129 * the missing bits off disk.
1130 *
1131 * This will calculate the parity and then send down any
1132 * changed blocks.
1133 */
1134 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135 {
1136 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes];
1138 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data;
1140 int stripe;
1141 int pagenr;
1142 int p_stripe = -1;
1143 int q_stripe = -1;
1144 struct bio_list bio_list;
1145 struct bio *bio;
1146 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1147 int ret;
1148
1149 bio_list_init(&bio_list);
1150
1151 if (bbio->num_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1;
1156 } else {
1157 BUG();
1158 }
1159
1160 /* at this point we either have a full stripe,
1161 * or we've read the full stripe from the drive.
1162 * recalculate the parity and write the new results.
1163 *
1164 * We're not allowed to add any new bios to the
1165 * bio list here, anyone else that wants to
1166 * change this stripe needs to do their own rmw.
1167 */
1168 spin_lock_irq(&rbio->bio_list_lock);
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock);
1171
1172 atomic_set(&rbio->bbio->error, 0);
1173
1174 /*
1175 * now that we've set rmw_locked, run through the
1176 * bio list one last time and map the page pointers
1177 *
1178 * We don't cache full rbios because we're assuming
1179 * the higher layers are unlikely to use this area of
1180 * the disk again soon. If they do use it again,
1181 * hopefully they will send another full bio.
1182 */
1183 index_rbio_pages(rbio);
1184 if (!rbio_is_full(rbio))
1185 cache_rbio_pages(rbio);
1186 else
1187 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1188
1189 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1190 struct page *p;
1191 /* first collect one page from each data stripe */
1192 for (stripe = 0; stripe < nr_data; stripe++) {
1193 p = page_in_rbio(rbio, stripe, pagenr, 0);
1194 pointers[stripe] = kmap(p);
1195 }
1196
1197 /* then add the parity stripe */
1198 p = rbio_pstripe_page(rbio, pagenr);
1199 SetPageUptodate(p);
1200 pointers[stripe++] = kmap(p);
1201
1202 if (q_stripe != -1) {
1203
1204 /*
1205 * raid6, add the qstripe and call the
1206 * library function to fill in our p/q
1207 */
1208 p = rbio_qstripe_page(rbio, pagenr);
1209 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p);
1211
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1213 pointers);
1214 } else {
1215 /* raid5 */
1216 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1217 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1218 }
1219
1220
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 }
1224
1225 /*
1226 * time to start writing. Make bios for everything from the
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else.
1229 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page;
1233 if (stripe < rbio->nr_data) {
1234 page = page_in_rbio(rbio, stripe, pagenr, 1);
1235 if (!page)
1236 continue;
1237 } else {
1238 page = rbio_stripe_page(rbio, stripe, pagenr);
1239 }
1240
1241 ret = rbio_add_io_page(rbio, &bio_list,
1242 page, stripe, pagenr, rbio->stripe_len);
1243 if (ret)
1244 goto cleanup;
1245 }
1246 }
1247
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1250
1251 while (1) {
1252 bio = bio_list_pop(&bio_list);
1253 if (!bio)
1254 break;
1255
1256 bio->bi_private = rbio;
1257 bio->bi_end_io = raid_write_end_io;
1258 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1259 submit_bio(WRITE, bio);
1260 }
1261 return;
1262
1263 cleanup:
1264 rbio_orig_end_io(rbio, -EIO, 0);
1265 }
1266
1267 /*
1268 * helper to find the stripe number for a given bio. Used to figure out which
1269 * stripe has failed. This expects the bio to correspond to a physical disk,
1270 * so it looks up based on physical sector numbers.
1271 */
1272 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1273 struct bio *bio)
1274 {
1275 u64 physical = bio->bi_iter.bi_sector;
1276 u64 stripe_start;
1277 int i;
1278 struct btrfs_bio_stripe *stripe;
1279
1280 physical <<= 9;
1281
1282 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1283 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical;
1285 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) {
1287 return i;
1288 }
1289 }
1290 return -1;
1291 }
1292
1293 /*
1294 * helper to find the stripe number for a given
1295 * bio (before mapping). Used to figure out which stripe has
1296 * failed. This looks up based on logical block numbers.
1297 */
1298 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1299 struct bio *bio)
1300 {
1301 u64 logical = bio->bi_iter.bi_sector;
1302 u64 stripe_start;
1303 int i;
1304
1305 logical <<= 9;
1306
1307 for (i = 0; i < rbio->nr_data; i++) {
1308 stripe_start = rbio->raid_map[i];
1309 if (logical >= stripe_start &&
1310 logical < stripe_start + rbio->stripe_len) {
1311 return i;
1312 }
1313 }
1314 return -1;
1315 }
1316
1317 /*
1318 * returns -EIO if we had too many failures
1319 */
1320 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1321 {
1322 unsigned long flags;
1323 int ret = 0;
1324
1325 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1326
1327 /* we already know this stripe is bad, move on */
1328 if (rbio->faila == failed || rbio->failb == failed)
1329 goto out;
1330
1331 if (rbio->faila == -1) {
1332 /* first failure on this rbio */
1333 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error);
1335 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */
1337 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error);
1339 } else {
1340 ret = -EIO;
1341 }
1342 out:
1343 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1344
1345 return ret;
1346 }
1347
1348 /*
1349 * helper to fail a stripe based on a physical disk
1350 * bio.
1351 */
1352 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1353 struct bio *bio)
1354 {
1355 int failed = find_bio_stripe(rbio, bio);
1356
1357 if (failed < 0)
1358 return -EIO;
1359
1360 return fail_rbio_index(rbio, failed);
1361 }
1362
1363 /*
1364 * this sets each page in the bio uptodate. It should only be used on private
1365 * rbio pages, nothing that comes in from the higher layers
1366 */
1367 static void set_bio_pages_uptodate(struct bio *bio)
1368 {
1369 int i;
1370 struct page *p;
1371
1372 for (i = 0; i < bio->bi_vcnt; i++) {
1373 p = bio->bi_io_vec[i].bv_page;
1374 SetPageUptodate(p);
1375 }
1376 }
1377
1378 /*
1379 * end io for the read phase of the rmw cycle. All the bios here are physical
1380 * stripe bios we've read from the disk so we can recalculate the parity of the
1381 * stripe.
1382 *
1383 * This will usually kick off finish_rmw once all the bios are read in, but it
1384 * may trigger parity reconstruction if we had any errors along the way
1385 */
1386 static void raid_rmw_end_io(struct bio *bio, int err)
1387 {
1388 struct btrfs_raid_bio *rbio = bio->bi_private;
1389
1390 if (err)
1391 fail_bio_stripe(rbio, bio);
1392 else
1393 set_bio_pages_uptodate(bio);
1394
1395 bio_put(bio);
1396
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1398 return;
1399
1400 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1402 goto cleanup;
1403
1404 /*
1405 * this will normally call finish_rmw to start our write
1406 * but if there are any failed stripes we'll reconstruct
1407 * from parity first
1408 */
1409 validate_rbio_for_rmw(rbio);
1410 return;
1411
1412 cleanup:
1413
1414 rbio_orig_end_io(rbio, -EIO, 0);
1415 }
1416
1417 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418 {
1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
1420
1421 btrfs_queue_work(rbio->fs_info->rmw_workers,
1422 &rbio->work);
1423 }
1424
1425 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1426 {
1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
1428
1429 btrfs_queue_work(rbio->fs_info->rmw_workers,
1430 &rbio->work);
1431 }
1432
1433 /*
1434 * the stripe must be locked by the caller. It will
1435 * unlock after all the writes are done
1436 */
1437 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1438 {
1439 int bios_to_read = 0;
1440 struct btrfs_bio *bbio = rbio->bbio;
1441 struct bio_list bio_list;
1442 int ret;
1443 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1444 int pagenr;
1445 int stripe;
1446 struct bio *bio;
1447
1448 bio_list_init(&bio_list);
1449
1450 ret = alloc_rbio_pages(rbio);
1451 if (ret)
1452 goto cleanup;
1453
1454 index_rbio_pages(rbio);
1455
1456 atomic_set(&rbio->bbio->error, 0);
1457 /*
1458 * build a list of bios to read all the missing parts of this
1459 * stripe
1460 */
1461 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1462 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1463 struct page *page;
1464 /*
1465 * we want to find all the pages missing from
1466 * the rbio and read them from the disk. If
1467 * page_in_rbio finds a page in the bio list
1468 * we don't need to read it off the stripe.
1469 */
1470 page = page_in_rbio(rbio, stripe, pagenr, 1);
1471 if (page)
1472 continue;
1473
1474 page = rbio_stripe_page(rbio, stripe, pagenr);
1475 /*
1476 * the bio cache may have handed us an uptodate
1477 * page. If so, be happy and use it
1478 */
1479 if (PageUptodate(page))
1480 continue;
1481
1482 ret = rbio_add_io_page(rbio, &bio_list, page,
1483 stripe, pagenr, rbio->stripe_len);
1484 if (ret)
1485 goto cleanup;
1486 }
1487 }
1488
1489 bios_to_read = bio_list_size(&bio_list);
1490 if (!bios_to_read) {
1491 /*
1492 * this can happen if others have merged with
1493 * us, it means there is nothing left to read.
1494 * But if there are missing devices it may not be
1495 * safe to do the full stripe write yet.
1496 */
1497 goto finish;
1498 }
1499
1500 /*
1501 * the bbio may be freed once we submit the last bio. Make sure
1502 * not to touch it after that
1503 */
1504 atomic_set(&bbio->stripes_pending, bios_to_read);
1505 while (1) {
1506 bio = bio_list_pop(&bio_list);
1507 if (!bio)
1508 break;
1509
1510 bio->bi_private = rbio;
1511 bio->bi_end_io = raid_rmw_end_io;
1512
1513 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1514 BTRFS_WQ_ENDIO_RAID56);
1515
1516 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1517 submit_bio(READ, bio);
1518 }
1519 /* the actual write will happen once the reads are done */
1520 return 0;
1521
1522 cleanup:
1523 rbio_orig_end_io(rbio, -EIO, 0);
1524 return -EIO;
1525
1526 finish:
1527 validate_rbio_for_rmw(rbio);
1528 return 0;
1529 }
1530
1531 /*
1532 * if the upper layers pass in a full stripe, we thank them by only allocating
1533 * enough pages to hold the parity, and sending it all down quickly.
1534 */
1535 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1536 {
1537 int ret;
1538
1539 ret = alloc_rbio_parity_pages(rbio);
1540 if (ret) {
1541 __free_raid_bio(rbio);
1542 return ret;
1543 }
1544
1545 ret = lock_stripe_add(rbio);
1546 if (ret == 0)
1547 finish_rmw(rbio);
1548 return 0;
1549 }
1550
1551 /*
1552 * partial stripe writes get handed over to async helpers.
1553 * We're really hoping to merge a few more writes into this
1554 * rbio before calculating new parity
1555 */
1556 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1557 {
1558 int ret;
1559
1560 ret = lock_stripe_add(rbio);
1561 if (ret == 0)
1562 async_rmw_stripe(rbio);
1563 return 0;
1564 }
1565
1566 /*
1567 * sometimes while we were reading from the drive to
1568 * recalculate parity, enough new bios come in to create
1569 * a full stripe. So we do a check here to see if we can
1570 * go directly to finish_rmw
1571 */
1572 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1573 {
1574 /* head off into rmw land if we don't have a full stripe */
1575 if (!rbio_is_full(rbio))
1576 return partial_stripe_write(rbio);
1577 return full_stripe_write(rbio);
1578 }
1579
1580 /*
1581 * We use plugging call backs to collect full stripes.
1582 * Any time we get a partial stripe write while plugged
1583 * we collect it into a list. When the unplug comes down,
1584 * we sort the list by logical block number and merge
1585 * everything we can into the same rbios
1586 */
1587 struct btrfs_plug_cb {
1588 struct blk_plug_cb cb;
1589 struct btrfs_fs_info *info;
1590 struct list_head rbio_list;
1591 struct btrfs_work work;
1592 };
1593
1594 /*
1595 * rbios on the plug list are sorted for easier merging.
1596 */
1597 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1598 {
1599 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1600 plug_list);
1601 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1602 plug_list);
1603 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1604 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1605
1606 if (a_sector < b_sector)
1607 return -1;
1608 if (a_sector > b_sector)
1609 return 1;
1610 return 0;
1611 }
1612
1613 static void run_plug(struct btrfs_plug_cb *plug)
1614 {
1615 struct btrfs_raid_bio *cur;
1616 struct btrfs_raid_bio *last = NULL;
1617
1618 /*
1619 * sort our plug list then try to merge
1620 * everything we can in hopes of creating full
1621 * stripes.
1622 */
1623 list_sort(NULL, &plug->rbio_list, plug_cmp);
1624 while (!list_empty(&plug->rbio_list)) {
1625 cur = list_entry(plug->rbio_list.next,
1626 struct btrfs_raid_bio, plug_list);
1627 list_del_init(&cur->plug_list);
1628
1629 if (rbio_is_full(cur)) {
1630 /* we have a full stripe, send it down */
1631 full_stripe_write(cur);
1632 continue;
1633 }
1634 if (last) {
1635 if (rbio_can_merge(last, cur)) {
1636 merge_rbio(last, cur);
1637 __free_raid_bio(cur);
1638 continue;
1639
1640 }
1641 __raid56_parity_write(last);
1642 }
1643 last = cur;
1644 }
1645 if (last) {
1646 __raid56_parity_write(last);
1647 }
1648 kfree(plug);
1649 }
1650
1651 /*
1652 * if the unplug comes from schedule, we have to push the
1653 * work off to a helper thread
1654 */
1655 static void unplug_work(struct btrfs_work *work)
1656 {
1657 struct btrfs_plug_cb *plug;
1658 plug = container_of(work, struct btrfs_plug_cb, work);
1659 run_plug(plug);
1660 }
1661
1662 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1663 {
1664 struct btrfs_plug_cb *plug;
1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1666
1667 if (from_schedule) {
1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1669 btrfs_queue_work(plug->info->rmw_workers,
1670 &plug->work);
1671 return;
1672 }
1673 run_plug(plug);
1674 }
1675
1676 /*
1677 * our main entry point for writes from the rest of the FS.
1678 */
1679 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1680 struct btrfs_bio *bbio, u64 *raid_map,
1681 u64 stripe_len)
1682 {
1683 struct btrfs_raid_bio *rbio;
1684 struct btrfs_plug_cb *plug = NULL;
1685 struct blk_plug_cb *cb;
1686
1687 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1688 if (IS_ERR(rbio))
1689 return PTR_ERR(rbio);
1690 bio_list_add(&rbio->bio_list, bio);
1691 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1692
1693 /*
1694 * don't plug on full rbios, just get them out the door
1695 * as quickly as we can
1696 */
1697 if (rbio_is_full(rbio))
1698 return full_stripe_write(rbio);
1699
1700 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1701 sizeof(*plug));
1702 if (cb) {
1703 plug = container_of(cb, struct btrfs_plug_cb, cb);
1704 if (!plug->info) {
1705 plug->info = root->fs_info;
1706 INIT_LIST_HEAD(&plug->rbio_list);
1707 }
1708 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1709 } else {
1710 return __raid56_parity_write(rbio);
1711 }
1712 return 0;
1713 }
1714
1715 /*
1716 * all parity reconstruction happens here. We've read in everything
1717 * we can find from the drives and this does the heavy lifting of
1718 * sorting the good from the bad.
1719 */
1720 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1721 {
1722 int pagenr, stripe;
1723 void **pointers;
1724 int faila = -1, failb = -1;
1725 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1726 struct page *page;
1727 int err;
1728 int i;
1729
1730 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1731 GFP_NOFS);
1732 if (!pointers) {
1733 err = -ENOMEM;
1734 goto cleanup_io;
1735 }
1736
1737 faila = rbio->faila;
1738 failb = rbio->failb;
1739
1740 if (rbio->read_rebuild) {
1741 spin_lock_irq(&rbio->bio_list_lock);
1742 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1743 spin_unlock_irq(&rbio->bio_list_lock);
1744 }
1745
1746 index_rbio_pages(rbio);
1747
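/*
 * The per-page loop below handles the possible failure combinations:
 * a single failed data stripe (raid5 or raid6) is rebuilt by xor from
 * P via the pstripe: path; data + Q failures also fall back to the xor
 * rebuild; data + P failures use raid6_datap_recov(); and two failed
 * data stripes use raid6_2data_recov().  P alone, or P + Q together,
 * cannot be rebuilt here and return -EIO.
 */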
1748 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1749 /* setup our array of pointers with pages
1750 * from each stripe
1751 */
1752 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1753 /*
1754 * if we're rebuilding a read, we have to use
1755 * pages from the bio list
1756 */
1757 if (rbio->read_rebuild &&
1758 (stripe == faila || stripe == failb)) {
1759 page = page_in_rbio(rbio, stripe, pagenr, 0);
1760 } else {
1761 page = rbio_stripe_page(rbio, stripe, pagenr);
1762 }
1763 pointers[stripe] = kmap(page);
1764 }
1765
1766 /* all raid6 handling here */
1767 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1768 RAID6_Q_STRIPE) {
1769
1770 /*
1771 * single failure, rebuild from parity raid5
1772 * style
1773 */
1774 if (failb < 0) {
1775 if (faila == rbio->nr_data) {
1776 /*
1777 * Just the P stripe has failed, without
1778 * a bad data or Q stripe.
1779 * TODO, we should redo the xor here.
1780 */
1781 err = -EIO;
1782 goto cleanup;
1783 }
1784 /*
1785 * a single failure in raid6 is rebuilt
1786 * in the pstripe code below
1787 */
1788 goto pstripe;
1789 }
1790
1791 /* make sure our ps and qs are in order */
1792 if (faila > failb) {
1793 int tmp = failb;
1794 failb = faila;
1795 faila = tmp;
1796 }
1797
1798 /* if the q stripe is failed, do a pstripe reconstruction
1799 * from the xors.
1800 * If both the q stripe and the P stripe are failed, we're
1801 * here due to a crc mismatch and we can't give them the
1802 * data they want
1803 */
1804 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1805 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1806 err = -EIO;
1807 goto cleanup;
1808 }
1809 /*
1810 * otherwise we have one bad data stripe and
1811 * a good P stripe. raid5!
1812 */
1813 goto pstripe;
1814 }
1815
1816 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1817 raid6_datap_recov(rbio->bbio->num_stripes,
1818 PAGE_SIZE, faila, pointers);
1819 } else {
1820 raid6_2data_recov(rbio->bbio->num_stripes,
1821 PAGE_SIZE, faila, failb,
1822 pointers);
1823 }
1824 } else {
1825 void *p;
1826
1827 /* rebuild from P stripe here (raid5 or raid6) */
1828 BUG_ON(failb != -1);
1829 pstripe:
1830 /* Copy parity block into failed block to start with */
1831 memcpy(pointers[faila],
1832 pointers[rbio->nr_data],
1833 PAGE_CACHE_SIZE);
1834
1835 /* rearrange the pointer array */
1836 p = pointers[faila];
1837 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1838 pointers[stripe] = pointers[stripe + 1];
1839 pointers[rbio->nr_data - 1] = p;
1840
1841 /* xor in the rest */
1842 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1843 }
1844 /* if we're doing this rebuild as part of an rmw, go through
1845 * and set all of our private rbio pages in the
1846 * failed stripes as uptodate. This way finish_rmw will
1847 * know they can be trusted. If this was a read reconstruction,
1848 * other endio functions will fiddle the uptodate bits
1849 */
1850 if (!rbio->read_rebuild) {
1851 for (i = 0; i < nr_pages; i++) {
1852 if (faila != -1) {
1853 page = rbio_stripe_page(rbio, faila, i);
1854 SetPageUptodate(page);
1855 }
1856 if (failb != -1) {
1857 page = rbio_stripe_page(rbio, failb, i);
1858 SetPageUptodate(page);
1859 }
1860 }
1861 }
1862 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1863 /*
1864 * if we're rebuilding a read, we have to use
1865 * pages from the bio list
1866 */
1867 if (rbio->read_rebuild &&
1868 (stripe == faila || stripe == failb)) {
1869 page = page_in_rbio(rbio, stripe, pagenr, 0);
1870 } else {
1871 page = rbio_stripe_page(rbio, stripe, pagenr);
1872 }
1873 kunmap(page);
1874 }
1875 }
1876
1877 err = 0;
1878 cleanup:
1879 kfree(pointers);
1880
1881 cleanup_io:
1882
1883 if (rbio->read_rebuild) {
1884 if (err == 0)
1885 cache_rbio_pages(rbio);
1886 else
1887 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1888
1889 rbio_orig_end_io(rbio, err, err == 0);
1890 } else if (err == 0) {
1891 rbio->faila = -1;
1892 rbio->failb = -1;
1893 finish_rmw(rbio);
1894 } else {
1895 rbio_orig_end_io(rbio, err, 0);
1896 }
1897 }
1898
1899 /*
1900 * This is called only for stripes we've read from disk to
1901 * reconstruct the parity.
1902 */
1903 static void raid_recover_end_io(struct bio *bio, int err)
1904 {
1905 struct btrfs_raid_bio *rbio = bio->bi_private;
1906
1907 /*
1908 * we only read stripe pages off the disk, set them
1909 * up to date if there were no errors
1910 */
1911 if (err)
1912 fail_bio_stripe(rbio, bio);
1913 else
1914 set_bio_pages_uptodate(bio);
1915 bio_put(bio);
1916
1917 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1918 return;
1919
1920 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1921 rbio_orig_end_io(rbio, -EIO, 0);
1922 else
1923 __raid_recover_end_io(rbio);
1924 }
1925
1926 /*
1927 * reads everything we need off the disk to reconstruct
1928 * the parity. endio handlers trigger final reconstruction
1929 * when the IO is done.
1930 *
1931 * This is used both for reads from the higher layers and for
1932 * parity construction required to finish a rmw cycle.
1933 */
1934 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1935 {
1936 int bios_to_read = 0;
1937 struct btrfs_bio *bbio = rbio->bbio;
1938 struct bio_list bio_list;
1939 int ret;
1940 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1941 int pagenr;
1942 int stripe;
1943 struct bio *bio;
1944
1945 bio_list_init(&bio_list);
1946
1947 ret = alloc_rbio_pages(rbio);
1948 if (ret)
1949 goto cleanup;
1950
1951 atomic_set(&rbio->bbio->error, 0);
1952
1953 /*
1954 * read everything that hasn't failed. Thanks to the
1955 * stripe cache, it is possible that some or all of these
1956 * pages are going to be uptodate.
1957 */
1958 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1959 if (rbio->faila == stripe || rbio->failb == stripe) {
1960 atomic_inc(&rbio->bbio->error);
1961 continue;
1962 }
1963
1964 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1965 struct page *p;
1966
1967 /*
1968 * the rmw code may have already read this
1969 * page in
1970 */
1971 p = rbio_stripe_page(rbio, stripe, pagenr);
1972 if (PageUptodate(p))
1973 continue;
1974
1975 ret = rbio_add_io_page(rbio, &bio_list,
1976 rbio_stripe_page(rbio, stripe, pagenr),
1977 stripe, pagenr, rbio->stripe_len);
1978 if (ret < 0)
1979 goto cleanup;
1980 }
1981 }
1982
1983 bios_to_read = bio_list_size(&bio_list);
1984 if (!bios_to_read) {
1985 /*
1986 * we might have no bios to read just because the pages
1987 * were up to date, or we might have no bios to read because
1988 * the devices were gone.
1989 */
1990 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1991 __raid_recover_end_io(rbio);
1992 goto out;
1993 } else {
1994 goto cleanup;
1995 }
1996 }
1997
1998 /*
1999 * the bbio may be freed once we submit the last bio. Make sure
2000 * not to touch it after that
2001 */
2002 atomic_set(&bbio->stripes_pending, bios_to_read);
2003 while (1) {
2004 bio = bio_list_pop(&bio_list);
2005 if (!bio)
2006 break;
2007
2008 bio->bi_private = rbio;
2009 bio->bi_end_io = raid_recover_end_io;
2010
2011 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2012 BTRFS_WQ_ENDIO_RAID56);
2013
2014 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2015 submit_bio(READ, bio);
2016 }
2017 out:
2018 return 0;
2019
2020 cleanup:
2021 if (rbio->read_rebuild)
2022 rbio_orig_end_io(rbio, -EIO, 0);
2023 return -EIO;
2024 }
2025
2026 /*
2027 * the main entry point for reads from the higher layers. This
2028 * is really only called when the normal read path had a failure,
2029 * so we assume the bio they send down corresponds to a failed part
2030 * of the drive.
2031 */
2032 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2033 struct btrfs_bio *bbio, u64 *raid_map,
2034 u64 stripe_len, int mirror_num)
2035 {
2036 struct btrfs_raid_bio *rbio;
2037 int ret;
2038
2039 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2040 if (IS_ERR(rbio))
2041 return PTR_ERR(rbio);
2042
2043 rbio->read_rebuild = 1;
2044 bio_list_add(&rbio->bio_list, bio);
2045 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2046
2047 rbio->faila = find_logical_bio_stripe(rbio, bio);
2048 if (rbio->faila == -1) {
2049 BUG();
2050 kfree(raid_map);
2051 kfree(bbio);
2052 kfree(rbio);
2053 return -EIO;
2054 }
2055
2056 /*
2057 * reconstruct from the q stripe if they are
2058 * asking for mirror 3
2059 */
2060 if (mirror_num == 3)
2061 rbio->failb = bbio->num_stripes - 2;
2062
2063 ret = lock_stripe_add(rbio);
2064
2065 /*
2066 * __raid56_parity_recover will end the bio with
2067 * any errors it hits. We don't want to return
2068 * its error value up the stack because our caller
2069 * will end up calling bio_endio with any nonzero
2070 * return
2071 */
2072 if (ret == 0)
2073 __raid56_parity_recover(rbio);
2074 /*
2075 * our rbio has been added to the list of
2076 * rbios that will be handled after the
2077 * currently lock owner is done
2078 */
2079 return 0;
2080
2081 }
2082
2083 static void rmw_work(struct btrfs_work *work)
2084 {
2085 struct btrfs_raid_bio *rbio;
2086
2087 rbio = container_of(work, struct btrfs_raid_bio, work);
2088 raid56_rmw_stripe(rbio);
2089 }
2090
2091 static void read_rebuild_work(struct btrfs_work *work)
2092 {
2093 struct btrfs_raid_bio *rbio;
2094
2095 rbio = container_of(work, struct btrfs_raid_bio, work);
2096 __raid56_parity_recover(rbio);
2097 }