fs/btrfs/scrub.c

   1 /*
   2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/blkdev.h>
  20 #include <linux/ratelimit.h>
  21 #include "ctree.h"
  22 #include "volumes.h"
  23 #include "disk-io.h"
  24 #include "ordered-data.h"
  25 #include "transaction.h"
  26 #include "backref.h"
  27 #include "extent_io.h"
  28 #include "dev-replace.h"
  29 #include "check-integrity.h"
  30 #include "rcu-string.h"
  31 #include "raid56.h"
  32
  33 /*
  34  * This is only the first step towards a full-features scrub. It reads all
  35  * extent and super block and verifies the checksums. In case a bad checksum
  36  * is found or the extent cannot be read, good data will be written back if
  37  * any can be found.
  38  *
  39  * Future enhancements:
  40  *  - In case an unrepairable extent is encountered, track which files are
  41  *    affected and report them
  42  *  - track and record media errors, throw out bad devices
  43  *  - add a mode to also read unallocated space
  44  */
  45
  46 struct scrub_block;
  47 struct scrub_ctx;
  48
  49 /*
  50  * the following three values only influence the performance.
  51  * The last one configures the number of parallel and outstanding I/O
  52  * operations. The first two values configure an upper limit for the number
  53  * of (dynamically allocated) pages that are added to a bio.
  54  */
  55 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
  56 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
  57 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
  58
  59 /*
  60  * the following value times PAGE_SIZE needs to be large enough to match the
  61  * largest node/leaf/sector size that shall be supported.
  62  * Values larger than BTRFS_STRIPE_LEN are not supported.
  63  */
  64 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  65
  66 struct scrub_recover {
  67         atomic_t                refs;
  68         struct btrfs_bio        *bbio;
  69         u64                     map_length;
  70 };
  71
  72 struct scrub_page {
  73         struct scrub_block      *sblock;
  74         struct page             *page;
  75         struct btrfs_device     *dev;
  76         struct list_head        list;
  77         u64                     flags;  /* extent flags */
  78         u64                     generation;
  79         u64                     logical;
  80         u64                     physical;
  81         u64                     physical_for_dev_replace;
  82         atomic_t                refs;
  83         struct {
  84                 unsigned int    mirror_num:8;
  85                 unsigned int    have_csum:1;
  86                 unsigned int    io_error:1;
  87         };
  88         u8                      csum[BTRFS_CSUM_SIZE];
  89
  90         struct scrub_recover    *recover;
  91 };
  92
  93 struct scrub_bio {
  94         int                     index;
  95         struct scrub_ctx        *sctx;
  96         struct btrfs_device     *dev;
  97         struct bio              *bio;
  98         int                     err;
  99         u64                     logical;
 100         u64                     physical;
 101 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
 102         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
 103 #else
 104         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
 105 #endif
 106         int                     page_count;
 107         int                     next_free;
 108         struct btrfs_work       work;
 109 };
 110
 111 struct scrub_block {
 112         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 113         int                     page_count;
 114         atomic_t                outstanding_pages;
 115         atomic_t                refs; /* free mem on transition to zero */
 116         struct scrub_ctx        *sctx;
 117         struct scrub_parity     *sparity;
 118         struct {
 119                 unsigned int    header_error:1;
 120                 unsigned int    checksum_error:1;
 121                 unsigned int    no_io_error_seen:1;
 122                 unsigned int    generation_error:1; /* also sets header_error */
 123
 124                 /* The following is for the data used to check parity */
 125                 /* It is for the data with checksum */
 126                 unsigned int    data_corrected:1;
 127         };
 128 };
 129
 130 /* Used for the chunks with parity stripe such RAID5/6 */
 131 struct scrub_parity {
 132         struct scrub_ctx        *sctx;
 133
 134         struct btrfs_device     *scrub_dev;
 135
 136         u64                     logic_start;
 137
 138         u64                     logic_end;
 139
 140         int                     nsectors;
 141
 142         int                     stripe_len;
 143
 144         atomic_t                refs;
 145
 146         struct list_head        spages;
 147
 148         /* Work of parity check and repair */
 149         struct btrfs_work       work;
 150
 151         /* Mark the parity blocks which have data */
 152         unsigned long           *dbitmap;
 153
 154         /*
 155          * Mark the parity blocks which have data, but errors happen when
 156          * read data or check data
 157          */
 158         unsigned long           *ebitmap;
 159
 160         unsigned long           bitmap[0];
 161 };
 162
 163 struct scrub_wr_ctx {
 164         struct scrub_bio *wr_curr_bio;
 165         struct btrfs_device *tgtdev;
 166         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 167         atomic_t flush_all_writes;
 168         struct mutex wr_lock;
 169 };
 170
 171 struct scrub_ctx {
 172         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
 173         struct btrfs_root       *dev_root;
 174         int                     first_free;
 175         int                     curr;
 176         atomic_t                bios_in_flight;
 177         atomic_t                workers_pending;
 178         spinlock_t              list_lock;
 179         wait_queue_head_t       list_wait;
 180         u16                     csum_size;
 181         struct list_head        csum_list;
 182         atomic_t                cancel_req;
 183         int                     readonly;
 184         int                     pages_per_rd_bio;
 185         u32                     sectorsize;
 186         u32                     nodesize;
 187
 188         int                     is_dev_replace;
 189         struct scrub_wr_ctx     wr_ctx;
 190
 191         /*
 192          * statistics
 193          */
 194         struct btrfs_scrub_progress stat;
 195         spinlock_t              stat_lock;
 196
 197         /*
 198          * Use a ref counter to avoid use-after-free issues. Scrub workers
 199          * decrement bios_in_flight and workers_pending and then do a wakeup
 200          * on the list_wait wait queue. We must ensure the main scrub task
 201          * doesn't free the scrub context before or while the workers are
 202          * doing the wakeup() call.
 203          */
 204         atomic_t                refs;
 205 };
 206
 207 struct scrub_fixup_nodatasum {
 208         struct scrub_ctx        *sctx;
 209         struct btrfs_device     *dev;
 210         u64                     logical;
 211         struct btrfs_root       *root;
 212         struct btrfs_work       work;
 213         int                     mirror_num;
 214 };
 215
 216 struct scrub_nocow_inode {
 217         u64                     inum;
 218         u64                     offset;
 219         u64                     root;
 220         struct list_head        list;
 221 };
 222
 223 struct scrub_copy_nocow_ctx {
 224         struct scrub_ctx        *sctx;
 225         u64                     logical;
 226         u64                     len;
 227         int                     mirror_num;
 228         u64                     physical_for_dev_replace;
 229         struct list_head        inodes;
 230         struct btrfs_work       work;
 231 };
 232
 233 struct scrub_warning {
 234         struct btrfs_path       *path;
 235         u64                     extent_item_size;
 236         const char              *errstr;
 237         sector_t                sector;
 238         u64                     logical;
 239         struct btrfs_device     *dev;
 240 };
 241
 242 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 243 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 244 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 245 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 246 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 247 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 248                                      struct scrub_block *sblocks_for_recheck);
 249 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 250                                 struct scrub_block *sblock, int is_metadata,
 251                                 int have_csum, u8 *csum, u64 generation,
 252                                 u16 csum_size, int retry_failed_mirror);
 253 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 254                                          struct scrub_block *sblock,
 255                                          int is_metadata, int have_csum,
 256                                          const u8 *csum, u64 generation,
 257                                          u16 csum_size);
 258 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 259                                              struct scrub_block *sblock_good);
 260 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 261                                             struct scrub_block *sblock_good,
 262                                             int page_num, int force_write);
 263 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 264 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 265                                            int page_num);
 266 static int scrub_checksum_data(struct scrub_block *sblock);
 267 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 268 static int scrub_checksum_super(struct scrub_block *sblock);
 269 static void scrub_block_get(struct scrub_block *sblock);
 270 static void scrub_block_put(struct scrub_block *sblock);
 271 static void scrub_page_get(struct scrub_page *spage);
 272 static void scrub_page_put(struct scrub_page *spage);
 273 static void scrub_parity_get(struct scrub_parity *sparity);
 274 static void scrub_parity_put(struct scrub_parity *sparity);
 275 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 276                                     struct scrub_page *spage);
 277 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 278                        u64 physical, struct btrfs_device *dev, u64 flags,
 279                        u64 gen, int mirror_num, u8 *csum, int force,
 280                        u64 physical_for_dev_replace);
 281 static void scrub_bio_end_io(struct bio *bio);
 282 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 283 static void scrub_block_complete(struct scrub_block *sblock);
 284 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 285                                u64 extent_logical, u64 extent_len,
 286                                u64 *extent_physical,
 287                                struct btrfs_device **extent_dev,
 288                                int *extent_mirror_num);
 289 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 290                               struct scrub_wr_ctx *wr_ctx,
 291                               struct btrfs_fs_info *fs_info,
 292                               struct btrfs_device *dev,
 293                               int is_dev_replace);
 294 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 295 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 296                                     struct scrub_page *spage);
 297 static void scrub_wr_submit(struct scrub_ctx *sctx);
 298 static void scrub_wr_bio_end_io(struct bio *bio);
 299 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 300 static int write_page_nocow(struct scrub_ctx *sctx,
 301                             u64 physical_for_dev_replace, struct page *page);
 302 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 303                                       struct scrub_copy_nocow_ctx *ctx);
 304 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 305                             int mirror_num, u64 physical_for_dev_replace);
 306 static void copy_nocow_pages_worker(struct btrfs_work *work);
 307 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 308 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 309 static void scrub_put_ctx(struct scrub_ctx *sctx);
 310
 311
 312 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 313 {
 314         atomic_inc(&sctx->refs);
 315         atomic_inc(&sctx->bios_in_flight);
 316 }
 317
 318 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 319 {
 320         atomic_dec(&sctx->bios_in_flight);
 321         wake_up(&sctx->list_wait);
 322         scrub_put_ctx(sctx);
 323 }
 324
 325 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 326 {
 327         while (atomic_read(&fs_info->scrub_pause_req)) {
 328                 mutex_unlock(&fs_info->scrub_lock);
 329                 wait_event(fs_info->scrub_pause_wait,
 330                    atomic_read(&fs_info->scrub_pause_req) == 0);
 331                 mutex_lock(&fs_info->scrub_lock);
 332         }
 333 }
 334
 335 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 336 {
 337         atomic_inc(&fs_info->scrubs_paused);
 338         wake_up(&fs_info->scrub_pause_wait);
 339
 340         mutex_lock(&fs_info->scrub_lock);
 341         __scrub_blocked_if_needed(fs_info);
 342         atomic_dec(&fs_info->scrubs_paused);
 343         mutex_unlock(&fs_info->scrub_lock);
 344
 345         wake_up(&fs_info->scrub_pause_wait);
 346 }
 347
 348 /*
 349  * used for workers that require transaction commits (i.e., for the
 350  * NOCOW case)
 351  */
 352 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 353 {
 354         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 355
 356         atomic_inc(&sctx->refs);
 357         /*
 358          * increment scrubs_running to prevent cancel requests from
 359          * completing as long as a worker is running. we must also
 360          * increment scrubs_paused to prevent deadlocking on pause
 361          * requests used for transactions commits (as the worker uses a
 362          * transaction context). it is safe to regard the worker
 363          * as paused for all matters practical. effectively, we only
 364          * avoid cancellation requests from completing.
 365          */
 366         mutex_lock(&fs_info->scrub_lock);
 367         atomic_inc(&fs_info->scrubs_running);
 368         atomic_inc(&fs_info->scrubs_paused);
 369         mutex_unlock(&fs_info->scrub_lock);
 370
 371         /*
 372          * check if @scrubs_running=@scrubs_paused condition
 373          * inside wait_event() is not an atomic operation.
 374          * which means we may inc/dec @scrub_running/paused
 375          * at any time. Let's wake up @scrub_pause_wait as
 376          * much as we can to let commit transaction blocked less.
 377          */
 378         wake_up(&fs_info->scrub_pause_wait);
 379
 380         atomic_inc(&sctx->workers_pending);
 381 }
 382
 383 /* used for workers that require transaction commits */
 384 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 385 {
 386         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 387
 388         /*
 389          * see scrub_pending_trans_workers_inc() why we're pretending
 390          * to be paused in the scrub counters
 391          */
 392         mutex_lock(&fs_info->scrub_lock);
 393         atomic_dec(&fs_info->scrubs_running);
 394         atomic_dec(&fs_info->scrubs_paused);
 395         mutex_unlock(&fs_info->scrub_lock);
 396         atomic_dec(&sctx->workers_pending);
 397         wake_up(&fs_info->scrub_pause_wait);
 398         wake_up(&sctx->list_wait);
 399         scrub_put_ctx(sctx);
 400 }
 401
 402 static void scrub_free_csums(struct scrub_ctx *sctx)
 403 {
 404         while (!list_empty(&sctx->csum_list)) {
 405                 struct btrfs_ordered_sum *sum;
 406                 sum = list_first_entry(&sctx->csum_list,
 407                                        struct btrfs_ordered_sum, list);
 408                 list_del(&sum->list);
 409                 kfree(sum);
 410         }
 411 }
 412
 413 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 414 {
 415         int i;
 416
 417         if (!sctx)
 418                 return;
 419
 420         scrub_free_wr_ctx(&sctx->wr_ctx);
 421
 422         /* this can happen when scrub is cancelled */
 423         if (sctx->curr != -1) {
 424                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
 425
 426                 for (i = 0; i < sbio->page_count; i++) {
 427                         WARN_ON(!sbio->pagev[i]->page);
 428                         scrub_block_put(sbio->pagev[i]->sblock);
 429                 }
 430                 bio_put(sbio->bio);
 431         }
 432
 433         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 434                 struct scrub_bio *sbio = sctx->bios[i];
 435
 436                 if (!sbio)
 437                         break;
 438                 kfree(sbio);
 439         }
 440
 441         scrub_free_csums(sctx);
 442         kfree(sctx);
 443 }
 444
 445 static void scrub_put_ctx(struct scrub_ctx *sctx)
 446 {
 447         if (atomic_dec_and_test(&sctx->refs))
 448                 scrub_free_ctx(sctx);
 449 }
 450
 451 static noinline_for_stack
 452 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 453 {
 454         struct scrub_ctx *sctx;
 455         int             i;
 456         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 457         int ret;
 458
 459         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 460         if (!sctx)
 461                 goto nomem;
 462         atomic_set(&sctx->refs, 1);
 463         sctx->is_dev_replace = is_dev_replace;
 464         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 465         sctx->curr = -1;
 466         sctx->dev_root = dev->dev_root;
 467         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 468                 struct scrub_bio *sbio;
 469
 470                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 471                 if (!sbio)
 472                         goto nomem;
 473                 sctx->bios[i] = sbio;
 474
 475                 sbio->index = i;
 476                 sbio->sctx = sctx;
 477                 sbio->page_count = 0;
 478                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
 479                                 scrub_bio_end_io_worker, NULL, NULL);
 480
 481                 if (i != SCRUB_BIOS_PER_SCTX - 1)
 482                         sctx->bios[i]->next_free = i + 1;
 483                 else
 484                         sctx->bios[i]->next_free = -1;
 485         }
 486         sctx->first_free = 0;
 487         sctx->nodesize = dev->dev_root->nodesize;
 488         sctx->sectorsize = dev->dev_root->sectorsize;
 489         atomic_set(&sctx->bios_in_flight, 0);
 490         atomic_set(&sctx->workers_pending, 0);
 491         atomic_set(&sctx->cancel_req, 0);
 492         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 493         INIT_LIST_HEAD(&sctx->csum_list);
 494
 495         spin_lock_init(&sctx->list_lock);
 496         spin_lock_init(&sctx->stat_lock);
 497         init_waitqueue_head(&sctx->list_wait);
 498
 499         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
 500                                  fs_info->dev_replace.tgtdev, is_dev_replace);
 501         if (ret) {
 502                 scrub_free_ctx(sctx);
 503                 return ERR_PTR(ret);
 504         }
 505         return sctx;
 506
 507 nomem:
 508         scrub_free_ctx(sctx);
 509         return ERR_PTR(-ENOMEM);
 510 }
 511
 512 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 513                                      void *warn_ctx)
 514 {
 515         u64 isize;
 516         u32 nlink;
 517         int ret;
 518         int i;
 519         struct extent_buffer *eb;
 520         struct btrfs_inode_item *inode_item;
 521         struct scrub_warning *swarn = warn_ctx;
 522         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 523         struct inode_fs_paths *ipath = NULL;
 524         struct btrfs_root *local_root;
 525         struct btrfs_key root_key;
 526         struct btrfs_key key;
 527
 528         root_key.objectid = root;
 529         root_key.type = BTRFS_ROOT_ITEM_KEY;
 530         root_key.offset = (u64)-1;
 531         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 532         if (IS_ERR(local_root)) {
 533                 ret = PTR_ERR(local_root);
 534                 goto err;
 535         }
 536
 537         /*
 538          * this makes the path point to (inum INODE_ITEM ioff)
 539          */
 540         key.objectid = inum;
 541         key.type = BTRFS_INODE_ITEM_KEY;
 542         key.offset = 0;
 543
 544         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 545         if (ret) {
 546                 btrfs_release_path(swarn->path);
 547                 goto err;
 548         }
 549
 550         eb = swarn->path->nodes[0];
 551         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 552                                         struct btrfs_inode_item);
 553         isize = btrfs_inode_size(eb, inode_item);
 554         nlink = btrfs_inode_nlink(eb, inode_item);
 555         btrfs_release_path(swarn->path);
 556
 557         ipath = init_ipath(4096, local_root, swarn->path);
 558         if (IS_ERR(ipath)) {
 559                 ret = PTR_ERR(ipath);
 560                 ipath = NULL;
 561                 goto err;
 562         }
 563         ret = paths_from_inode(inum, ipath);
 564
 565         if (ret < 0)
 566                 goto err;
 567
 568         /*
 569          * we deliberately ignore the bit ipath might have been too small to
 570          * hold all of the paths here
 571          */
 572         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 573                 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 574                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
 575                         "length %llu, links %u (path: %s)\n", swarn->errstr,
 576                         swarn->logical, rcu_str_deref(swarn->dev->name),
 577                         (unsigned long long)swarn->sector, root, inum, offset,
 578                         min(isize - offset, (u64)PAGE_SIZE), nlink,
 579                         (char *)(unsigned long)ipath->fspath->val[i]);
 580
 581         free_ipath(ipath);
 582         return 0;
 583
 584 err:
 585         printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 586                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 587                 "resolving failed with ret=%d\n", swarn->errstr,
 588                 swarn->logical, rcu_str_deref(swarn->dev->name),
 589                 (unsigned long long)swarn->sector, root, inum, offset, ret);
 590
 591         free_ipath(ipath);
 592         return 0;
 593 }
 594
 595 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 596 {
 597         struct btrfs_device *dev;
 598         struct btrfs_fs_info *fs_info;
 599         struct btrfs_path *path;
 600         struct btrfs_key found_key;
 601         struct extent_buffer *eb;
 602         struct btrfs_extent_item *ei;
 603         struct scrub_warning swarn;
 604         unsigned long ptr = 0;
 605         u64 extent_item_pos;
 606         u64 flags = 0;
 607         u64 ref_root;
 608         u32 item_size;
 609         u8 ref_level;
 610         int ret;
 611
 612         WARN_ON(sblock->page_count < 1);
 613         dev = sblock->pagev[0]->dev;
 614         fs_info = sblock->sctx->dev_root->fs_info;
 615
 616         path = btrfs_alloc_path();
 617         if (!path)
 618                 return;
 619
 620         swarn.sector = (sblock->pagev[0]->physical) >> 9;
 621         swarn.logical = sblock->pagev[0]->logical;
 622         swarn.errstr = errstr;
 623         swarn.dev = NULL;
 624
 625         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 626                                   &flags);
 627         if (ret < 0)
 628                 goto out;
 629
 630         extent_item_pos = swarn.logical - found_key.objectid;
 631         swarn.extent_item_size = found_key.offset;
 632
 633         eb = path->nodes[0];
 634         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 635         item_size = btrfs_item_size_nr(eb, path->slots[0]);
 636
 637         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 638                 do {
 639                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 640                                                       item_size, &ref_root,
 641                                                       &ref_level);
 642                         printk_in_rcu(KERN_WARNING
 643                                 "BTRFS: %s at logical %llu on dev %s, "
 644                                 "sector %llu: metadata %s (level %d) in tree "
 645                                 "%llu\n", errstr, swarn.logical,
 646                                 rcu_str_deref(dev->name),
 647                                 (unsigned long long)swarn.sector,
 648                                 ref_level ? "node" : "leaf",
 649                                 ret < 0 ? -1 : ref_level,
 650                                 ret < 0 ? -1 : ref_root);
 651                 } while (ret != 1);
 652                 btrfs_release_path(path);
 653         } else {
 654                 btrfs_release_path(path);
 655                 swarn.path = path;
 656                 swarn.dev = dev;
 657                 iterate_extent_inodes(fs_info, found_key.objectid,
 658                                         extent_item_pos, 1,
 659                                         scrub_print_warning_inode, &swarn);
 660         }
 661
 662 out:
 663         btrfs_free_path(path);
 664 }
 665
 666 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 667 {
 668         struct page *page = NULL;
 669         unsigned long index;
 670         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 671         int ret;
 672         int corrected = 0;
 673         struct btrfs_key key;
 674         struct inode *inode = NULL;
 675         struct btrfs_fs_info *fs_info;
 676         u64 end = offset + PAGE_SIZE - 1;
 677         struct btrfs_root *local_root;
 678         int srcu_index;
 679
 680         key.objectid = root;
 681         key.type = BTRFS_ROOT_ITEM_KEY;
 682         key.offset = (u64)-1;
 683
 684         fs_info = fixup->root->fs_info;
 685         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 686
 687         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 688         if (IS_ERR(local_root)) {
 689                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 690                 return PTR_ERR(local_root);
 691         }
 692
 693         key.type = BTRFS_INODE_ITEM_KEY;
 694         key.objectid = inum;
 695         key.offset = 0;
 696         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 697         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 698         if (IS_ERR(inode))
 699                 return PTR_ERR(inode);
 700
 701         index = offset >> PAGE_CACHE_SHIFT;
 702
 703         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 704         if (!page) {
 705                 ret = -ENOMEM;
 706                 goto out;
 707         }
 708
 709         if (PageUptodate(page)) {
 710                 if (PageDirty(page)) {
 711                         /*
 712                          * we need to write the data to the defect sector. the
 713                          * data that was in that sector is not in memory,
 714                          * because the page was modified. we must not write the
 715                          * modified page to that sector.
 716                          *
 717                          * TODO: what could be done here: wait for the delalloc
 718                          *       runner to write out that page (might involve
 719                          *       COW) and see whether the sector is still
 720                          *       referenced afterwards.
 721                          *
 722                          * For the meantime, we'll treat this error
 723                          * incorrectable, although there is a chance that a
 724                          * later scrub will find the bad sector again and that
 725                          * there's no dirty page in memory, then.
 726                          */
 727                         ret = -EIO;
 728                         goto out;
 729                 }
 730                 ret = repair_io_failure(inode, offset, PAGE_SIZE,
 731                                         fixup->logical, page,
 732                                         offset - page_offset(page),
 733                                         fixup->mirror_num);
 734                 unlock_page(page);
 735                 corrected = !ret;
 736         } else {
 737                 /*
 738                  * we need to get good data first. the general readpage path
 739                  * will call repair_io_failure for us, we just have to make
 740                  * sure we read the bad mirror.
 741                  */
 742                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 743                                         EXTENT_DAMAGED, GFP_NOFS);
 744                 if (ret) {
 745                         /* set_extent_bits should give proper error */
 746                         WARN_ON(ret > 0);
 747                         if (ret > 0)
 748                                 ret = -EFAULT;
 749                         goto out;
 750                 }
 751
 752                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 753                                                 btrfs_get_extent,
 754                                                 fixup->mirror_num);
 755                 wait_on_page_locked(page);
 756
 757                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 758                                                 end, EXTENT_DAMAGED, 0, NULL);
 759                 if (!corrected)
 760                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 761                                                 EXTENT_DAMAGED, GFP_NOFS);
 762         }
 763
 764 out:
 765         if (page)
 766                 put_page(page);
 767
 768         iput(inode);
 769
 770         if (ret < 0)
 771                 return ret;
 772
 773         if (ret == 0 && corrected) {
 774                 /*
 775                  * we only need to call readpage for one of the inodes belonging
 776                  * to this extent. so make iterate_extent_inodes stop
 777                  */
 778                 return 1;
 779         }
 780
 781         return -EIO;
 782 }
 783
 784 static void scrub_fixup_nodatasum(struct btrfs_work *work)
 785 {
 786         int ret;
 787         struct scrub_fixup_nodatasum *fixup;
 788         struct scrub_ctx *sctx;
 789         struct btrfs_trans_handle *trans = NULL;
 790         struct btrfs_path *path;
 791         int uncorrectable = 0;
 792
 793         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 794         sctx = fixup->sctx;
 795
 796         path = btrfs_alloc_path();
 797         if (!path) {
 798                 spin_lock(&sctx->stat_lock);
 799                 ++sctx->stat.malloc_errors;
 800                 spin_unlock(&sctx->stat_lock);
 801                 uncorrectable = 1;
 802                 goto out;
 803         }
 804
 805         trans = btrfs_join_transaction(fixup->root);
 806         if (IS_ERR(trans)) {
 807                 uncorrectable = 1;
 808                 goto out;
 809         }
 810
 811         /*
 812          * the idea is to trigger a regular read through the standard path. we
 813          * read a page from the (failed) logical address by specifying the
 814          * corresponding copynum of the failed sector. thus, that readpage is
 815          * expected to fail.
 816          * that is the point where on-the-fly error correction will kick in
 817          * (once it's finished) and rewrite the failed sector if a good copy
 818          * can be found.
 819          */
 820         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 821                                                 path, scrub_fixup_readpage,
 822                                                 fixup);
 823         if (ret < 0) {
 824                 uncorrectable = 1;
 825                 goto out;
 826         }
 827         WARN_ON(ret != 1);
 828
 829         spin_lock(&sctx->stat_lock);
 830         ++sctx->stat.corrected_errors;
 831         spin_unlock(&sctx->stat_lock);
 832
 833 out:
 834         if (trans && !IS_ERR(trans))
 835                 btrfs_end_transaction(trans, fixup->root);
 836         if (uncorrectable) {
 837                 spin_lock(&sctx->stat_lock);
 838                 ++sctx->stat.uncorrectable_errors;
 839                 spin_unlock(&sctx->stat_lock);
 840                 btrfs_dev_replace_stats_inc(
 841                         &sctx->dev_root->fs_info->dev_replace.
 842                         num_uncorrectable_read_errors);
 843                 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
 844                     "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 845                         fixup->logical, rcu_str_deref(fixup->dev->name));
 846         }
 847
 848         btrfs_free_path(path);
 849         kfree(fixup);
 850
 851         scrub_pending_trans_workers_dec(sctx);
 852 }
 853
 854 static inline void scrub_get_recover(struct scrub_recover *recover)
 855 {
 856         atomic_inc(&recover->refs);
 857 }
 858
 859 static inline void scrub_put_recover(struct scrub_recover *recover)
 860 {
 861         if (atomic_dec_and_test(&recover->refs)) {
 862                 btrfs_put_bbio(recover->bbio);
 863                 kfree(recover);
 864         }
 865 }
 866
 867 /*
 868  * scrub_handle_errored_block gets called when either verification of the
 869  * pages failed or the bio failed to read, e.g. with EIO. In the latter
 870  * case, this function handles all pages in the bio, even though only one
 871  * may be bad.
 872  * The goal of this function is to repair the errored block by using the
 873  * contents of one of the mirrors.
 874  */
 875 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 876 {
 877         struct scrub_ctx *sctx = sblock_to_check->sctx;
 878         struct btrfs_device *dev;
 879         struct btrfs_fs_info *fs_info;
 880         u64 length;
 881         u64 logical;
 882         u64 generation;
 883         unsigned int failed_mirror_index;
 884         unsigned int is_metadata;
 885         unsigned int have_csum;
 886         u8 *csum;
 887         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 888         struct scrub_block *sblock_bad;
 889         int ret;
 890         int mirror_index;
 891         int page_num;
 892         int success;
 893         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 894                                       DEFAULT_RATELIMIT_BURST);
 895
 896         BUG_ON(sblock_to_check->page_count < 1);
 897         fs_info = sctx->dev_root->fs_info;
 898         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 899                 /*
 900                  * if we find an error in a super block, we just report it.
 901                  * They will get written with the next transaction commit
 902                  * anyway
 903                  */
 904                 spin_lock(&sctx->stat_lock);
 905                 ++sctx->stat.super_errors;
 906                 spin_unlock(&sctx->stat_lock);
 907                 return 0;
 908         }
 909         length = sblock_to_check->page_count * PAGE_SIZE;
 910         logical = sblock_to_check->pagev[0]->logical;
 911         generation = sblock_to_check->pagev[0]->generation;
 912         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 913         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 914         is_metadata = !(sblock_to_check->pagev[0]->flags &
 915                         BTRFS_EXTENT_FLAG_DATA);
 916         have_csum = sblock_to_check->pagev[0]->have_csum;
 917         csum = sblock_to_check->pagev[0]->csum;
 918         dev = sblock_to_check->pagev[0]->dev;
 919
 920         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 921                 sblocks_for_recheck = NULL;
 922                 goto nodatasum_case;
 923         }
 924
 925         /*
 926          * read all mirrors one after the other. This includes to
 927          * re-read the extent or metadata block that failed (that was
 928          * the cause that this fixup code is called) another time,
 929          * page by page this time in order to know which pages
 930          * caused I/O errors and which ones are good (for all mirrors).
 931          * It is the goal to handle the situation when more than one
 932          * mirror contains I/O errors, but the errors do not
 933          * overlap, i.e. the data can be repaired by selecting the
 934          * pages from those mirrors without I/O error on the
 935          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 936          * would be that mirror #1 has an I/O error on the first page,
 937          * the second page is good, and mirror #2 has an I/O error on
 938          * the second page, but the first page is good.
 939          * Then the first page of the first mirror can be repaired by
 940          * taking the first page of the second mirror, and the
 941          * second page of the second mirror can be repaired by
 942          * copying the contents of the 2nd page of the 1st mirror.
 943          * One more note: if the pages of one mirror contain I/O
 944          * errors, the checksum cannot be verified. In order to get
 945          * the best data for repairing, the first attempt is to find
 946          * a mirror without I/O errors and with a validated checksum.
 947          * Only if this is not possible, the pages are picked from
 948          * mirrors with I/O errors without considering the checksum.
 949          * If the latter is the case, at the end, the checksum of the
 950          * repaired area is verified in order to correctly maintain
 951          * the statistics.
 952          */
 953
 954         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
 955                                       sizeof(*sblocks_for_recheck), GFP_NOFS);
 956         if (!sblocks_for_recheck) {
 957                 spin_lock(&sctx->stat_lock);
 958                 sctx->stat.malloc_errors++;
 959                 sctx->stat.read_errors++;
 960                 sctx->stat.uncorrectable_errors++;
 961                 spin_unlock(&sctx->stat_lock);
 962                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 963                 goto out;
 964         }
 965
 966         /* setup the context, map the logical blocks and alloc the pages */
 967         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
 968         if (ret) {
 969                 spin_lock(&sctx->stat_lock);
 970                 sctx->stat.read_errors++;
 971                 sctx->stat.uncorrectable_errors++;
 972                 spin_unlock(&sctx->stat_lock);
 973                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 974                 goto out;
 975         }
 976         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 977         sblock_bad = sblocks_for_recheck + failed_mirror_index;
 978
 979         /* build and submit the bios for the failed mirror, check checksums */
 980         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 981                             csum, generation, sctx->csum_size, 1);
 982
 983         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 984             sblock_bad->no_io_error_seen) {
 985                 /*
 986                  * the error disappeared after reading page by page, or
 987                  * the area was part of a huge bio and other parts of the
 988                  * bio caused I/O errors, or the block layer merged several
 989                  * read requests into one and the error is caused by a
 990                  * different bio (usually one of the two latter cases is
 991                  * the cause)
 992                  */
 993                 spin_lock(&sctx->stat_lock);
 994                 sctx->stat.unverified_errors++;
 995                 sblock_to_check->data_corrected = 1;
 996                 spin_unlock(&sctx->stat_lock);
 997
 998                 if (sctx->is_dev_replace)
 999                         scrub_write_block_to_dev_replace(sblock_bad);
1000                 goto out;
1001         }
1002
1003         if (!sblock_bad->no_io_error_seen) {
1004                 spin_lock(&sctx->stat_lock);
1005                 sctx->stat.read_errors++;
1006                 spin_unlock(&sctx->stat_lock);
1007                 if (__ratelimit(&_rs))
1008                         scrub_print_warning("i/o error", sblock_to_check);
1009                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1010         } else if (sblock_bad->checksum_error) {
1011                 spin_lock(&sctx->stat_lock);
1012                 sctx->stat.csum_errors++;
1013                 spin_unlock(&sctx->stat_lock);
1014                 if (__ratelimit(&_rs))
1015                         scrub_print_warning("checksum error", sblock_to_check);
1016                 btrfs_dev_stat_inc_and_print(dev,
1017                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1018         } else if (sblock_bad->header_error) {
1019                 spin_lock(&sctx->stat_lock);
1020                 sctx->stat.verify_errors++;
1021                 spin_unlock(&sctx->stat_lock);
1022                 if (__ratelimit(&_rs))
1023                         scrub_print_warning("checksum/header error",
1024                                             sblock_to_check);
1025                 if (sblock_bad->generation_error)
1026                         btrfs_dev_stat_inc_and_print(dev,
1027                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1028                 else
1029                         btrfs_dev_stat_inc_and_print(dev,
1030                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1031         }
1032
1033         if (sctx->readonly) {
1034                 ASSERT(!sctx->is_dev_replace);
1035                 goto out;
1036         }
1037
1038         if (!is_metadata && !have_csum) {
1039                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1040
1041                 WARN_ON(sctx->is_dev_replace);
1042
1043 nodatasum_case:
1044
1045                 /*
1046                  * !is_metadata and !have_csum, this means that the data
1047                  * might not be COW'ed, that it might be modified
1048                  * concurrently. The general strategy to work on the
1049                  * commit root does not help in the case when COW is not
1050                  * used.
1051                  */
1052                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1053                 if (!fixup_nodatasum)
1054                         goto did_not_correct_error;
1055                 fixup_nodatasum->sctx = sctx;
1056                 fixup_nodatasum->dev = dev;
1057                 fixup_nodatasum->logical = logical;
1058                 fixup_nodatasum->root = fs_info->extent_root;
1059                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1060                 scrub_pending_trans_workers_inc(sctx);
1061                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1062                                 scrub_fixup_nodatasum, NULL, NULL);
1063                 btrfs_queue_work(fs_info->scrub_workers,
1064                                  &fixup_nodatasum->work);
1065                 goto out;
1066         }
1067
1068         /*
1069          * now build and submit the bios for the other mirrors, check
1070          * checksums.
1071          * First try to pick the mirror which is completely without I/O
1072          * errors and also does not have a checksum error.
1073          * If one is found, and if a checksum is present, the full block
1074          * that is known to contain an error is rewritten. Afterwards
1075          * the block is known to be corrected.
1076          * If a mirror is found which is completely correct, and no
1077          * checksum is present, only those pages are rewritten that had
1078          * an I/O error in the block to be repaired, since it cannot be
1079          * determined, which copy of the other pages is better (and it
1080          * could happen otherwise that a correct page would be
1081          * overwritten by a bad one).
1082          */
1083         for (mirror_index = 0;
1084              mirror_index < BTRFS_MAX_MIRRORS &&
1085              sblocks_for_recheck[mirror_index].page_count > 0;
1086              mirror_index++) {
1087                 struct scrub_block *sblock_other;
1088
1089                 if (mirror_index == failed_mirror_index)
1090                         continue;
1091                 sblock_other = sblocks_for_recheck + mirror_index;
1092
1093                 /* build and submit the bios, check checksums */
1094                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1095                                     have_csum, csum, generation,
1096                                     sctx->csum_size, 0);
1097
1098                 if (!sblock_other->header_error &&
1099                     !sblock_other->checksum_error &&
1100                     sblock_other->no_io_error_seen) {
1101                         if (sctx->is_dev_replace) {
1102                                 scrub_write_block_to_dev_replace(sblock_other);
1103                                 goto corrected_error;
1104                         } else {
1105                                 ret = scrub_repair_block_from_good_copy(
1106                                                 sblock_bad, sblock_other);
1107                                 if (!ret)
1108                                         goto corrected_error;
1109                         }
1110                 }
1111         }
1112
1113         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1114                 goto did_not_correct_error;
1115
1116         /*
1117          * In case of I/O errors in the area that is supposed to be
1118          * repaired, continue by picking good copies of those pages.
1119          * Select the good pages from mirrors to rewrite bad pages from
1120          * the area to fix. Afterwards verify the checksum of the block
1121          * that is supposed to be repaired. This verification step is
1122          * only done for the purpose of statistic counting and for the
1123          * final scrub report, whether errors remain.
1124          * A perfect algorithm could make use of the checksum and try
1125          * all possible combinations of pages from the different mirrors
1126          * until the checksum verification succeeds. For example, when
1127          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1128          * of mirror #2 is readable but the final checksum test fails,
1129          * then the 2nd page of mirror #3 could be tried, whether now
1130          * the final checksum succeedes. But this would be a rare
1131          * exception and is therefore not implemented. At least it is
1132          * avoided that the good copy is overwritten.
1133          * A more useful improvement would be to pick the sectors
1134          * without I/O error based on sector sizes (512 bytes on legacy
1135          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1136          * mirror could be repaired by taking 512 byte of a different
1137          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1138          * area are unreadable.
1139          */
1140         success = 1;
1141         for (page_num = 0; page_num < sblock_bad->page_count;
1142              page_num++) {
1143                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1144                 struct scrub_block *sblock_other = NULL;
1145
1146                 /* skip no-io-error page in scrub */
1147                 if (!page_bad->io_error && !sctx->is_dev_replace)
1148                         continue;
1149
1150                 /* try to find no-io-error page in mirrors */
1151                 if (page_bad->io_error) {
1152                         for (mirror_index = 0;
1153                              mirror_index < BTRFS_MAX_MIRRORS &&
1154                              sblocks_for_recheck[mirror_index].page_count > 0;
1155                              mirror_index++) {
1156                                 if (!sblocks_for_recheck[mirror_index].
1157                                     pagev[page_num]->io_error) {
1158                                         sblock_other = sblocks_for_recheck +
1159                                                        mirror_index;
1160                                         break;
1161                                 }
1162                         }
1163                         if (!sblock_other)
1164                                 success = 0;
1165                 }
1166
1167                 if (sctx->is_dev_replace) {
1168                         /*
1169                          * did not find a mirror to fetch the page
1170                          * from. scrub_write_page_to_dev_replace()
1171                          * handles this case (page->io_error), by
1172                          * filling the block with zeros before
1173                          * submitting the write request
1174                          */
1175                         if (!sblock_other)
1176                                 sblock_other = sblock_bad;
1177
1178                         if (scrub_write_page_to_dev_replace(sblock_other,
1179                                                             page_num) != 0) {
1180                                 btrfs_dev_replace_stats_inc(
1181                                         &sctx->dev_root->
1182                                         fs_info->dev_replace.
1183                                         num_write_errors);
1184                                 success = 0;
1185                         }
1186                 } else if (sblock_other) {
1187                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1188                                                                sblock_other,
1189                                                                page_num, 0);
1190                         if (0 == ret)
1191                                 page_bad->io_error = 0;
1192                         else
1193                                 success = 0;
1194                 }
1195         }
1196
1197         if (success && !sctx->is_dev_replace) {
1198                 if (is_metadata || have_csum) {
1199                         /*
1200                          * need to verify the checksum now that all
1201                          * sectors on disk are repaired (the write
1202                          * request for data to be repaired is on its way).
1203                          * Just be lazy and use scrub_recheck_block()
1204                          * which re-reads the data before the checksum
1205                          * is verified, but most likely the data comes out
1206                          * of the page cache.
1207                          */
1208                         scrub_recheck_block(fs_info, sblock_bad,
1209                                             is_metadata, have_csum, csum,
1210                                             generation, sctx->csum_size, 1);
1211                         if (!sblock_bad->header_error &&
1212                             !sblock_bad->checksum_error &&
1213                             sblock_bad->no_io_error_seen)
1214                                 goto corrected_error;
1215                         else
1216                                 goto did_not_correct_error;
1217                 } else {
1218 corrected_error:
1219                         spin_lock(&sctx->stat_lock);
1220                         sctx->stat.corrected_errors++;
1221                         sblock_to_check->data_corrected = 1;
1222                         spin_unlock(&sctx->stat_lock);
1223                         printk_ratelimited_in_rcu(KERN_ERR
1224                                 "BTRFS: fixed up error at logical %llu on dev %s\n",
1225                                 logical, rcu_str_deref(dev->name));
1226                 }
1227         } else {
1228 did_not_correct_error:
1229                 spin_lock(&sctx->stat_lock);
1230                 sctx->stat.uncorrectable_errors++;
1231                 spin_unlock(&sctx->stat_lock);
1232                 printk_ratelimited_in_rcu(KERN_ERR
1233                         "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1234                         logical, rcu_str_deref(dev->name));
1235         }
1236
1237 out:
1238         if (sblocks_for_recheck) {
1239                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1240                      mirror_index++) {
1241                         struct scrub_block *sblock = sblocks_for_recheck +
1242                                                      mirror_index;
1243                         struct scrub_recover *recover;
1244                         int page_index;
1245
1246                         for (page_index = 0; page_index < sblock->page_count;
1247                              page_index++) {
1248                                 sblock->pagev[page_index]->sblock = NULL;
1249                                 recover = sblock->pagev[page_index]->recover;
1250                                 if (recover) {
1251                                         scrub_put_recover(recover);
1252                                         sblock->pagev[page_index]->recover =
1253                                                                         NULL;
1254                                 }
1255                                 scrub_page_put(sblock->pagev[page_index]);
1256                         }
1257                 }
1258                 kfree(sblocks_for_recheck);
1259         }
1260
1261         return 0;
1262 }
1263
1264 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1265 {
1266         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1267                 return 2;
1268         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1269                 return 3;
1270         else
1271                 return (int)bbio->num_stripes;
1272 }
1273
1274 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1275                                                  u64 *raid_map,
1276                                                  u64 mapped_length,
1277                                                  int nstripes, int mirror,
1278                                                  int *stripe_index,
1279                                                  u64 *stripe_offset)
1280 {
1281         int i;
1282
1283         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1284                 /* RAID5/6 */
1285                 for (i = 0; i < nstripes; i++) {
1286                         if (raid_map[i] == RAID6_Q_STRIPE ||
1287                             raid_map[i] == RAID5_P_STRIPE)
1288                                 continue;
1289
1290                         if (logical >= raid_map[i] &&
1291                             logical < raid_map[i] + mapped_length)
1292                                 break;
1293                 }
1294
1295                 *stripe_index = i;
1296                 *stripe_offset = logical - raid_map[i];
1297         } else {
1298                 /* The other RAID type */
1299                 *stripe_index = mirror;
1300                 *stripe_offset = 0;
1301         }
1302 }
1303
1304 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1305                                      struct scrub_block *sblocks_for_recheck)
1306 {
1307         struct scrub_ctx *sctx = original_sblock->sctx;
1308         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1309         u64 length = original_sblock->page_count * PAGE_SIZE;
1310         u64 logical = original_sblock->pagev[0]->logical;
1311         struct scrub_recover *recover;
1312         struct btrfs_bio *bbio;
1313         u64 sublen;
1314         u64 mapped_length;
1315         u64 stripe_offset;
1316         int stripe_index;
1317         int page_index = 0;
1318         int mirror_index;
1319         int nmirrors;
1320         int ret;
1321
1322         /*
1323          * note: the two members refs and outstanding_pages
1324          * are not used (and not set) in the blocks that are used for
1325          * the recheck procedure
1326          */
1327
1328         while (length > 0) {
1329                 sublen = min_t(u64, length, PAGE_SIZE);
1330                 mapped_length = sublen;
1331                 bbio = NULL;
1332
1333                 /*
1334                  * with a length of PAGE_SIZE, each returned stripe
1335                  * represents one mirror
1336                  */
1337                 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1338                                        &mapped_length, &bbio, 0, 1);
1339                 if (ret || !bbio || mapped_length < sublen) {
1340                         btrfs_put_bbio(bbio);
1341                         return -EIO;
1342                 }
1343
1344                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1345                 if (!recover) {
1346                         btrfs_put_bbio(bbio);
1347                         return -ENOMEM;
1348                 }
1349
1350                 atomic_set(&recover->refs, 1);
1351                 recover->bbio = bbio;
1352                 recover->map_length = mapped_length;
1353
1354                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1355
1356                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1357
1358                 for (mirror_index = 0; mirror_index < nmirrors;
1359                      mirror_index++) {
1360                         struct scrub_block *sblock;
1361                         struct scrub_page *page;
1362
1363                         sblock = sblocks_for_recheck + mirror_index;
1364                         sblock->sctx = sctx;
1365                         page = kzalloc(sizeof(*page), GFP_NOFS);
1366                         if (!page) {
1367 leave_nomem:
1368                                 spin_lock(&sctx->stat_lock);
1369                                 sctx->stat.malloc_errors++;
1370                                 spin_unlock(&sctx->stat_lock);
1371                                 scrub_put_recover(recover);
1372                                 return -ENOMEM;
1373                         }
1374                         scrub_page_get(page);
1375                         sblock->pagev[page_index] = page;
1376                         page->logical = logical;
1377
1378                         scrub_stripe_index_and_offset(logical,
1379                                                       bbio->map_type,
1380                                                       bbio->raid_map,
1381                                                       mapped_length,
1382                                                       bbio->num_stripes -
1383                                                       bbio->num_tgtdevs,
1384                                                       mirror_index,
1385                                                       &stripe_index,
1386                                                       &stripe_offset);
1387                         page->physical = bbio->stripes[stripe_index].physical +
1388                                          stripe_offset;
1389                         page->dev = bbio->stripes[stripe_index].dev;
1390
1391                         BUG_ON(page_index >= original_sblock->page_count);
1392                         page->physical_for_dev_replace =
1393                                 original_sblock->pagev[page_index]->
1394                                 physical_for_dev_replace;
1395                         /* for missing devices, dev->bdev is NULL */
1396                         page->mirror_num = mirror_index + 1;
1397                         sblock->page_count++;
1398                         page->page = alloc_page(GFP_NOFS);
1399                         if (!page->page)
1400                                 goto leave_nomem;
1401
1402                         scrub_get_recover(recover);
1403                         page->recover = recover;
1404                 }
1405                 scrub_put_recover(recover);
1406                 length -= sublen;
1407                 logical += sublen;
1408                 page_index++;
1409         }
1410
1411         return 0;
1412 }
1413
1414 struct scrub_bio_ret {
1415         struct completion event;
1416         int error;
1417 };
1418
1419 static void scrub_bio_wait_endio(struct bio *bio)
1420 {
1421         struct scrub_bio_ret *ret = bio->bi_private;
1422
1423         ret->error = bio->bi_error;
1424         complete(&ret->event);
1425 }
1426
1427 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1428 {
1429         return page->recover &&
1430                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1431 }
1432
1433 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1434                                         struct bio *bio,
1435                                         struct scrub_page *page)
1436 {
1437         struct scrub_bio_ret done;
1438         int ret;
1439
1440         init_completion(&done.event);
1441         done.error = 0;
1442         bio->bi_iter.bi_sector = page->logical >> 9;
1443         bio->bi_private = &done;
1444         bio->bi_end_io = scrub_bio_wait_endio;
1445
1446         ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1447                                     page->recover->map_length,
1448                                     page->mirror_num, 0);
1449         if (ret)
1450                 return ret;
1451
1452         wait_for_completion(&done.event);
1453         if (done.error)
1454                 return -EIO;
1455
1456         return 0;
1457 }
1458
1459 /*
1460  * this function will check the on disk data for checksum errors, header
1461  * errors and read I/O errors. If any I/O errors happen, the exact pages
1462  * which are errored are marked as being bad. The goal is to enable scrub
1463  * to take those pages that are not errored from all the mirrors so that
1464  * the pages that are errored in the just handled mirror can be repaired.
1465  */
1466 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1467                                 struct scrub_block *sblock, int is_metadata,
1468                                 int have_csum, u8 *csum, u64 generation,
1469                                 u16 csum_size, int retry_failed_mirror)
1470 {
1471         int page_num;
1472
1473         sblock->no_io_error_seen = 1;
1474         sblock->header_error = 0;
1475         sblock->checksum_error = 0;
1476
1477         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1478                 struct bio *bio;
1479                 struct scrub_page *page = sblock->pagev[page_num];
1480
1481                 if (page->dev->bdev == NULL) {
1482                         page->io_error = 1;
1483                         sblock->no_io_error_seen = 0;
1484                         continue;
1485                 }
1486
1487                 WARN_ON(!page->page);
1488                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1489                 if (!bio) {
1490                         page->io_error = 1;
1491                         sblock->no_io_error_seen = 0;
1492                         continue;
1493                 }
1494                 bio->bi_bdev = page->dev->bdev;
1495
1496                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1497                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1498                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1499                                 sblock->no_io_error_seen = 0;
1500                 } else {
1501                         bio->bi_iter.bi_sector = page->physical >> 9;
1502
1503                         if (btrfsic_submit_bio_wait(READ, bio))
1504                                 sblock->no_io_error_seen = 0;
1505                 }
1506
1507                 bio_put(bio);
1508         }
1509
1510         if (sblock->no_io_error_seen)
1511                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1512                                              have_csum, csum, generation,
1513                                              csum_size);
1514
1515         return;
1516 }
1517
1518 static inline int scrub_check_fsid(u8 fsid[],
1519                                    struct scrub_page *spage)
1520 {
1521         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1522         int ret;
1523
1524         ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1525         return !ret;
1526 }
1527
1528 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1529                                          struct scrub_block *sblock,
1530                                          int is_metadata, int have_csum,
1531                                          const u8 *csum, u64 generation,
1532                                          u16 csum_size)
1533 {
1534         int page_num;
1535         u8 calculated_csum[BTRFS_CSUM_SIZE];
1536         u32 crc = ~(u32)0;
1537         void *mapped_buffer;
1538
1539         WARN_ON(!sblock->pagev[0]->page);
1540         if (is_metadata) {
1541                 struct btrfs_header *h;
1542
1543                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1544                 h = (struct btrfs_header *)mapped_buffer;
1545
1546                 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1547                     !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1548                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1549                            BTRFS_UUID_SIZE)) {
1550                         sblock->header_error = 1;
1551                 } else if (generation != btrfs_stack_header_generation(h)) {
1552                         sblock->header_error = 1;
1553                         sblock->generation_error = 1;
1554                 }
1555                 csum = h->csum;
1556         } else {
1557                 if (!have_csum)
1558                         return;
1559
1560                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1561         }
1562
1563         for (page_num = 0;;) {
1564                 if (page_num == 0 && is_metadata)
1565                         crc = btrfs_csum_data(
1566                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1567                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1568                 else
1569                         crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1570
1571                 kunmap_atomic(mapped_buffer);
1572                 page_num++;
1573                 if (page_num >= sblock->page_count)
1574                         break;
1575                 WARN_ON(!sblock->pagev[page_num]->page);
1576
1577                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1578         }
1579
1580         btrfs_csum_final(crc, calculated_csum);
1581         if (memcmp(calculated_csum, csum, csum_size))
1582                 sblock->checksum_error = 1;
1583 }
1584
1585 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1586                                              struct scrub_block *sblock_good)
1587 {
1588         int page_num;
1589         int ret = 0;
1590
1591         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1592                 int ret_sub;
1593
1594                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1595                                                            sblock_good,
1596                                                            page_num, 1);
1597                 if (ret_sub)
1598                         ret = ret_sub;
1599         }
1600
1601         return ret;
1602 }
1603
1604 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1605                                             struct scrub_block *sblock_good,
1606                                             int page_num, int force_write)
1607 {
1608         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1609         struct scrub_page *page_good = sblock_good->pagev[page_num];
1610
1611         BUG_ON(page_bad->page == NULL);
1612         BUG_ON(page_good->page == NULL);
1613         if (force_write || sblock_bad->header_error ||
1614             sblock_bad->checksum_error || page_bad->io_error) {
1615                 struct bio *bio;
1616                 int ret;
1617
1618                 if (!page_bad->dev->bdev) {
1619                         printk_ratelimited(KERN_WARNING "BTRFS: "
1620                                 "scrub_repair_page_from_good_copy(bdev == NULL) "
1621                                 "is unexpected!\n");
1622                         return -EIO;
1623                 }
1624
1625                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1626                 if (!bio)
1627                         return -EIO;
1628                 bio->bi_bdev = page_bad->dev->bdev;
1629                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1630
1631                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1632                 if (PAGE_SIZE != ret) {
1633                         bio_put(bio);
1634                         return -EIO;
1635                 }
1636
1637                 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1638                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1639                                 BTRFS_DEV_STAT_WRITE_ERRS);
1640                         btrfs_dev_replace_stats_inc(
1641                                 &sblock_bad->sctx->dev_root->fs_info->
1642                                 dev_replace.num_write_errors);
1643                         bio_put(bio);
1644                         return -EIO;
1645                 }
1646                 bio_put(bio);
1647         }
1648
1649         return 0;
1650 }
1651
1652 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1653 {
1654         int page_num;
1655
1656         /*
1657          * This block is used for the check of the parity on the source device,
1658          * so the data needn't be written into the destination device.
1659          */
1660         if (sblock->sparity)
1661                 return;
1662
1663         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1664                 int ret;
1665
1666                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1667                 if (ret)
1668                         btrfs_dev_replace_stats_inc(
1669                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1670                                 num_write_errors);
1671         }
1672 }
1673
1674 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1675                                            int page_num)
1676 {
1677         struct scrub_page *spage = sblock->pagev[page_num];
1678
1679         BUG_ON(spage->page == NULL);
1680         if (spage->io_error) {
1681                 void *mapped_buffer = kmap_atomic(spage->page);
1682
1683                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1684                 flush_dcache_page(spage->page);
1685                 kunmap_atomic(mapped_buffer);
1686         }
1687         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1688 }
1689
1690 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1691                                     struct scrub_page *spage)
1692 {
1693         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1694         struct scrub_bio *sbio;
1695         int ret;
1696
1697         mutex_lock(&wr_ctx->wr_lock);
1698 again:
1699         if (!wr_ctx->wr_curr_bio) {
1700                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1701                                               GFP_NOFS);
1702                 if (!wr_ctx->wr_curr_bio) {
1703                         mutex_unlock(&wr_ctx->wr_lock);
1704                         return -ENOMEM;
1705                 }
1706                 wr_ctx->wr_curr_bio->sctx = sctx;
1707                 wr_ctx->wr_curr_bio->page_count = 0;
1708         }
1709         sbio = wr_ctx->wr_curr_bio;
1710         if (sbio->page_count == 0) {
1711                 struct bio *bio;
1712
1713                 sbio->physical = spage->physical_for_dev_replace;
1714                 sbio->logical = spage->logical;
1715                 sbio->dev = wr_ctx->tgtdev;
1716                 bio = sbio->bio;
1717                 if (!bio) {
1718                         bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1719                         if (!bio) {
1720                                 mutex_unlock(&wr_ctx->wr_lock);
1721                                 return -ENOMEM;
1722                         }
1723                         sbio->bio = bio;
1724                 }
1725
1726                 bio->bi_private = sbio;
1727                 bio->bi_end_io = scrub_wr_bio_end_io;
1728                 bio->bi_bdev = sbio->dev->bdev;
1729                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1730                 sbio->err = 0;
1731         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1732                    spage->physical_for_dev_replace ||
1733                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1734                    spage->logical) {
1735                 scrub_wr_submit(sctx);
1736                 goto again;
1737         }
1738
1739         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1740         if (ret != PAGE_SIZE) {
1741                 if (sbio->page_count < 1) {
1742                         bio_put(sbio->bio);
1743                         sbio->bio = NULL;
1744                         mutex_unlock(&wr_ctx->wr_lock);
1745                         return -EIO;
1746                 }
1747                 scrub_wr_submit(sctx);
1748                 goto again;
1749         }
1750
1751         sbio->pagev[sbio->page_count] = spage;
1752         scrub_page_get(spage);
1753         sbio->page_count++;
1754         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1755                 scrub_wr_submit(sctx);
1756         mutex_unlock(&wr_ctx->wr_lock);
1757
1758         return 0;
1759 }
1760
1761 static void scrub_wr_submit(struct scrub_ctx *sctx)
1762 {
1763         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1764         struct scrub_bio *sbio;
1765
1766         if (!wr_ctx->wr_curr_bio)
1767                 return;
1768
1769         sbio = wr_ctx->wr_curr_bio;
1770         wr_ctx->wr_curr_bio = NULL;
1771         WARN_ON(!sbio->bio->bi_bdev);
1772         scrub_pending_bio_inc(sctx);
1773         /* process all writes in a single worker thread. Then the block layer
1774          * orders the requests before sending them to the driver which
1775          * doubled the write performance on spinning disks when measured
1776          * with Linux 3.5 */
1777         btrfsic_submit_bio(WRITE, sbio->bio);
1778 }
1779
1780 static void scrub_wr_bio_end_io(struct bio *bio)
1781 {
1782         struct scrub_bio *sbio = bio->bi_private;
1783         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1784
1785         sbio->err = bio->bi_error;
1786         sbio->bio = bio;
1787
1788         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1789                          scrub_wr_bio_end_io_worker, NULL, NULL);
1790         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1791 }
1792
1793 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1794 {
1795         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1796         struct scrub_ctx *sctx = sbio->sctx;
1797         int i;
1798
1799         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1800         if (sbio->err) {
1801                 struct btrfs_dev_replace *dev_replace =
1802                         &sbio->sctx->dev_root->fs_info->dev_replace;
1803
1804                 for (i = 0; i < sbio->page_count; i++) {
1805                         struct scrub_page *spage = sbio->pagev[i];
1806
1807                         spage->io_error = 1;
1808                         btrfs_dev_replace_stats_inc(&dev_replace->
1809                                                     num_write_errors);
1810                 }
1811         }
1812
1813         for (i = 0; i < sbio->page_count; i++)
1814                 scrub_page_put(sbio->pagev[i]);
1815
1816         bio_put(sbio->bio);
1817         kfree(sbio);
1818         scrub_pending_bio_dec(sctx);
1819 }
1820
1821 static int scrub_checksum(struct scrub_block *sblock)
1822 {
1823         u64 flags;
1824         int ret;
1825
1826         WARN_ON(sblock->page_count < 1);
1827         flags = sblock->pagev[0]->flags;
1828         ret = 0;
1829         if (flags & BTRFS_EXTENT_FLAG_DATA)
1830                 ret = scrub_checksum_data(sblock);
1831         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1832                 ret = scrub_checksum_tree_block(sblock);
1833         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1834                 (void)scrub_checksum_super(sblock);
1835         else
1836                 WARN_ON(1);
1837         if (ret)
1838                 scrub_handle_errored_block(sblock);
1839
1840         return ret;
1841 }
1842
1843 static int scrub_checksum_data(struct scrub_block *sblock)
1844 {
1845         struct scrub_ctx *sctx = sblock->sctx;
1846         u8 csum[BTRFS_CSUM_SIZE];
1847         u8 *on_disk_csum;
1848         struct page *page;
1849         void *buffer;
1850         u32 crc = ~(u32)0;
1851         int fail = 0;
1852         u64 len;
1853         int index;
1854
1855         BUG_ON(sblock->page_count < 1);
1856         if (!sblock->pagev[0]->have_csum)
1857                 return 0;
1858
1859         on_disk_csum = sblock->pagev[0]->csum;
1860         page = sblock->pagev[0]->page;
1861         buffer = kmap_atomic(page);
1862
1863         len = sctx->sectorsize;
1864         index = 0;
1865         for (;;) {
1866                 u64 l = min_t(u64, len, PAGE_SIZE);
1867
1868                 crc = btrfs_csum_data(buffer, crc, l);
1869                 kunmap_atomic(buffer);
1870                 len -= l;
1871                 if (len == 0)
1872                         break;
1873                 index++;
1874                 BUG_ON(index >= sblock->page_count);
1875                 BUG_ON(!sblock->pagev[index]->page);
1876                 page = sblock->pagev[index]->page;
1877                 buffer = kmap_atomic(page);
1878         }
1879
1880         btrfs_csum_final(crc, csum);
1881         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1882                 fail = 1;
1883
1884         return fail;
1885 }
1886
1887 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1888 {
1889         struct scrub_ctx *sctx = sblock->sctx;
1890         struct btrfs_header *h;
1891         struct btrfs_root *root = sctx->dev_root;
1892         struct btrfs_fs_info *fs_info = root->fs_info;
1893         u8 calculated_csum[BTRFS_CSUM_SIZE];
1894         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1895         struct page *page;
1896         void *mapped_buffer;
1897         u64 mapped_size;
1898         void *p;
1899         u32 crc = ~(u32)0;
1900         int fail = 0;
1901         int crc_fail = 0;
1902         u64 len;
1903         int index;
1904
1905         BUG_ON(sblock->page_count < 1);
1906         page = sblock->pagev[0]->page;
1907         mapped_buffer = kmap_atomic(page);
1908         h = (struct btrfs_header *)mapped_buffer;
1909         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1910
1911         /*
1912          * we don't use the getter functions here, as we
1913          * a) don't have an extent buffer and
1914          * b) the page is already kmapped
1915          */
1916
1917         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1918                 ++fail;
1919
1920         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1921                 ++fail;
1922
1923         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1924                 ++fail;
1925
1926         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1927                    BTRFS_UUID_SIZE))
1928                 ++fail;
1929
1930         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1931         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1932         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1933         index = 0;
1934         for (;;) {
1935                 u64 l = min_t(u64, len, mapped_size);
1936
1937                 crc = btrfs_csum_data(p, crc, l);
1938                 kunmap_atomic(mapped_buffer);
1939                 len -= l;
1940                 if (len == 0)
1941                         break;
1942                 index++;
1943                 BUG_ON(index >= sblock->page_count);
1944                 BUG_ON(!sblock->pagev[index]->page);
1945                 page = sblock->pagev[index]->page;
1946                 mapped_buffer = kmap_atomic(page);
1947                 mapped_size = PAGE_SIZE;
1948                 p = mapped_buffer;
1949         }
1950
1951         btrfs_csum_final(crc, calculated_csum);
1952         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1953                 ++crc_fail;
1954
1955         return fail || crc_fail;
1956 }
1957
1958 static int scrub_checksum_super(struct scrub_block *sblock)
1959 {
1960         struct btrfs_super_block *s;
1961         struct scrub_ctx *sctx = sblock->sctx;
1962         u8 calculated_csum[BTRFS_CSUM_SIZE];
1963         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1964         struct page *page;
1965         void *mapped_buffer;
1966         u64 mapped_size;
1967         void *p;
1968         u32 crc = ~(u32)0;
1969         int fail_gen = 0;
1970         int fail_cor = 0;
1971         u64 len;
1972         int index;
1973
1974         BUG_ON(sblock->page_count < 1);
1975         page = sblock->pagev[0]->page;
1976         mapped_buffer = kmap_atomic(page);
1977         s = (struct btrfs_super_block *)mapped_buffer;
1978         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1979
1980         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1981                 ++fail_cor;
1982
1983         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1984                 ++fail_gen;
1985
1986         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1987                 ++fail_cor;
1988
1989         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1990         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1991         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1992         index = 0;
1993         for (;;) {
1994                 u64 l = min_t(u64, len, mapped_size);
1995
1996                 crc = btrfs_csum_data(p, crc, l);
1997                 kunmap_atomic(mapped_buffer);
1998                 len -= l;
1999                 if (len == 0)
2000                         break;
2001                 index++;
2002                 BUG_ON(index >= sblock->page_count);
2003                 BUG_ON(!sblock->pagev[index]->page);
2004                 page = sblock->pagev[index]->page;
2005                 mapped_buffer = kmap_atomic(page);
2006                 mapped_size = PAGE_SIZE;
2007                 p = mapped_buffer;
2008         }
2009
2010         btrfs_csum_final(crc, calculated_csum);
2011         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2012                 ++fail_cor;
2013
2014         if (fail_cor + fail_gen) {
2015                 /*
2016                  * if we find an error in a super block, we just report it.
2017                  * They will get written with the next transaction commit
2018                  * anyway
2019                  */
2020                 spin_lock(&sctx->stat_lock);
2021                 ++sctx->stat.super_errors;
2022                 spin_unlock(&sctx->stat_lock);
2023                 if (fail_cor)
2024                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2025                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2026                 else
2027                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2028                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2029         }
2030
2031         return fail_cor + fail_gen;
2032 }
2033
2034 static void scrub_block_get(struct scrub_block *sblock)
2035 {
2036         atomic_inc(&sblock->refs);
2037 }
2038
2039 static void scrub_block_put(struct scrub_block *sblock)
2040 {
2041         if (atomic_dec_and_test(&sblock->refs)) {
2042                 int i;
2043
2044                 if (sblock->sparity)
2045                         scrub_parity_put(sblock->sparity);
2046
2047                 for (i = 0; i < sblock->page_count; i++)
2048                         scrub_page_put(sblock->pagev[i]);
2049                 kfree(sblock);
2050         }
2051 }
2052
2053 static void scrub_page_get(struct scrub_page *spage)
2054 {
2055         atomic_inc(&spage->refs);
2056 }
2057
2058 static void scrub_page_put(struct scrub_page *spage)
2059 {
2060         if (atomic_dec_and_test(&spage->refs)) {
2061                 if (spage->page)
2062                         __free_page(spage->page);
2063                 kfree(spage);
2064         }
2065 }
2066
2067 static void scrub_submit(struct scrub_ctx *sctx)
2068 {
2069         struct scrub_bio *sbio;
2070
2071         if (sctx->curr == -1)
2072                 return;
2073
2074         sbio = sctx->bios[sctx->curr];
2075         sctx->curr = -1;
2076         scrub_pending_bio_inc(sctx);
2077
2078         if (!sbio->bio->bi_bdev) {
2079                 /*
2080                  * this case should not happen. If btrfs_map_block() is
2081                  * wrong, it could happen for dev-replace operations on
2082                  * missing devices when no mirrors are available, but in
2083                  * this case it should already fail the mount.
2084                  * This case is handled correctly (but _very_ slowly).
2085                  */
2086                 printk_ratelimited(KERN_WARNING
2087                         "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2088                 bio_io_error(sbio->bio);
2089         } else {
2090                 btrfsic_submit_bio(READ, sbio->bio);
2091         }
2092 }
2093
2094 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2095                                     struct scrub_page *spage)
2096 {
2097         struct scrub_block *sblock = spage->sblock;
2098         struct scrub_bio *sbio;
2099         int ret;
2100
2101 again:
2102         /*
2103          * grab a fresh bio or wait for one to become available
2104          */
2105         while (sctx->curr == -1) {
2106                 spin_lock(&sctx->list_lock);
2107                 sctx->curr = sctx->first_free;
2108                 if (sctx->curr != -1) {
2109                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2110                         sctx->bios[sctx->curr]->next_free = -1;
2111                         sctx->bios[sctx->curr]->page_count = 0;
2112                         spin_unlock(&sctx->list_lock);
2113                 } else {
2114                         spin_unlock(&sctx->list_lock);
2115                         wait_event(sctx->list_wait, sctx->first_free != -1);
2116                 }
2117         }
2118         sbio = sctx->bios[sctx->curr];
2119         if (sbio->page_count == 0) {
2120                 struct bio *bio;
2121
2122                 sbio->physical = spage->physical;
2123                 sbio->logical = spage->logical;
2124                 sbio->dev = spage->dev;
2125                 bio = sbio->bio;
2126                 if (!bio) {
2127                         bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2128                         if (!bio)
2129                                 return -ENOMEM;
2130                         sbio->bio = bio;
2131                 }
2132
2133                 bio->bi_private = sbio;
2134                 bio->bi_end_io = scrub_bio_end_io;
2135                 bio->bi_bdev = sbio->dev->bdev;
2136                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2137                 sbio->err = 0;
2138         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2139                    spage->physical ||
2140                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2141                    spage->logical ||
2142                    sbio->dev != spage->dev) {
2143                 scrub_submit(sctx);
2144                 goto again;
2145         }
2146
2147         sbio->pagev[sbio->page_count] = spage;
2148         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2149         if (ret != PAGE_SIZE) {
2150                 if (sbio->page_count < 1) {
2151                         bio_put(sbio->bio);
2152                         sbio->bio = NULL;
2153                         return -EIO;
2154                 }
2155                 scrub_submit(sctx);
2156                 goto again;
2157         }
2158
2159         scrub_block_get(sblock); /* one for the page added to the bio */
2160         atomic_inc(&sblock->outstanding_pages);
2161         sbio->page_count++;
2162         if (sbio->page_count == sctx->pages_per_rd_bio)
2163                 scrub_submit(sctx);
2164
2165         return 0;
2166 }
2167
2168 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2169                        u64 physical, struct btrfs_device *dev, u64 flags,
2170                        u64 gen, int mirror_num, u8 *csum, int force,
2171                        u64 physical_for_dev_replace)
2172 {
2173         struct scrub_block *sblock;
2174         int index;
2175
2176         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2177         if (!sblock) {
2178                 spin_lock(&sctx->stat_lock);
2179                 sctx->stat.malloc_errors++;
2180                 spin_unlock(&sctx->stat_lock);
2181                 return -ENOMEM;
2182         }
2183
2184         /* one ref inside this function, plus one for each page added to
2185          * a bio later on */
2186         atomic_set(&sblock->refs, 1);
2187         sblock->sctx = sctx;
2188         sblock->no_io_error_seen = 1;
2189
2190         for (index = 0; len > 0; index++) {
2191                 struct scrub_page *spage;
2192                 u64 l = min_t(u64, len, PAGE_SIZE);
2193
2194                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2195                 if (!spage) {
2196 leave_nomem:
2197                         spin_lock(&sctx->stat_lock);
2198                         sctx->stat.malloc_errors++;
2199                         spin_unlock(&sctx->stat_lock);
2200                         scrub_block_put(sblock);
2201                         return -ENOMEM;
2202                 }
2203                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2204                 scrub_page_get(spage);
2205                 sblock->pagev[index] = spage;
2206                 spage->sblock = sblock;
2207                 spage->dev = dev;
2208                 spage->flags = flags;
2209                 spage->generation = gen;
2210                 spage->logical = logical;
2211                 spage->physical = physical;
2212                 spage->physical_for_dev_replace = physical_for_dev_replace;
2213                 spage->mirror_num = mirror_num;
2214                 if (csum) {
2215                         spage->have_csum = 1;
2216                         memcpy(spage->csum, csum, sctx->csum_size);
2217                 } else {
2218                         spage->have_csum = 0;
2219                 }
2220                 sblock->page_count++;
2221                 spage->page = alloc_page(GFP_NOFS);
2222                 if (!spage->page)
2223                         goto leave_nomem;
2224                 len -= l;
2225                 logical += l;
2226                 physical += l;
2227                 physical_for_dev_replace += l;
2228         }
2229
2230         WARN_ON(sblock->page_count == 0);
2231         for (index = 0; index < sblock->page_count; index++) {
2232                 struct scrub_page *spage = sblock->pagev[index];
2233                 int ret;
2234
2235                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2236                 if (ret) {
2237                         scrub_block_put(sblock);
2238                         return ret;
2239                 }
2240         }
2241
2242         if (force)
2243                 scrub_submit(sctx);
2244
2245         /* last one frees, either here or in bio completion for last page */
2246         scrub_block_put(sblock);
2247         return 0;
2248 }
2249
2250 static void scrub_bio_end_io(struct bio *bio)
2251 {
2252         struct scrub_bio *sbio = bio->bi_private;
2253         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2254
2255         sbio->err = bio->bi_error;
2256         sbio->bio = bio;
2257
2258         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2259 }
2260
2261 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2262 {
2263         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2264         struct scrub_ctx *sctx = sbio->sctx;
2265         int i;
2266
2267         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2268         if (sbio->err) {
2269                 for (i = 0; i < sbio->page_count; i++) {
2270                         struct scrub_page *spage = sbio->pagev[i];
2271
2272                         spage->io_error = 1;
2273                         spage->sblock->no_io_error_seen = 0;
2274                 }
2275         }
2276
2277         /* now complete the scrub_block items that have all pages completed */
2278         for (i = 0; i < sbio->page_count; i++) {
2279                 struct scrub_page *spage = sbio->pagev[i];
2280                 struct scrub_block *sblock = spage->sblock;
2281
2282                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2283                         scrub_block_complete(sblock);
2284                 scrub_block_put(sblock);
2285         }
2286
2287         bio_put(sbio->bio);
2288         sbio->bio = NULL;
2289         spin_lock(&sctx->list_lock);
2290         sbio->next_free = sctx->first_free;
2291         sctx->first_free = sbio->index;
2292         spin_unlock(&sctx->list_lock);
2293
2294         if (sctx->is_dev_replace &&
2295             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2296                 mutex_lock(&sctx->wr_ctx.wr_lock);
2297                 scrub_wr_submit(sctx);
2298                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2299         }
2300
2301         scrub_pending_bio_dec(sctx);
2302 }
2303
2304 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2305                                        unsigned long *bitmap,
2306                                        u64 start, u64 len)
2307 {
2308         u32 offset;
2309         int nsectors;
2310         int sectorsize = sparity->sctx->dev_root->sectorsize;
2311
2312         if (len >= sparity->stripe_len) {
2313                 bitmap_set(bitmap, 0, sparity->nsectors);
2314                 return;
2315         }
2316
2317         start -= sparity->logic_start;
2318         start = div_u64_rem(start, sparity->stripe_len, &offset);
2319         offset /= sectorsize;
2320         nsectors = (int)len / sectorsize;
2321
2322         if (offset + nsectors <= sparity->nsectors) {
2323                 bitmap_set(bitmap, offset, nsectors);
2324                 return;
2325         }
2326
2327         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2328         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2329 }
2330
2331 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2332                                                    u64 start, u64 len)
2333 {
2334         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2335 }
2336
2337 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2338                                                   u64 start, u64 len)
2339 {
2340         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2341 }
2342
2343 static void scrub_block_complete(struct scrub_block *sblock)
2344 {
2345         int corrupted = 0;
2346
2347         if (!sblock->no_io_error_seen) {
2348                 corrupted = 1;
2349                 scrub_handle_errored_block(sblock);
2350         } else {
2351                 /*
2352                  * if has checksum error, write via repair mechanism in
2353                  * dev replace case, otherwise write here in dev replace
2354                  * case.
2355                  */
2356                 corrupted = scrub_checksum(sblock);
2357                 if (!corrupted && sblock->sctx->is_dev_replace)
2358                         scrub_write_block_to_dev_replace(sblock);
2359         }
2360
2361         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2362                 u64 start = sblock->pagev[0]->logical;
2363                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2364                           PAGE_SIZE;
2365
2366                 scrub_parity_mark_sectors_error(sblock->sparity,
2367                                                 start, end - start);
2368         }
2369 }
2370
2371 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2372                            u8 *csum)
2373 {
2374         struct btrfs_ordered_sum *sum = NULL;
2375         unsigned long index;
2376         unsigned long num_sectors;
2377
2378         while (!list_empty(&sctx->csum_list)) {
2379                 sum = list_first_entry(&sctx->csum_list,
2380                                        struct btrfs_ordered_sum, list);
2381                 if (sum->bytenr > logical)
2382                         return 0;
2383                 if (sum->bytenr + sum->len > logical)
2384                         break;
2385
2386                 ++sctx->stat.csum_discards;
2387                 list_del(&sum->list);
2388                 kfree(sum);
2389                 sum = NULL;
2390         }
2391         if (!sum)
2392                 return 0;
2393
2394         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2395         num_sectors = sum->len / sctx->sectorsize;
2396         memcpy(csum, sum->sums + index, sctx->csum_size);
2397         if (index == num_sectors - 1) {
2398                 list_del(&sum->list);
2399                 kfree(sum);
2400         }
2401         return 1;
2402 }
2403
2404 /* scrub extent tries to collect up to 64 kB for each bio */
2405 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2406                         u64 physical, struct btrfs_device *dev, u64 flags,
2407                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2408 {
2409         int ret;
2410         u8 csum[BTRFS_CSUM_SIZE];
2411         u32 blocksize;
2412
2413         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2414                 blocksize = sctx->sectorsize;
2415                 spin_lock(&sctx->stat_lock);
2416                 sctx->stat.data_extents_scrubbed++;
2417                 sctx->stat.data_bytes_scrubbed += len;
2418                 spin_unlock(&sctx->stat_lock);
2419         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2420                 blocksize = sctx->nodesize;
2421                 spin_lock(&sctx->stat_lock);
2422                 sctx->stat.tree_extents_scrubbed++;
2423                 sctx->stat.tree_bytes_scrubbed += len;
2424                 spin_unlock(&sctx->stat_lock);
2425         } else {
2426                 blocksize = sctx->sectorsize;
2427                 WARN_ON(1);
2428         }
2429
2430         while (len) {
2431                 u64 l = min_t(u64, len, blocksize);
2432                 int have_csum = 0;
2433
2434                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2435                         /* push csums to sbio */
2436                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2437                         if (have_csum == 0)
2438                                 ++sctx->stat.no_csum;
2439                         if (sctx->is_dev_replace && !have_csum) {
2440                                 ret = copy_nocow_pages(sctx, logical, l,
2441                                                        mirror_num,
2442                                                       physical_for_dev_replace);
2443                                 goto behind_scrub_pages;
2444                         }
2445                 }
2446                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2447                                   mirror_num, have_csum ? csum : NULL, 0,
2448                                   physical_for_dev_replace);
2449 behind_scrub_pages:
2450                 if (ret)
2451                         return ret;
2452                 len -= l;
2453                 logical += l;
2454                 physical += l;
2455                 physical_for_dev_replace += l;
2456         }
2457         return 0;
2458 }
2459
2460 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2461                                   u64 logical, u64 len,
2462                                   u64 physical, struct btrfs_device *dev,
2463                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2464 {
2465         struct scrub_ctx *sctx = sparity->sctx;
2466         struct scrub_block *sblock;
2467         int index;
2468
2469         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2470         if (!sblock) {
2471                 spin_lock(&sctx->stat_lock);
2472                 sctx->stat.malloc_errors++;
2473                 spin_unlock(&sctx->stat_lock);
2474                 return -ENOMEM;
2475         }
2476
2477         /* one ref inside this function, plus one for each page added to
2478          * a bio later on */
2479         atomic_set(&sblock->refs, 1);
2480         sblock->sctx = sctx;
2481         sblock->no_io_error_seen = 1;
2482         sblock->sparity = sparity;
2483         scrub_parity_get(sparity);
2484
2485         for (index = 0; len > 0; index++) {
2486                 struct scrub_page *spage;
2487                 u64 l = min_t(u64, len, PAGE_SIZE);
2488
2489                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2490                 if (!spage) {
2491 leave_nomem:
2492                         spin_lock(&sctx->stat_lock);
2493                         sctx->stat.malloc_errors++;
2494                         spin_unlock(&sctx->stat_lock);
2495                         scrub_block_put(sblock);
2496                         return -ENOMEM;
2497                 }
2498                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2499                 /* For scrub block */
2500                 scrub_page_get(spage);
2501                 sblock->pagev[index] = spage;
2502                 /* For scrub parity */
2503                 scrub_page_get(spage);
2504                 list_add_tail(&spage->list, &sparity->spages);
2505                 spage->sblock = sblock;
2506                 spage->dev = dev;
2507                 spage->flags = flags;
2508                 spage->generation = gen;
2509                 spage->logical = logical;
2510                 spage->physical = physical;
2511                 spage->mirror_num = mirror_num;
2512                 if (csum) {
2513                         spage->have_csum = 1;
2514                         memcpy(spage->csum, csum, sctx->csum_size);
2515                 } else {
2516                         spage->have_csum = 0;
2517                 }
2518                 sblock->page_count++;
2519                 spage->page = alloc_page(GFP_NOFS);
2520                 if (!spage->page)
2521                         goto leave_nomem;
2522                 len -= l;
2523                 logical += l;
2524                 physical += l;
2525         }
2526
2527         WARN_ON(sblock->page_count == 0);
2528         for (index = 0; index < sblock->page_count; index++) {
2529                 struct scrub_page *spage = sblock->pagev[index];
2530                 int ret;
2531
2532                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2533                 if (ret) {
2534                         scrub_block_put(sblock);
2535                         return ret;
2536                 }
2537         }
2538
2539         /* last one frees, either here or in bio completion for last page */
2540         scrub_block_put(sblock);
2541         return 0;
2542 }
2543
2544 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2545                                    u64 logical, u64 len,
2546                                    u64 physical, struct btrfs_device *dev,
2547                                    u64 flags, u64 gen, int mirror_num)
2548 {
2549         struct scrub_ctx *sctx = sparity->sctx;
2550         int ret;
2551         u8 csum[BTRFS_CSUM_SIZE];
2552         u32 blocksize;
2553
2554         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2555                 blocksize = sctx->sectorsize;
2556         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2557                 blocksize = sctx->nodesize;
2558         } else {
2559                 blocksize = sctx->sectorsize;
2560                 WARN_ON(1);
2561         }
2562
2563         while (len) {
2564                 u64 l = min_t(u64, len, blocksize);
2565                 int have_csum = 0;
2566
2567                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2568                         /* push csums to sbio */
2569                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2570                         if (have_csum == 0)
2571                                 goto skip;
2572                 }
2573                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2574                                              flags, gen, mirror_num,
2575                                              have_csum ? csum : NULL);
2576                 if (ret)
2577                         return ret;
2578 skip:
2579                 len -= l;
2580                 logical += l;
2581                 physical += l;
2582         }
2583         return 0;
2584 }
2585
2586 /*
2587  * Given a physical address, this will calculate it's
2588  * logical offset. if this is a parity stripe, it will return
2589  * the most left data stripe's logical offset.
2590  *
2591  * return 0 if it is a data stripe, 1 means parity stripe.
2592  */
2593 static int get_raid56_logic_offset(u64 physical, int num,
2594                                    struct map_lookup *map, u64 *offset,
2595                                    u64 *stripe_start)
2596 {
2597         int i;
2598         int j = 0;
2599         u64 stripe_nr;
2600         u64 last_offset;
2601         u32 stripe_index;
2602         u32 rot;
2603
2604         last_offset = (physical - map->stripes[num].physical) *
2605                       nr_data_stripes(map);
2606         if (stripe_start)
2607                 *stripe_start = last_offset;
2608
2609         *offset = last_offset;
2610         for (i = 0; i < nr_data_stripes(map); i++) {
2611                 *offset = last_offset + i * map->stripe_len;
2612
2613                 stripe_nr = div_u64(*offset, map->stripe_len);
2614                 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2615
2616                 /* Work out the disk rotation on this stripe-set */
2617                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2618                 /* calculate which stripe this data locates */
2619                 rot += i;
2620                 stripe_index = rot % map->num_stripes;
2621                 if (stripe_index == num)
2622                         return 0;
2623                 if (stripe_index < num)
2624                         j++;
2625         }
2626         *offset = last_offset + j * map->stripe_len;
2627         return 1;
2628 }
2629
2630 static void scrub_free_parity(struct scrub_parity *sparity)
2631 {
2632         struct scrub_ctx *sctx = sparity->sctx;
2633         struct scrub_page *curr, *next;
2634         int nbits;
2635
2636         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2637         if (nbits) {
2638                 spin_lock(&sctx->stat_lock);
2639                 sctx->stat.read_errors += nbits;
2640                 sctx->stat.uncorrectable_errors += nbits;
2641                 spin_unlock(&sctx->stat_lock);
2642         }
2643
2644         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2645                 list_del_init(&curr->list);
2646                 scrub_page_put(curr);
2647         }
2648
2649         kfree(sparity);
2650 }
2651
2652 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2653 {
2654         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2655                                                     work);
2656         struct scrub_ctx *sctx = sparity->sctx;
2657
2658         scrub_free_parity(sparity);
2659         scrub_pending_bio_dec(sctx);
2660 }
2661
2662 static void scrub_parity_bio_endio(struct bio *bio)
2663 {
2664         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2665
2666         if (bio->bi_error)
2667                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2668                           sparity->nsectors);
2669
2670         bio_put(bio);
2671
2672         btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
2673                         scrub_parity_bio_endio_worker, NULL, NULL);
2674         btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
2675                          &sparity->work);
2676 }
2677
2678 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2679 {
2680         struct scrub_ctx *sctx = sparity->sctx;
2681         struct bio *bio;
2682         struct btrfs_raid_bio *rbio;
2683         struct scrub_page *spage;
2684         struct btrfs_bio *bbio = NULL;
2685         u64 length;
2686         int ret;
2687
2688         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2689                            sparity->nsectors))
2690                 goto out;
2691
2692         length = sparity->logic_end - sparity->logic_start + 1;
2693         ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2694                                sparity->logic_start,
2695                                &length, &bbio, 0, 1);
2696         if (ret || !bbio || !bbio->raid_map)
2697                 goto bbio_out;
2698
2699         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2700         if (!bio)
2701                 goto bbio_out;
2702
2703         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2704         bio->bi_private = sparity;
2705         bio->bi_end_io = scrub_parity_bio_endio;
2706
2707         rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2708                                               length, sparity->scrub_dev,
2709                                               sparity->dbitmap,
2710                                               sparity->nsectors);
2711         if (!rbio)
2712                 goto rbio_out;
2713
2714         list_for_each_entry(spage, &sparity->spages, list)
2715                 raid56_parity_add_scrub_pages(rbio, spage->page,
2716                                               spage->logical);
2717
2718         scrub_pending_bio_inc(sctx);
2719         raid56_parity_submit_scrub_rbio(rbio);
2720         return;
2721
2722 rbio_out:
2723         bio_put(bio);
2724 bbio_out:
2725         btrfs_put_bbio(bbio);
2726         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2727                   sparity->nsectors);
2728         spin_lock(&sctx->stat_lock);
2729         sctx->stat.malloc_errors++;
2730         spin_unlock(&sctx->stat_lock);
2731 out:
2732         scrub_free_parity(sparity);
2733 }
2734
2735 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2736 {
2737         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2738 }
2739
2740 static void scrub_parity_get(struct scrub_parity *sparity)
2741 {
2742         atomic_inc(&sparity->refs);
2743 }
2744
2745 static void scrub_parity_put(struct scrub_parity *sparity)
2746 {
2747         if (!atomic_dec_and_test(&sparity->refs))
2748                 return;
2749
2750         scrub_parity_check_and_repair(sparity);
2751 }
2752
2753 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2754                                                   struct map_lookup *map,
2755                                                   struct btrfs_device *sdev,
2756                                                   struct btrfs_path *path,
2757                                                   u64 logic_start,
2758                                                   u64 logic_end)
2759 {
2760         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2761         struct btrfs_root *root = fs_info->extent_root;
2762         struct btrfs_root *csum_root = fs_info->csum_root;
2763         struct btrfs_extent_item *extent;
2764         u64 flags;
2765         int ret;
2766         int slot;
2767         struct extent_buffer *l;
2768         struct btrfs_key key;
2769         u64 generation;
2770         u64 extent_logical;
2771         u64 extent_physical;
2772         u64 extent_len;
2773         struct btrfs_device *extent_dev;
2774         struct scrub_parity *sparity;
2775         int nsectors;
2776         int bitmap_len;
2777         int extent_mirror_num;
2778         int stop_loop = 0;
2779
2780         nsectors = map->stripe_len / root->sectorsize;
2781         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2782         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2783                           GFP_NOFS);
2784         if (!sparity) {
2785                 spin_lock(&sctx->stat_lock);
2786                 sctx->stat.malloc_errors++;
2787                 spin_unlock(&sctx->stat_lock);
2788                 return -ENOMEM;
2789         }
2790
2791         sparity->stripe_len = map->stripe_len;
2792         sparity->nsectors = nsectors;
2793         sparity->sctx = sctx;
2794         sparity->scrub_dev = sdev;
2795         sparity->logic_start = logic_start;
2796         sparity->logic_end = logic_end;
2797         atomic_set(&sparity->refs, 1);
2798         INIT_LIST_HEAD(&sparity->spages);
2799         sparity->dbitmap = sparity->bitmap;
2800         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2801
2802         ret = 0;
2803         while (logic_start < logic_end) {
2804                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2805                         key.type = BTRFS_METADATA_ITEM_KEY;
2806                 else
2807                         key.type = BTRFS_EXTENT_ITEM_KEY;
2808                 key.objectid = logic_start;
2809                 key.offset = (u64)-1;
2810
2811                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2812                 if (ret < 0)
2813                         goto out;
2814
2815                 if (ret > 0) {
2816                         ret = btrfs_previous_extent_item(root, path, 0);
2817                         if (ret < 0)
2818                                 goto out;
2819                         if (ret > 0) {
2820                                 btrfs_release_path(path);
2821                                 ret = btrfs_search_slot(NULL, root, &key,
2822                                                         path, 0, 0);
2823                                 if (ret < 0)
2824                                         goto out;
2825                         }
2826                 }
2827
2828                 stop_loop = 0;
2829                 while (1) {
2830                         u64 bytes;
2831
2832                         l = path->nodes[0];
2833                         slot = path->slots[0];
2834                         if (slot >= btrfs_header_nritems(l)) {
2835                                 ret = btrfs_next_leaf(root, path);
2836                                 if (ret == 0)
2837                                         continue;
2838                                 if (ret < 0)
2839                                         goto out;
2840
2841                                 stop_loop = 1;
2842                                 break;
2843                         }
2844                         btrfs_item_key_to_cpu(l, &key, slot);
2845
2846                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2847                                 bytes = root->nodesize;
2848                         else
2849                                 bytes = key.offset;
2850
2851                         if (key.objectid + bytes <= logic_start)
2852                                 goto next;
2853
2854                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2855                             key.type != BTRFS_METADATA_ITEM_KEY)
2856                                 goto next;
2857
2858                         if (key.objectid > logic_end) {
2859                                 stop_loop = 1;
2860                                 break;
2861                         }
2862
2863                         while (key.objectid >= logic_start + map->stripe_len)
2864                                 logic_start += map->stripe_len;
2865
2866                         extent = btrfs_item_ptr(l, slot,
2867                                                 struct btrfs_extent_item);
2868                         flags = btrfs_extent_flags(l, extent);
2869                         generation = btrfs_extent_generation(l, extent);
2870
2871                         if (key.objectid < logic_start &&
2872                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2873                                 btrfs_err(fs_info,
2874                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2875                                            key.objectid, logic_start);
2876                                 goto next;
2877                         }
2878 again:
2879                         extent_logical = key.objectid;
2880                         extent_len = bytes;
2881
2882                         if (extent_logical < logic_start) {
2883                                 extent_len -= logic_start - extent_logical;
2884                                 extent_logical = logic_start;
2885                         }
2886
2887                         if (extent_logical + extent_len >
2888                             logic_start + map->stripe_len)
2889                                 extent_len = logic_start + map->stripe_len -
2890                                              extent_logical;
2891
2892                         scrub_parity_mark_sectors_data(sparity, extent_logical,
2893                                                        extent_len);
2894
2895                         scrub_remap_extent(fs_info, extent_logical,
2896                                            extent_len, &extent_physical,
2897                                            &extent_dev,
2898                                            &extent_mirror_num);
2899
2900                         ret = btrfs_lookup_csums_range(csum_root,
2901                                                 extent_logical,
2902                                                 extent_logical + extent_len - 1,
2903                                                 &sctx->csum_list, 1);
2904                         if (ret)
2905                                 goto out;
2906
2907                         ret = scrub_extent_for_parity(sparity, extent_logical,
2908                                                       extent_len,
2909                                                       extent_physical,
2910                                                       extent_dev, flags,
2911                                                       generation,
2912                                                       extent_mirror_num);
2913                         if (ret)
2914                                 goto out;
2915
2916                         scrub_free_csums(sctx);
2917                         if (extent_logical + extent_len <
2918                             key.objectid + bytes) {
2919                                 logic_start += map->stripe_len;
2920
2921                                 if (logic_start >= logic_end) {
2922                                         stop_loop = 1;
2923                                         break;
2924                                 }
2925
2926                                 if (logic_start < key.objectid + bytes) {
2927                                         cond_resched();
2928                                         goto again;
2929                                 }
2930                         }
2931 next:
2932                         path->slots[0]++;
2933                 }
2934
2935                 btrfs_release_path(path);
2936
2937                 if (stop_loop)
2938                         break;
2939
2940                 logic_start += map->stripe_len;
2941         }
2942 out:
2943         if (ret < 0)
2944                 scrub_parity_mark_sectors_error(sparity, logic_start,
2945                                                 logic_end - logic_start + 1);
2946         scrub_parity_put(sparity);
2947         scrub_submit(sctx);
2948         mutex_lock(&sctx->wr_ctx.wr_lock);
2949         scrub_wr_submit(sctx);
2950         mutex_unlock(&sctx->wr_ctx.wr_lock);
2951
2952         btrfs_release_path(path);
2953         return ret < 0 ? ret : 0;
2954 }
2955
2956 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2957                                            struct map_lookup *map,
2958                                            struct btrfs_device *scrub_dev,
2959                                            int num, u64 base, u64 length,
2960                                            int is_dev_replace)
2961 {
2962         struct btrfs_path *path, *ppath;
2963         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2964         struct btrfs_root *root = fs_info->extent_root;
2965         struct btrfs_root *csum_root = fs_info->csum_root;
2966         struct btrfs_extent_item *extent;
2967         struct blk_plug plug;
2968         u64 flags;
2969         int ret;
2970         int slot;
2971         u64 nstripes;
2972         struct extent_buffer *l;
2973         struct btrfs_key key;
2974         u64 physical;
2975         u64 logical;
2976         u64 logic_end;
2977         u64 physical_end;
2978         u64 generation;
2979         int mirror_num;
2980         struct reada_control *reada1;
2981         struct reada_control *reada2;
2982         struct btrfs_key key_start;
2983         struct btrfs_key key_end;
2984         u64 increment = map->stripe_len;
2985         u64 offset;
2986         u64 extent_logical;
2987         u64 extent_physical;
2988         u64 extent_len;
2989         u64 stripe_logical;
2990         u64 stripe_end;
2991         struct btrfs_device *extent_dev;
2992         int extent_mirror_num;
2993         int stop_loop = 0;
2994
2995         physical = map->stripes[num].physical;
2996         offset = 0;
2997         nstripes = div_u64(length, map->stripe_len);
2998         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2999                 offset = map->stripe_len * num;
3000                 increment = map->stripe_len * map->num_stripes;
3001                 mirror_num = 1;
3002         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3003                 int factor = map->num_stripes / map->sub_stripes;
3004                 offset = map->stripe_len * (num / map->sub_stripes);
3005                 increment = map->stripe_len * factor;
3006                 mirror_num = num % map->sub_stripes + 1;
3007         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3008                 increment = map->stripe_len;
3009                 mirror_num = num % map->num_stripes + 1;
3010         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3011                 increment = map->stripe_len;
3012                 mirror_num = num % map->num_stripes + 1;
3013         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3014                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3015                 increment = map->stripe_len * nr_data_stripes(map);
3016                 mirror_num = 1;
3017         } else {
3018                 increment = map->stripe_len;
3019                 mirror_num = 1;
3020         }
3021
3022         path = btrfs_alloc_path();
3023         if (!path)
3024                 return -ENOMEM;
3025
3026         ppath = btrfs_alloc_path();
3027         if (!ppath) {
3028                 btrfs_free_path(path);
3029                 return -ENOMEM;
3030         }
3031
3032         /*
3033          * work on commit root. The related disk blocks are static as
3034          * long as COW is applied. This means, it is save to rewrite
3035          * them to repair disk errors without any race conditions
3036          */
3037         path->search_commit_root = 1;
3038         path->skip_locking = 1;
3039
3040         ppath->search_commit_root = 1;
3041         ppath->skip_locking = 1;
3042         /*
3043          * trigger the readahead for extent tree csum tree and wait for
3044          * completion. During readahead, the scrub is officially paused
3045          * to not hold off transaction commits
3046          */
3047         logical = base + offset;
3048         physical_end = physical + nstripes * map->stripe_len;
3049         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3050                 get_raid56_logic_offset(physical_end, num,
3051                                         map, &logic_end, NULL);
3052                 logic_end += base;
3053         } else {
3054                 logic_end = logical + increment * nstripes;
3055         }
3056         wait_event(sctx->list_wait,
3057                    atomic_read(&sctx->bios_in_flight) == 0);
3058         scrub_blocked_if_needed(fs_info);
3059
3060         /* FIXME it might be better to start readahead at commit root */
3061         key_start.objectid = logical;
3062         key_start.type = BTRFS_EXTENT_ITEM_KEY;
3063         key_start.offset = (u64)0;
3064         key_end.objectid = logic_end;
3065         key_end.type = BTRFS_METADATA_ITEM_KEY;
3066         key_end.offset = (u64)-1;
3067         reada1 = btrfs_reada_add(root, &key_start, &key_end);
3068
3069         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3070         key_start.type = BTRFS_EXTENT_CSUM_KEY;
3071         key_start.offset = logical;
3072         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3073         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3074         key_end.offset = logic_end;
3075         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
3076
3077         if (!IS_ERR(reada1))
3078                 btrfs_reada_wait(reada1);
3079         if (!IS_ERR(reada2))
3080                 btrfs_reada_wait(reada2);
3081
3082
3083         /*
3084          * collect all data csums for the stripe to avoid seeking during
3085          * the scrub. This might currently (crc32) end up to be about 1MB
3086          */
3087         blk_start_plug(&plug);
3088
3089         /*
3090          * now find all extents for each stripe and scrub them
3091          */
3092         ret = 0;
3093         while (physical < physical_end) {
3094                 /* for raid56, we skip parity stripe */
3095                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3096                         ret = get_raid56_logic_offset(physical, num,
3097                                         map, &logical, &stripe_logical);
3098                         logical += base;
3099                         if (ret) {
3100                                 stripe_logical += base;
3101                                 stripe_end = stripe_logical + increment - 1;
3102                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3103                                                 ppath, stripe_logical,
3104                                                 stripe_end);
3105                                 if (ret)
3106                                         goto out;
3107                                 goto skip;
3108                         }
3109                 }
3110                 /*
3111                  * canceled?
3112                  */
3113                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3114                     atomic_read(&sctx->cancel_req)) {
3115                         ret = -ECANCELED;
3116                         goto out;
3117                 }
3118                 /*
3119                  * check to see if we have to pause
3120                  */
3121                 if (atomic_read(&fs_info->scrub_pause_req)) {
3122                         /* push queued extents */
3123                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3124                         scrub_submit(sctx);
3125                         mutex_lock(&sctx->wr_ctx.wr_lock);
3126                         scrub_wr_submit(sctx);
3127                         mutex_unlock(&sctx->wr_ctx.wr_lock);
3128                         wait_event(sctx->list_wait,
3129                                    atomic_read(&sctx->bios_in_flight) == 0);
3130                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3131                         scrub_blocked_if_needed(fs_info);
3132                 }
3133
3134                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3135                         key.type = BTRFS_METADATA_ITEM_KEY;
3136                 else
3137                         key.type = BTRFS_EXTENT_ITEM_KEY;
3138                 key.objectid = logical;
3139                 key.offset = (u64)-1;
3140
3141                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3142                 if (ret < 0)
3143                         goto out;
3144
3145                 if (ret > 0) {
3146                         ret = btrfs_previous_extent_item(root, path, 0);
3147                         if (ret < 0)
3148                                 goto out;
3149                         if (ret > 0) {
3150                                 /* there's no smaller item, so stick with the
3151                                  * larger one */
3152                                 btrfs_release_path(path);
3153                                 ret = btrfs_search_slot(NULL, root, &key,
3154                                                         path, 0, 0);
3155                                 if (ret < 0)
3156                                         goto out;
3157                         }
3158                 }
3159
3160                 stop_loop = 0;
3161                 while (1) {
3162                         u64 bytes;
3163
3164                         l = path->nodes[0];
3165                         slot = path->slots[0];
3166                         if (slot >= btrfs_header_nritems(l)) {
3167                                 ret = btrfs_next_leaf(root, path);
3168                                 if (ret == 0)
3169                                         continue;
3170                                 if (ret < 0)
3171                                         goto out;
3172
3173                                 stop_loop = 1;
3174                                 break;
3175                         }
3176                         btrfs_item_key_to_cpu(l, &key, slot);
3177
3178                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3179                                 bytes = root->nodesize;
3180                         else
3181                                 bytes = key.offset;
3182
3183                         if (key.objectid + bytes <= logical)
3184                                 goto next;
3185
3186                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3187                             key.type != BTRFS_METADATA_ITEM_KEY)
3188                                 goto next;
3189
3190                         if (key.objectid >= logical + map->stripe_len) {
3191                                 /* out of this device extent */
3192                                 if (key.objectid >= logic_end)
3193                                         stop_loop = 1;
3194                                 break;
3195                         }
3196
3197                         extent = btrfs_item_ptr(l, slot,
3198                                                 struct btrfs_extent_item);
3199                         flags = btrfs_extent_flags(l, extent);
3200                         generation = btrfs_extent_generation(l, extent);
3201
3202                         if (key.objectid < logical &&
3203                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
3204                                 btrfs_err(fs_info,
3205                                            "scrub: tree block %llu spanning "
3206                                            "stripes, ignored. logical=%llu",
3207                                        key.objectid, logical);
3208                                 goto next;
3209                         }
3210
3211 again:
3212                         extent_logical = key.objectid;
3213                         extent_len = bytes;
3214
3215                         /*
3216                          * trim extent to this stripe
3217                          */
3218                         if (extent_logical < logical) {
3219                                 extent_len -= logical - extent_logical;
3220                                 extent_logical = logical;
3221                         }
3222                         if (extent_logical + extent_len >
3223                             logical + map->stripe_len) {
3224                                 extent_len = logical + map->stripe_len -
3225                                              extent_logical;
3226                         }
3227
3228                         extent_physical = extent_logical - logical + physical;
3229                         extent_dev = scrub_dev;
3230                         extent_mirror_num = mirror_num;
3231                         if (is_dev_replace)
3232                                 scrub_remap_extent(fs_info, extent_logical,
3233                                                    extent_len, &extent_physical,
3234                                                    &extent_dev,
3235                                                    &extent_mirror_num);
3236
3237                         ret = btrfs_lookup_csums_range(csum_root, logical,
3238                                                 logical + map->stripe_len - 1,
3239                                                 &sctx->csum_list, 1);
3240                         if (ret)
3241                                 goto out;
3242
3243                         ret = scrub_extent(sctx, extent_logical, extent_len,
3244                                            extent_physical, extent_dev, flags,
3245                                            generation, extent_mirror_num,
3246                                            extent_logical - logical + physical);
3247                         if (ret)
3248                                 goto out;
3249
3250                         scrub_free_csums(sctx);
3251                         if (extent_logical + extent_len <
3252                             key.objectid + bytes) {
3253                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3254                                         /*
3255                                          * loop until we find next data stripe
3256                                          * or we have finished all stripes.
3257                                          */
3258 loop:
3259                                         physical += map->stripe_len;
3260                                         ret = get_raid56_logic_offset(physical,
3261                                                         num, map, &logical,
3262                                                         &stripe_logical);
3263                                         logical += base;
3264
3265                                         if (ret && physical < physical_end) {
3266                                                 stripe_logical += base;
3267                                                 stripe_end = stripe_logical +
3268                                                                 increment - 1;
3269                                                 ret = scrub_raid56_parity(sctx,
3270                                                         map, scrub_dev, ppath,
3271                                                         stripe_logical,
3272                                                         stripe_end);
3273                                                 if (ret)
3274                                                         goto out;
3275                                                 goto loop;
3276                                         }
3277                                 } else {
3278                                         physical += map->stripe_len;
3279                                         logical += increment;
3280                                 }
3281                                 if (logical < key.objectid + bytes) {
3282                                         cond_resched();
3283                                         goto again;
3284                                 }
3285
3286                                 if (physical >= physical_end) {
3287                                         stop_loop = 1;
3288                                         break;
3289                                 }
3290                         }
3291 next:
3292                         path->slots[0]++;
3293                 }
3294                 btrfs_release_path(path);
3295 skip:
3296                 logical += increment;
3297                 physical += map->stripe_len;
3298                 spin_lock(&sctx->stat_lock);
3299                 if (stop_loop)
3300                         sctx->stat.last_physical = map->stripes[num].physical +
3301                                                    length;
3302                 else
3303                         sctx->stat.last_physical = physical;
3304                 spin_unlock(&sctx->stat_lock);
3305                 if (stop_loop)
3306                         break;
3307         }
3308 out:
3309         /* push queued extents */
3310         scrub_submit(sctx);
3311         mutex_lock(&sctx->wr_ctx.wr_lock);
3312         scrub_wr_submit(sctx);
3313         mutex_unlock(&sctx->wr_ctx.wr_lock);
3314
3315         blk_finish_plug(&plug);
3316         btrfs_free_path(path);
3317         btrfs_free_path(ppath);
3318         return ret < 0 ? ret : 0;
3319 }
3320
3321 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3322                                           struct btrfs_device *scrub_dev,
3323                                           u64 chunk_tree, u64 chunk_objectid,
3324                                           u64 chunk_offset, u64 length,
3325                                           u64 dev_offset, int is_dev_replace)
3326 {
3327         struct btrfs_mapping_tree *map_tree =
3328                 &sctx->dev_root->fs_info->mapping_tree;
3329         struct map_lookup *map;
3330         struct extent_map *em;
3331         int i;
3332         int ret = 0;
3333
3334         read_lock(&map_tree->map_tree.lock);
3335         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3336         read_unlock(&map_tree->map_tree.lock);
3337
3338         if (!em)
3339                 return -EINVAL;
3340
3341         map = (struct map_lookup *)em->bdev;
3342         if (em->start != chunk_offset)
3343                 goto out;
3344
3345         if (em->len < length)
3346                 goto out;
3347
3348         for (i = 0; i < map->num_stripes; ++i) {
3349                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3350                     map->stripes[i].physical == dev_offset) {
3351                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3352                                            chunk_offset, length,
3353                                            is_dev_replace);
3354                         if (ret)
3355                                 goto out;
3356                 }
3357         }
3358 out:
3359         free_extent_map(em);
3360
3361         return ret;
3362 }
3363
3364 static noinline_for_stack
3365 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3366                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3367                            int is_dev_replace)
3368 {
3369         struct btrfs_dev_extent *dev_extent = NULL;
3370         struct btrfs_path *path;
3371         struct btrfs_root *root = sctx->dev_root;
3372         struct btrfs_fs_info *fs_info = root->fs_info;
3373         u64 length;
3374         u64 chunk_tree;
3375         u64 chunk_objectid;
3376         u64 chunk_offset;
3377         int ret;
3378         int slot;
3379         struct extent_buffer *l;
3380         struct btrfs_key key;
3381         struct btrfs_key found_key;
3382         struct btrfs_block_group_cache *cache;
3383         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3384
3385         path = btrfs_alloc_path();
3386         if (!path)
3387                 return -ENOMEM;
3388
3389         path->reada = 2;
3390         path->search_commit_root = 1;
3391         path->skip_locking = 1;
3392
3393         key.objectid = scrub_dev->devid;
3394         key.offset = 0ull;
3395         key.type = BTRFS_DEV_EXTENT_KEY;
3396
3397         while (1) {
3398                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3399                 if (ret < 0)
3400                         break;
3401                 if (ret > 0) {
3402                         if (path->slots[0] >=
3403                             btrfs_header_nritems(path->nodes[0])) {
3404                                 ret = btrfs_next_leaf(root, path);
3405                                 if (ret)
3406                                         break;
3407                         }
3408                 }
3409
3410                 l = path->nodes[0];
3411                 slot = path->slots[0];
3412
3413                 btrfs_item_key_to_cpu(l, &found_key, slot);
3414
3415                 if (found_key.objectid != scrub_dev->devid)
3416                         break;
3417
3418                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3419                         break;
3420
3421                 if (found_key.offset >= end)
3422                         break;
3423
3424                 if (found_key.offset < key.offset)
3425                         break;
3426
3427                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3428                 length = btrfs_dev_extent_length(l, dev_extent);
3429
3430                 if (found_key.offset + length <= start)
3431                         goto skip;
3432
3433                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3434                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3435                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3436
3437                 /*
3438                  * get a reference on the corresponding block group to prevent
3439                  * the chunk from going away while we scrub it
3440                  */
3441                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3442
3443                 /* some chunks are removed but not committed to disk yet,
3444                  * continue scrubbing */
3445                 if (!cache)
3446                         goto skip;
3447
3448                 dev_replace->cursor_right = found_key.offset + length;
3449                 dev_replace->cursor_left = found_key.offset;
3450                 dev_replace->item_needs_writeback = 1;
3451                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
3452                                   chunk_offset, length, found_key.offset,
3453                                   is_dev_replace);
3454
3455                 /*
3456                  * flush, submit all pending read and write bios, afterwards
3457                  * wait for them.
3458                  * Note that in the dev replace case, a read request causes
3459                  * write requests that are submitted in the read completion
3460                  * worker. Therefore in the current situation, it is required
3461                  * that all write requests are flushed, so that all read and
3462                  * write requests are really completed when bios_in_flight
3463                  * changes to 0.
3464                  */
3465                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3466                 scrub_submit(sctx);
3467                 mutex_lock(&sctx->wr_ctx.wr_lock);
3468                 scrub_wr_submit(sctx);
3469                 mutex_unlock(&sctx->wr_ctx.wr_lock);
3470
3471                 wait_event(sctx->list_wait,
3472                            atomic_read(&sctx->bios_in_flight) == 0);
3473                 atomic_inc(&fs_info->scrubs_paused);
3474                 wake_up(&fs_info->scrub_pause_wait);
3475
3476                 /*
3477                  * must be called before we decrease @scrub_paused.
3478                  * make sure we don't block transaction commit while
3479                  * we are waiting pending workers finished.
3480                  */
3481                 wait_event(sctx->list_wait,
3482                            atomic_read(&sctx->workers_pending) == 0);
3483                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3484
3485                 mutex_lock(&fs_info->scrub_lock);
3486                 __scrub_blocked_if_needed(fs_info);
3487                 atomic_dec(&fs_info->scrubs_paused);
3488                 mutex_unlock(&fs_info->scrub_lock);
3489                 wake_up(&fs_info->scrub_pause_wait);
3490
3491                 btrfs_put_block_group(cache);
3492                 if (ret)
3493                         break;
3494                 if (is_dev_replace &&
3495                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3496                         ret = -EIO;
3497                         break;
3498                 }
3499                 if (sctx->stat.malloc_errors > 0) {
3500                         ret = -ENOMEM;
3501                         break;
3502                 }
3503
3504                 dev_replace->cursor_left = dev_replace->cursor_right;
3505                 dev_replace->item_needs_writeback = 1;
3506 skip:
3507                 key.offset = found_key.offset + length;
3508                 btrfs_release_path(path);
3509         }
3510
3511         btrfs_free_path(path);
3512
3513         /*
3514          * ret can still be 1 from search_slot or next_leaf,
3515          * that's not an error
3516          */
3517         return ret < 0 ? ret : 0;
3518 }
3519
3520 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3521                                            struct btrfs_device *scrub_dev)
3522 {
3523         int     i;
3524         u64     bytenr;
3525         u64     gen;
3526         int     ret;
3527         struct btrfs_root *root = sctx->dev_root;
3528
3529         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
3530                 return -EIO;
3531
3532         /* Seed devices of a new filesystem has their own generation. */
3533         if (scrub_dev->fs_devices != root->fs_info->fs_devices)
3534                 gen = scrub_dev->generation;
3535         else
3536                 gen = root->fs_info->last_trans_committed;
3537
3538         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3539                 bytenr = btrfs_sb_offset(i);
3540                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3541                     scrub_dev->commit_total_bytes)
3542                         break;
3543
3544                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3545                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3546                                   NULL, 1, bytenr);
3547                 if (ret)
3548                         return ret;
3549         }
3550         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3551
3552         return 0;
3553 }
3554
3555 /*
3556  * get a reference count on fs_info->scrub_workers. start worker if necessary
3557  */
3558 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3559                                                 int is_dev_replace)
3560 {
3561         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3562         int max_active = fs_info->thread_pool_size;
3563
3564         if (fs_info->scrub_workers_refcnt == 0) {
3565                 if (is_dev_replace)
3566                         fs_info->scrub_workers =
3567                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3568                                                       1, 4);
3569                 else
3570                         fs_info->scrub_workers =
3571                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
3572                                                       max_active, 4);
3573                 if (!fs_info->scrub_workers)
3574                         goto fail_scrub_workers;
3575
3576                 fs_info->scrub_wr_completion_workers =
3577                         btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3578                                               max_active, 2);
3579                 if (!fs_info->scrub_wr_completion_workers)
3580                         goto fail_scrub_wr_completion_workers;
3581
3582                 fs_info->scrub_nocow_workers =
3583                         btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3584                 if (!fs_info->scrub_nocow_workers)
3585                         goto fail_scrub_nocow_workers;
3586                 fs_info->scrub_parity_workers =
3587                         btrfs_alloc_workqueue("btrfs-scrubparity", flags,
3588                                               max_active, 2);
3589                 if (!fs_info->scrub_parity_workers)
3590                         goto fail_scrub_parity_workers;
3591         }
3592         ++fs_info->scrub_workers_refcnt;
3593         return 0;
3594
3595 fail_scrub_parity_workers:
3596         btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3597 fail_scrub_nocow_workers:
3598         btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3599 fail_scrub_wr_completion_workers:
3600         btrfs_destroy_workqueue(fs_info->scrub_workers);
3601 fail_scrub_workers:
3602         return -ENOMEM;
3603 }
3604
3605 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3606 {
3607         if (--fs_info->scrub_workers_refcnt == 0) {
3608                 btrfs_destroy_workqueue(fs_info->scrub_workers);
3609                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3610                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3611                 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
3612         }
3613         WARN_ON(fs_info->scrub_workers_refcnt < 0);
3614 }
3615
3616 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3617                     u64 end, struct btrfs_scrub_progress *progress,
3618                     int readonly, int is_dev_replace)
3619 {
3620         struct scrub_ctx *sctx;
3621         int ret;
3622         struct btrfs_device *dev;
3623         struct rcu_string *name;
3624
3625         if (btrfs_fs_closing(fs_info))
3626                 return -EINVAL;
3627
3628         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
3629                 /*
3630                  * in this case scrub is unable to calculate the checksum
3631                  * the way scrub is implemented. Do not handle this
3632                  * situation at all because it won't ever happen.
3633                  */
3634                 btrfs_err(fs_info,
3635                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3636                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
3637                 return -EINVAL;
3638         }
3639
3640         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
3641                 /* not supported for data w/o checksums */
3642                 btrfs_err(fs_info,
3643                            "scrub: size assumption sectorsize != PAGE_SIZE "
3644                            "(%d != %lu) fails",
3645                        fs_info->chunk_root->sectorsize, PAGE_SIZE);
3646                 return -EINVAL;
3647         }
3648
3649         if (fs_info->chunk_root->nodesize >
3650             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3651             fs_info->chunk_root->sectorsize >
3652             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3653                 /*
3654                  * would exhaust the array bounds of pagev member in
3655                  * struct scrub_block
3656                  */
3657                 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3658                            "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3659                        fs_info->chunk_root->nodesize,
3660                        SCRUB_MAX_PAGES_PER_BLOCK,
3661                        fs_info->chunk_root->sectorsize,
3662                        SCRUB_MAX_PAGES_PER_BLOCK);
3663                 return -EINVAL;
3664         }
3665
3666
3667         mutex_lock(&fs_info->fs_devices->device_list_mutex);
3668         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3669         if (!dev || (dev->missing && !is_dev_replace)) {
3670                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3671                 return -ENODEV;
3672         }
3673
3674         if (!is_dev_replace && !readonly && !dev->writeable) {
3675                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3676                 rcu_read_lock();
3677                 name = rcu_dereference(dev->name);
3678                 btrfs_err(fs_info, "scrub: device %s is not writable",
3679                           name->str);
3680                 rcu_read_unlock();
3681                 return -EROFS;
3682         }
3683
3684         mutex_lock(&fs_info->scrub_lock);
3685         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3686                 mutex_unlock(&fs_info->scrub_lock);
3687                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3688                 return -EIO;
3689         }
3690
3691         btrfs_dev_replace_lock(&fs_info->dev_replace);
3692         if (dev->scrub_device ||
3693             (!is_dev_replace &&
3694              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3695                 btrfs_dev_replace_unlock(&fs_info->dev_replace);
3696                 mutex_unlock(&fs_info->scrub_lock);
3697                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3698                 return -EINPROGRESS;
3699         }
3700         btrfs_dev_replace_unlock(&fs_info->dev_replace);
3701
3702         ret = scrub_workers_get(fs_info, is_dev_replace);
3703         if (ret) {
3704                 mutex_unlock(&fs_info->scrub_lock);
3705                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3706                 return ret;
3707         }
3708
3709         sctx = scrub_setup_ctx(dev, is_dev_replace);
3710         if (IS_ERR(sctx)) {
3711                 mutex_unlock(&fs_info->scrub_lock);
3712                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3713                 scrub_workers_put(fs_info);
3714                 return PTR_ERR(sctx);
3715         }
3716         sctx->readonly = readonly;
3717         dev->scrub_device = sctx;
3718         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3719
3720         /*
3721          * checking @scrub_pause_req here, we can avoid
3722          * race between committing transaction and scrubbing.
3723          */
3724         __scrub_blocked_if_needed(fs_info);
3725         atomic_inc(&fs_info->scrubs_running);
3726         mutex_unlock(&fs_info->scrub_lock);
3727
3728         if (!is_dev_replace) {
3729                 /*
3730                  * by holding device list mutex, we can
3731                  * kick off writing super in log tree sync.
3732                  */
3733                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
3734                 ret = scrub_supers(sctx, dev);
3735                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3736         }
3737
3738         if (!ret)
3739                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
3740                                              is_dev_replace);
3741
3742         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3743         atomic_dec(&fs_info->scrubs_running);
3744         wake_up(&fs_info->scrub_pause_wait);
3745
3746         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3747
3748         if (progress)
3749                 memcpy(progress, &sctx->stat, sizeof(*progress));
3750
3751         mutex_lock(&fs_info->scrub_lock);
3752         dev->scrub_device = NULL;
3753         scrub_workers_put(fs_info);
3754         mutex_unlock(&fs_info->scrub_lock);
3755
3756         scrub_put_ctx(sctx);
3757
3758         return ret;
3759 }
3760
3761 void btrfs_scrub_pause(struct btrfs_root *root)
3762 {
3763         struct btrfs_fs_info *fs_info = root->fs_info;
3764
3765         mutex_lock(&fs_info->scrub_lock);
3766         atomic_inc(&fs_info->scrub_pause_req);
3767         while (atomic_read(&fs_info->scrubs_paused) !=
3768                atomic_read(&fs_info->scrubs_running)) {
3769                 mutex_unlock(&fs_info->scrub_lock);
3770                 wait_event(fs_info->scrub_pause_wait,
3771                            atomic_read(&fs_info->scrubs_paused) ==
3772                            atomic_read(&fs_info->scrubs_running));
3773                 mutex_lock(&fs_info->scrub_lock);
3774         }
3775         mutex_unlock(&fs_info->scrub_lock);
3776 }
3777
3778 void btrfs_scrub_continue(struct btrfs_root *root)
3779 {
3780         struct btrfs_fs_info *fs_info = root->fs_info;
3781
3782         atomic_dec(&fs_info->scrub_pause_req);
3783         wake_up(&fs_info->scrub_pause_wait);
3784 }
3785
3786 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3787 {
3788         mutex_lock(&fs_info->scrub_lock);
3789         if (!atomic_read(&fs_info->scrubs_running)) {
3790                 mutex_unlock(&fs_info->scrub_lock);
3791                 return -ENOTCONN;
3792         }
3793
3794         atomic_inc(&fs_info->scrub_cancel_req);
3795         while (atomic_read(&fs_info->scrubs_running)) {
3796                 mutex_unlock(&fs_info->scrub_lock);
3797                 wait_event(fs_info->scrub_pause_wait,
3798                            atomic_read(&fs_info->scrubs_running) == 0);
3799                 mutex_lock(&fs_info->scrub_lock);
3800         }
3801         atomic_dec(&fs_info->scrub_cancel_req);
3802         mutex_unlock(&fs_info->scrub_lock);
3803
3804         return 0;
3805 }
3806
3807 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3808                            struct btrfs_device *dev)
3809 {
3810         struct scrub_ctx *sctx;
3811
3812         mutex_lock(&fs_info->scrub_lock);
3813         sctx = dev->scrub_device;
3814         if (!sctx) {
3815                 mutex_unlock(&fs_info->scrub_lock);
3816                 return -ENOTCONN;
3817         }
3818         atomic_inc(&sctx->cancel_req);
3819         while (dev->scrub_device) {
3820                 mutex_unlock(&fs_info->scrub_lock);
3821                 wait_event(fs_info->scrub_pause_wait,
3822                            dev->scrub_device == NULL);
3823                 mutex_lock(&fs_info->scrub_lock);
3824         }
3825         mutex_unlock(&fs_info->scrub_lock);
3826
3827         return 0;
3828 }
3829
3830 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3831                          struct btrfs_scrub_progress *progress)
3832 {
3833         struct btrfs_device *dev;
3834         struct scrub_ctx *sctx = NULL;
3835
3836         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3837         dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3838         if (dev)
3839                 sctx = dev->scrub_device;
3840         if (sctx)
3841                 memcpy(progress, &sctx->stat, sizeof(*progress));
3842         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3843
3844         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3845 }
3846
3847 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3848                                u64 extent_logical, u64 extent_len,
3849                                u64 *extent_physical,
3850                                struct btrfs_device **extent_dev,
3851                                int *extent_mirror_num)
3852 {
3853         u64 mapped_length;
3854         struct btrfs_bio *bbio = NULL;
3855         int ret;
3856
3857         mapped_length = extent_len;
3858         ret = btrfs_map_block(fs_info, READ, extent_logical,
3859                               &mapped_length, &bbio, 0);
3860         if (ret || !bbio || mapped_length < extent_len ||
3861             !bbio->stripes[0].dev->bdev) {
3862                 btrfs_put_bbio(bbio);
3863                 return;
3864         }
3865
3866         *extent_physical = bbio->stripes[0].physical;
3867         *extent_mirror_num = bbio->mirror_num;
3868         *extent_dev = bbio->stripes[0].dev;
3869         btrfs_put_bbio(bbio);
3870 }
3871
3872 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3873                               struct scrub_wr_ctx *wr_ctx,
3874                               struct btrfs_fs_info *fs_info,
3875                               struct btrfs_device *dev,
3876                               int is_dev_replace)
3877 {
3878         WARN_ON(wr_ctx->wr_curr_bio != NULL);
3879
3880         mutex_init(&wr_ctx->wr_lock);
3881         wr_ctx->wr_curr_bio = NULL;
3882         if (!is_dev_replace)
3883                 return 0;
3884
3885         WARN_ON(!dev->bdev);
3886         wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
3887         wr_ctx->tgtdev = dev;
3888         atomic_set(&wr_ctx->flush_all_writes, 0);
3889         return 0;
3890 }
3891
3892 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3893 {
3894         mutex_lock(&wr_ctx->wr_lock);
3895         kfree(wr_ctx->wr_curr_bio);
3896         wr_ctx->wr_curr_bio = NULL;
3897         mutex_unlock(&wr_ctx->wr_lock);
3898 }
3899
3900 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3901                             int mirror_num, u64 physical_for_dev_replace)
3902 {
3903         struct scrub_copy_nocow_ctx *nocow_ctx;
3904         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3905
3906         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3907         if (!nocow_ctx) {
3908                 spin_lock(&sctx->stat_lock);
3909                 sctx->stat.malloc_errors++;
3910                 spin_unlock(&sctx->stat_lock);
3911                 return -ENOMEM;
3912         }
3913
3914         scrub_pending_trans_workers_inc(sctx);
3915
3916         nocow_ctx->sctx = sctx;
3917         nocow_ctx->logical = logical;
3918         nocow_ctx->len = len;
3919         nocow_ctx->mirror_num = mirror_num;
3920         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3921         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3922                         copy_nocow_pages_worker, NULL, NULL);
3923         INIT_LIST_HEAD(&nocow_ctx->inodes);
3924         btrfs_queue_work(fs_info->scrub_nocow_workers,
3925                          &nocow_ctx->work);
3926
3927         return 0;
3928 }
3929
3930 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3931 {
3932         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3933         struct scrub_nocow_inode *nocow_inode;
3934
3935         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3936         if (!nocow_inode)
3937                 return -ENOMEM;
3938         nocow_inode->inum = inum;
3939         nocow_inode->offset = offset;
3940         nocow_inode->root = root;
3941         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3942         return 0;
3943 }
3944
3945 #define COPY_COMPLETE 1
3946
3947 static void copy_nocow_pages_worker(struct btrfs_work *work)
3948 {
3949         struct scrub_copy_nocow_ctx *nocow_ctx =
3950                 container_of(work, struct scrub_copy_nocow_ctx, work);
3951         struct scrub_ctx *sctx = nocow_ctx->sctx;
3952         u64 logical = nocow_ctx->logical;
3953         u64 len = nocow_ctx->len;
3954         int mirror_num = nocow_ctx->mirror_num;
3955         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3956         int ret;
3957         struct btrfs_trans_handle *trans = NULL;
3958         struct btrfs_fs_info *fs_info;
3959         struct btrfs_path *path;
3960         struct btrfs_root *root;
3961         int not_written = 0;
3962
3963         fs_info = sctx->dev_root->fs_info;
3964         root = fs_info->extent_root;
3965
3966         path = btrfs_alloc_path();
3967         if (!path) {
3968                 spin_lock(&sctx->stat_lock);
3969                 sctx->stat.malloc_errors++;
3970                 spin_unlock(&sctx->stat_lock);
3971                 not_written = 1;
3972                 goto out;
3973         }
3974
3975         trans = btrfs_join_transaction(root);
3976         if (IS_ERR(trans)) {
3977                 not_written = 1;
3978                 goto out;
3979         }
3980
3981         ret = iterate_inodes_from_logical(logical, fs_info, path,
3982                                           record_inode_for_nocow, nocow_ctx);
3983         if (ret != 0 && ret != -ENOENT) {
3984                 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3985                         "phys %llu, len %llu, mir %u, ret %d",
3986                         logical, physical_for_dev_replace, len, mirror_num,
3987                         ret);
3988                 not_written = 1;
3989                 goto out;
3990         }
3991
3992         btrfs_end_transaction(trans, root);
3993         trans = NULL;
3994         while (!list_empty(&nocow_ctx->inodes)) {
3995                 struct scrub_nocow_inode *entry;
3996                 entry = list_first_entry(&nocow_ctx->inodes,
3997                                          struct scrub_nocow_inode,
3998                                          list);
3999                 list_del_init(&entry->list);
4000                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4001                                                  entry->root, nocow_ctx);
4002                 kfree(entry);
4003                 if (ret == COPY_COMPLETE) {
4004                         ret = 0;
4005                         break;
4006                 } else if (ret) {
4007                         break;
4008                 }
4009         }
4010 out:
4011         while (!list_empty(&nocow_ctx->inodes)) {
4012                 struct scrub_nocow_inode *entry;
4013                 entry = list_first_entry(&nocow_ctx->inodes,
4014                                          struct scrub_nocow_inode,
4015                                          list);
4016                 list_del_init(&entry->list);
4017                 kfree(entry);
4018         }
4019         if (trans && !IS_ERR(trans))
4020                 btrfs_end_transaction(trans, root);
4021         if (not_written)
4022                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4023                                             num_uncorrectable_read_errors);
4024
4025         btrfs_free_path(path);
4026         kfree(nocow_ctx);
4027
4028         scrub_pending_trans_workers_dec(sctx);
4029 }
4030
4031 static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4032                                  u64 logical)
4033 {
4034         struct extent_state *cached_state = NULL;
4035         struct btrfs_ordered_extent *ordered;
4036         struct extent_io_tree *io_tree;
4037         struct extent_map *em;
4038         u64 lockstart = start, lockend = start + len - 1;
4039         int ret = 0;
4040
4041         io_tree = &BTRFS_I(inode)->io_tree;
4042
4043         lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4044         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4045         if (ordered) {
4046                 btrfs_put_ordered_extent(ordered);
4047                 ret = 1;
4048                 goto out_unlock;
4049         }
4050
4051         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4052         if (IS_ERR(em)) {
4053                 ret = PTR_ERR(em);
4054                 goto out_unlock;
4055         }
4056
4057         /*
4058          * This extent does not actually cover the logical extent anymore,
4059          * move on to the next inode.
4060          */
4061         if (em->block_start > logical ||
4062             em->block_start + em->block_len < logical + len) {
4063                 free_extent_map(em);
4064                 ret = 1;
4065                 goto out_unlock;
4066         }
4067         free_extent_map(em);
4068
4069 out_unlock:
4070         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4071                              GFP_NOFS);
4072         return ret;
4073 }
4074
4075 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4076                                       struct scrub_copy_nocow_ctx *nocow_ctx)
4077 {
4078         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
4079         struct btrfs_key key;
4080         struct inode *inode;
4081         struct page *page;
4082         struct btrfs_root *local_root;
4083         struct extent_io_tree *io_tree;
4084         u64 physical_for_dev_replace;
4085         u64 nocow_ctx_logical;
4086         u64 len = nocow_ctx->len;
4087         unsigned long index;
4088         int srcu_index;
4089         int ret = 0;
4090         int err = 0;
4091
4092         key.objectid = root;
4093         key.type = BTRFS_ROOT_ITEM_KEY;
4094         key.offset = (u64)-1;
4095
4096         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4097
4098         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4099         if (IS_ERR(local_root)) {
4100                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4101                 return PTR_ERR(local_root);
4102         }
4103
4104         key.type = BTRFS_INODE_ITEM_KEY;
4105         key.objectid = inum;
4106         key.offset = 0;
4107         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4108         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4109         if (IS_ERR(inode))
4110                 return PTR_ERR(inode);
4111
4112         /* Avoid truncate/dio/punch hole.. */
4113         mutex_lock(&inode->i_mutex);
4114         inode_dio_wait(inode);
4115
4116         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4117         io_tree = &BTRFS_I(inode)->io_tree;
4118         nocow_ctx_logical = nocow_ctx->logical;
4119
4120         ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
4121         if (ret) {
4122                 ret = ret > 0 ? 0 : ret;
4123                 goto out;
4124         }
4125
4126         while (len >= PAGE_CACHE_SIZE) {
4127                 index = offset >> PAGE_CACHE_SHIFT;
4128 again:
4129                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4130                 if (!page) {
4131                         btrfs_err(fs_info, "find_or_create_page() failed");
4132                         ret = -ENOMEM;
4133                         goto out;
4134                 }
4135
4136                 if (PageUptodate(page)) {
4137                         if (PageDirty(page))
4138                                 goto next_page;
4139                 } else {
4140                         ClearPageError(page);
4141                         err = extent_read_full_page(io_tree, page,
4142                                                            btrfs_get_extent,
4143                                                            nocow_ctx->mirror_num);
4144                         if (err) {
4145                                 ret = err;
4146                                 goto next_page;
4147                         }
4148
4149                         lock_page(page);
4150                         /*
4151                          * If the page has been remove from the page cache,
4152                          * the data on it is meaningless, because it may be
4153                          * old one, the new data may be written into the new
4154                          * page in the page cache.
4155                          */
4156                         if (page->mapping != inode->i_mapping) {
4157                                 unlock_page(page);
4158                                 page_cache_release(page);
4159                                 goto again;
4160                         }
4161                         if (!PageUptodate(page)) {
4162                                 ret = -EIO;
4163                                 goto next_page;
4164                         }
4165                 }
4166
4167                 ret = check_extent_to_block(inode, offset, len,
4168                                             nocow_ctx_logical);
4169                 if (ret) {
4170                         ret = ret > 0 ? 0 : ret;
4171                         goto next_page;
4172                 }
4173
4174                 err = write_page_nocow(nocow_ctx->sctx,
4175                                        physical_for_dev_replace, page);
4176                 if (err)
4177                         ret = err;
4178 next_page:
4179                 unlock_page(page);
4180                 page_cache_release(page);
4181
4182                 if (ret)
4183                         break;
4184
4185                 offset += PAGE_CACHE_SIZE;
4186                 physical_for_dev_replace += PAGE_CACHE_SIZE;
4187                 nocow_ctx_logical += PAGE_CACHE_SIZE;
4188                 len -= PAGE_CACHE_SIZE;
4189         }
4190         ret = COPY_COMPLETE;
4191 out:
4192         mutex_unlock(&inode->i_mutex);
4193         iput(inode);
4194         return ret;
4195 }
4196
4197 static int write_page_nocow(struct scrub_ctx *sctx,
4198                             u64 physical_for_dev_replace, struct page *page)
4199 {
4200         struct bio *bio;
4201         struct btrfs_device *dev;
4202         int ret;
4203
4204         dev = sctx->wr_ctx.tgtdev;
4205         if (!dev)
4206                 return -EIO;
4207         if (!dev->bdev) {
4208                 printk_ratelimited(KERN_WARNING
4209                         "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
4210                 return -EIO;
4211         }
4212         bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
4213         if (!bio) {
4214                 spin_lock(&sctx->stat_lock);
4215                 sctx->stat.malloc_errors++;
4216                 spin_unlock(&sctx->stat_lock);
4217                 return -ENOMEM;
4218         }
4219         bio->bi_iter.bi_size = 0;
4220         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4221         bio->bi_bdev = dev->bdev;
4222         ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
4223         if (ret != PAGE_CACHE_SIZE) {
4224 leave_with_eio:
4225                 bio_put(bio);
4226                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4227                 return -EIO;
4228         }
4229
4230         if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
4231                 goto leave_with_eio;
4232
4233         bio_put(bio);
4234         return 0;
4235 }