md/raid10: avoid reading from known bad blocks - part 1
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f1b749c217179bbccb8b3c8e65c784a714a85165..872bf948f33a5e442a2ed7e2f575ec4cba1ce4cb 100644
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
 {
        conf_t *conf = r10_bio->mddev->private;
 
-       /*
-        * Wake up any possible resync thread that waits for the device
-        * to go idle.
-        */
-       allow_barrier(conf);
-
        put_all_bios(conf, r10_bio);
        mempool_free(r10_bio, conf->r10bio_pool);
 }
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
 static void raid_end_bio_io(r10bio_t *r10_bio)
 {
        struct bio *bio = r10_bio->master_bio;
+       int done;
+       conf_t *conf = r10_bio->mddev->private;
 
-       bio_endio(bio,
-               test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
+       if (bio->bi_phys_segments) {
+               unsigned long flags;
+               spin_lock_irqsave(&conf->device_lock, flags);
+               bio->bi_phys_segments--;
+               done = (bio->bi_phys_segments == 0);
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+       } else
+               done = 1;
+       if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+       if (done) {
+               bio_endio(bio, 0);
+               /*
+                * Wake up any possible resync thread that waits for the device
+                * to go idle.
+                */
+               allow_barrier(conf);
+       }
        free_r10bio(r10_bio);
 }
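
The two hunks above move final completion out of free_r10bio() and into raid_end_bio_io(): the master bio now carries a count of outstanding sub-reads in bi_phys_segments, only the last completion ends the bio and lifts the barrier, and a failure in any part clears the bio's up-to-date flag. Below is a minimal userspace sketch of that counting pattern; the names (master_req, req_part_done) are illustrative, not kernel API, and the kernel reuses bio->bi_phys_segments as the counter with an atomic clear_bit() for the flag.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct master_req {
	pthread_mutex_t lock;
	int pending;    /* 0 means "never split": no locking needed */
	bool uptodate;  /* one failed part fails the whole request */
};

/* Complete one part; return true when the whole request is finished. */
static bool req_part_done(struct master_req *r, bool part_ok)
{
	bool done;

	if (!part_ok)
		r->uptodate = false;  /* the kernel uses atomic clear_bit() */
	pthread_mutex_lock(&r->lock);
	if (r->pending)
		done = (--r->pending == 0);
	else
		done = true;          /* single-part request */
	pthread_mutex_unlock(&r->lock);
	return done;
}

int main(void)
{
	struct master_req r = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.pending = 2,         /* request was split into two reads */
		.uptodate = true,
	};

	printf("first part done? %d\n", req_part_done(&r, true));   /* 0 */
	printf("second part done? %d, ok=%d\n",
	       req_part_done(&r, true), (int)r.uptodate);           /* 1, 1 */
	return 0;
}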
 
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
                                   mdname(conf->mddev),
                                   bdevname(conf->mirrors[dev].rdev->bdev, b),
                                   (unsigned long long)r10_bio->sector);
+               set_bit(R10BIO_ReadError, &r10_bio->state);
                reschedule_retry(r10_bio);
        }
 }
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(conf_t *conf, r10bio_t *r10_bio)
+static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
 {
        const sector_t this_sector = r10_bio->sector;
        int disk, slot;
-       const int sectors = r10_bio->sectors;
+       int sectors = r10_bio->sectors;
+       int best_good_sectors;
        sector_t new_distance, best_dist;
        mdk_rdev_t *rdev;
        int do_balance;
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
        raid10_find_phys(conf, r10_bio);
        rcu_read_lock();
 retry:
+       sectors = r10_bio->sectors;
        best_slot = -1;
        best_dist = MaxSector;
+       best_good_sectors = 0;
        do_balance = 1;
        /*
         * Check if we can balance. We can balance on the whole
@@ -532,6 +548,10 @@ retry:
                do_balance = 0;
 
        for (slot = 0; slot < conf->copies ; slot++) {
+               sector_t first_bad;
+               int bad_sectors;
+               sector_t dev_sector;
+
                if (r10_bio->devs[slot].bio == IO_BLOCKED)
                        continue;
                disk = r10_bio->devs[slot].devnum;
@@ -541,6 +561,37 @@ retry:
                if (!test_bit(In_sync, &rdev->flags))
                        continue;
 
+               dev_sector = r10_bio->devs[slot].addr;
+               if (is_badblock(rdev, dev_sector, sectors,
+                               &first_bad, &bad_sectors)) {
+                       if (best_dist < MaxSector)
+                               /* Already have a better slot */
+                               continue;
+                       if (first_bad <= dev_sector) {
+                               /* Cannot read here.  If this is the
+                                * 'primary' device, then we must not read
+                                * beyond 'bad_sectors' from another device.
+                                */
+                               bad_sectors -= (dev_sector - first_bad);
+                               if (!do_balance && sectors > bad_sectors)
+                                       sectors = bad_sectors;
+                               if (best_good_sectors > sectors)
+                                       best_good_sectors = sectors;
+                       } else {
+                               sector_t good_sectors =
+                                       first_bad - dev_sector;
+                               if (good_sectors > best_good_sectors) {
+                                       best_good_sectors = good_sectors;
+                                       best_slot = slot;
+                               }
+                               if (!do_balance)
+                                       /* Must read from here */
+                                       break;
+                       }
+                       continue;
+               } else
+                       best_good_sectors = sectors;
+
                if (!do_balance)
                        break;
 
@@ -582,6 +633,7 @@ retry:
        } else
                disk = -1;
        rcu_read_unlock();
+       *max_sectors = best_good_sectors;
 
        return disk;
 }
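
The new check above clips each candidate device to the sectors readable before its first bad block: a bad range starting at or before the request start rules the device out of balanced reads (and, on the "primary" device, caps the whole request at bad_sectors), while a bad range starting inside the request leaves only the leading good sectors usable. A standalone sketch of that clipping arithmetic follows; good_sectors_before_bad() is a hypothetical helper, whereas the kernel's is_badblock() reports the overlapping range instead.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* How many leading sectors of [dev_sector, dev_sector + sectors) lie
 * before the bad range [first_bad, first_bad + bad_sectors)?  Overlap
 * is assumed, as is_badblock() only reports ranges that intersect.
 */
static int good_sectors_before_bad(sector_t dev_sector, int sectors,
				   sector_t first_bad, int bad_sectors)
{
	(void)bad_sectors;        /* length only matters in the caller */
	if (first_bad <= dev_sector)
		return 0;         /* bad range covers the start */
	if (first_bad >= dev_sector + sectors)
		return sectors;   /* bad range is past the request */
	return (int)(first_bad - dev_sector);
}

int main(void)
{
	/* 8-sector read at 100 with bad blocks from 104: 4 good sectors. */
	printf("%d\n", good_sectors_before_bad(100, 8, 104, 2));
	/* Bad range 96..103 covers the start: nothing readable here. */
	printf("%d\n", good_sectors_before_bad(100, 8, 96, 8));
	return 0;
}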
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        r10_bio->sector = bio->bi_sector;
        r10_bio->state = 0;
 
+       /* We might need to issue multiple reads to different
+        * devices if there are bad blocks around, so we keep
+        * track of the number of reads in bio->bi_phys_segments.
+        * If this is 0, there is only one r10_bio and no locking
+        * will be needed when the request completes.  If it is
+        * non-zero, then it is the number of not-completed requests.
+        */
+       bio->bi_phys_segments = 0;
+       clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
        if (rw == READ) {
                /*
                 * read balancing logic:
                 */
-               int disk = read_balance(conf, r10_bio);
-               int slot = r10_bio->read_slot;
+               int max_sectors;
+               int disk;
+               int slot;
+
+read_again:
+               disk = read_balance(conf, r10_bio, &max_sectors);
+               slot = r10_bio->read_slot;
                if (disk < 0) {
                        raid_end_bio_io(r10_bio);
                        return 0;
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                mirror = conf->mirrors + disk;
 
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+               md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
+                           max_sectors);
 
                r10_bio->devs[slot].bio = read_bio;
 
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                read_bio->bi_rw = READ | do_sync;
                read_bio->bi_private = r10_bio;
 
-               generic_make_request(read_bio);
+               if (max_sectors < r10_bio->sectors) {
+                       /* Could not read all from this device, so we will
+                        * need another r10_bio.
+                        */
+                       int sectors_handled;
+
+                       sectors_handled = (r10_bio->sector + max_sectors
+                                          - bio->bi_sector);
+                       r10_bio->sectors = max_sectors;
+                       spin_lock_irq(&conf->device_lock);
+                       if (bio->bi_phys_segments == 0)
+                               bio->bi_phys_segments = 2;
+                       else
+                               bio->bi_phys_segments++;
+                       spin_unlock_irq(&conf->device_lock);
+                       /* Cannot call generic_make_request directly
+                        * as that will be queued in __generic_make_request
+                        * and subsequent mempool_alloc might block
+                        * waiting for it, so hand the bio over to raid10d.
+                        */
+                       reschedule_retry(r10_bio);
+
+                       r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+                       r10_bio->master_bio = bio;
+                       r10_bio->sectors = ((bio->bi_size >> 9)
+                                           - sectors_handled);
+                       r10_bio->state = 0;
+                       r10_bio->mddev = mddev;
+                       r10_bio->sector = bio->bi_sector + sectors_handled;
+                       goto read_again;
+               } else
+                       generic_make_request(read_bio);
                return 0;
        }
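
When read_balance() can serve only part of the request, the code above trims the clone with md_trim_bio(), bumps bi_phys_segments (0 becomes 2: the trimmed read plus the remainder), hands the trimmed read to raid10d to avoid blocking inside a nested generic_make_request(), and loops back to read_again with a fresh r10_bio for the rest. A userspace walk-through of the sectors_handled arithmetic, where choose_max() is a purely illustrative stand-in for read_balance():

#include <stdio.h>

/* Pretend a bad block limits the first leg to 4 sectors. */
static int choose_max(unsigned long long sector, int sectors)
{
	(void)sector;
	return sectors > 4 ? 4 : sectors;
}

int main(void)
{
	const unsigned long long start = 100;  /* bio->bi_sector */
	unsigned long long sector = start;     /* r10_bio->sector */
	const int total = 10;                  /* bio size in sectors */
	int remaining = total;

	while (remaining > 0) {
		int max = choose_max(sector, remaining);
		/* Sectors handled so far, measured from the master bio:
		 * the sum uses r10_bio->sector, a position, not
		 * r10_bio->sectors, a length.
		 */
		int handled = (int)(sector + max - start);

		printf("sub-read at %llu for %d sectors (handled=%d)\n",
		       sector, max, handled);
		sector = start + handled;      /* next r10_bio->sector */
		remaining = total - handled;   /* next r10_bio->sectors */
	}
	return 0;
}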
 
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
        mdk_rdev_t *rdev;
        char b[BDEVNAME_SIZE];
        unsigned long do_sync;
+       int max_sectors;
 
        /* we got a read error. Maybe the drive is bad.  Maybe just
         * the block and we can fix it.
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
        bio = r10_bio->devs[slot].bio;
        r10_bio->devs[slot].bio =
                mddev->ro ? IO_BLOCKED : NULL;
-       mirror = read_balance(conf, r10_bio);
-       if (mirror == -1) {
+       mirror = read_balance(conf, r10_bio, &max_sectors);
+       if (mirror == -1 || max_sectors < r10_bio->sectors) {
                printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
                       " read error for block %llu\n",
                       mdname(mddev),
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev)
                        sync_request_write(mddev, r10_bio);
                else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                        recovery_request_write(mddev, r10_bio);
-               else
+               else if (test_bit(R10BIO_ReadError, &r10_bio->state))
                        handle_read_error(mddev, r10_bio);
+               else {
+                       /* just a partial read to be scheduled from a
+                        * separate context
+                        */
+                       int slot = r10_bio->read_slot;
+                       generic_make_request(r10_bio->devs[slot].bio);
+               }
 
                cond_resched();
                if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
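
Because partially-handled reads are now queued to raid10d through reschedule_retry(), "neither sync nor recovery" no longer implies a read error, hence the new R10BIO_ReadError bit. A toy dispatch mirroring the branch order above (state names illustrative):

#include <stdio.h>

enum { IS_SYNC = 1, IS_RECOVER = 2, READ_ERROR = 4 };

static const char *dispatch(unsigned int state)
{
	if (state & IS_SYNC)
		return "sync_request_write";
	if (state & IS_RECOVER)
		return "recovery_request_write";
	if (state & READ_ERROR)
		return "handle_read_error";
	return "generic_make_request";  /* deferred partial read */
}

int main(void)
{
	printf("%s\n", dispatch(READ_ERROR));  /* handle_read_error */
	printf("%s\n", dispatch(0));           /* generic_make_request */
	return 0;
}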