md/raid10: avoid reading from known bad blocks - part 1
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f1b749c217179bbccb8b3c8e65c784a714a85165..872bf948f33a5e442a2ed7e2f575ec4cba1ce4cb 100644
@@ -191,12 +191,6 @@ static void free_r10bio(r10bio_t *r10_bio)
 {
        conf_t *conf = r10_bio->mddev->private;
 
-       /*
-        * Wake up any possible resync thread that waits for the device
-        * to go idle.
-        */
-       allow_barrier(conf);
-
        put_all_bios(conf, r10_bio);
        mempool_free(r10_bio, conf->r10bio_pool);
 }
@@ -235,9 +229,27 @@ static void reschedule_retry(r10bio_t *r10_bio)
 static void raid_end_bio_io(r10bio_t *r10_bio)
 {
        struct bio *bio = r10_bio->master_bio;
+       int done;
+       conf_t *conf = r10_bio->mddev->private;
 
-       bio_endio(bio,
-               test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
+       if (bio->bi_phys_segments) {
+               unsigned long flags;
+               spin_lock_irqsave(&conf->device_lock, flags);
+               bio->bi_phys_segments--;
+               done = (bio->bi_phys_segments == 0);
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+       } else
+               done = 1;
+       if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+       if (done) {
+               bio_endio(bio, 0);
+               /*
+                * Wake up any possible resync thread that waits for the device
+                * to go idle.
+                */
+               allow_barrier(conf);
+       }
        free_r10bio(r10_bio);
 }
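
The two hunks above move final completion out of free_r10bio() and into raid_end_bio_io(): the master bio now carries a count of outstanding sub-reads in bi_phys_segments, only the last completion ends the bio and lifts the barrier, and a failure in any part clears the bio's up-to-date flag. Below is a minimal userspace sketch of that counting pattern; the names (master_req, req_part_done) are illustrative, not kernel API, and the kernel reuses bio->bi_phys_segments as the counter with an atomic clear_bit() for the flag.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct master_req {
	pthread_mutex_t lock;
	int pending;    /* 0 means "never split": no locking needed */
	bool uptodate;  /* one failed part fails the whole request */
};

/* Complete one part; return true when the whole request is finished. */
static bool req_part_done(struct master_req *r, bool part_ok)
{
	bool done;

	if (!part_ok)
		r->uptodate = false;  /* the kernel uses atomic clear_bit() */
	pthread_mutex_lock(&r->lock);
	if (r->pending)
		done = (--r->pending == 0);
	else
		done = true;          /* single-part request */
	pthread_mutex_unlock(&r->lock);
	return done;
}

int main(void)
{
	struct master_req r = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.pending = 2,         /* request was split into two reads */
		.uptodate = true,
	};

	printf("first part done? %d\n", req_part_done(&r, true));   /* 0 */
	printf("second part done? %d, ok=%d\n",
	       req_part_done(&r, true), (int)r.uptodate);           /* 1, 1 */
	return 0;
}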
 
@@ -307,6 +319,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
                                   mdname(conf->mddev),
                                   bdevname(conf->mirrors[dev].rdev->bdev, b),
                                   (unsigned long long)r10_bio->sector);
+               set_bit(R10BIO_ReadError, &r10_bio->state);
                reschedule_retry(r10_bio);
        }
 }
@@ -505,11 +518,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
-static int read_balance(conf_t *conf, r10bio_t *r10_bio)
+static int read_balance(conf_t *conf, r10bio_t *r10_bio, int *max_sectors)
 {
        const sector_t this_sector = r10_bio->sector;
        int disk, slot;
-       const int sectors = r10_bio->sectors;
+       int sectors = r10_bio->sectors;
+       int best_good_sectors;
        sector_t new_distance, best_dist;
        mdk_rdev_t *rdev;
        int do_balance;
@@ -518,8 +532,10 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
        raid10_find_phys(conf, r10_bio);
        rcu_read_lock();
 retry:
+       sectors = r10_bio->sectors;
        best_slot = -1;
        best_dist = MaxSector;
+       best_good_sectors = 0;
        do_balance = 1;
        /*
         * Check if we can balance. We can balance on the whole
@@ -532,6 +548,10 @@ retry:
                do_balance = 0;
 
        for (slot = 0; slot < conf->copies ; slot++) {
+               sector_t first_bad;
+               int bad_sectors;
+               sector_t dev_sector;
+
                if (r10_bio->devs[slot].bio == IO_BLOCKED)
                        continue;
                disk = r10_bio->devs[slot].devnum;
@@ -541,6 +561,37 @@ retry:
                if (!test_bit(In_sync, &rdev->flags))
                        continue;
 
+               dev_sector = r10_bio->devs[slot].addr;
+               if (is_badblock(rdev, dev_sector, sectors,
+                               &first_bad, &bad_sectors)) {
+                       if (best_dist < MaxSector)
+                               /* Already have a better slot */
+                               continue;
+                       if (first_bad <= dev_sector) {
+                               /* Cannot read here.  If this is the
+                                * 'primary' device, then we must not read
+                                * beyond 'bad_sectors' from another device.
+                                */
+                               bad_sectors -= (dev_sector - first_bad);
+                               if (!do_balance && sectors > bad_sectors)
+                                       sectors = bad_sectors;
+                               if (best_good_sectors > sectors)
+                                       best_good_sectors = sectors;
+                       } else {
+                               sector_t good_sectors =
+                                       first_bad - dev_sector;
+                               if (good_sectors > best_good_sectors) {
+                                       best_good_sectors = good_sectors;
+                                       best_slot = slot;
+                               }
+                               if (!do_balance)
+                                       /* Must read from here */
+                                       break;
+                       }
+                       continue;
+               } else
+                       best_good_sectors = sectors;
+
                if (!do_balance)
                        break;
 
@@ -582,6 +633,7 @@ retry:
        } else
                disk = -1;
        rcu_read_unlock();
+       *max_sectors = best_good_sectors;
 
        return disk;
 }
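
The new check above clips each candidate device to the sectors readable before its first bad block: a bad range starting at or before the request start rules the device out of balanced reads (and, on the "primary" device, caps the whole request at bad_sectors), while a bad range starting inside the request leaves only the leading good sectors usable. A standalone sketch of that clipping arithmetic follows; good_sectors_before_bad() is a hypothetical helper, whereas the kernel's is_badblock() reports the overlapping range instead.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* How many leading sectors of [dev_sector, dev_sector + sectors) lie
 * before the bad range [first_bad, first_bad + bad_sectors)?  Overlap
 * is assumed, as is_badblock() only reports ranges that intersect.
 */
static int good_sectors_before_bad(sector_t dev_sector, int sectors,
				   sector_t first_bad, int bad_sectors)
{
	(void)bad_sectors;        /* length only matters in the caller */
	if (first_bad <= dev_sector)
		return 0;         /* bad range covers the start */
	if (first_bad >= dev_sector + sectors)
		return sectors;   /* bad range is past the request */
	return (int)(first_bad - dev_sector);
}

int main(void)
{
	/* 8-sector read at 100 with bad blocks from 104: 4 good sectors. */
	printf("%d\n", good_sectors_before_bad(100, 8, 104, 2));
	/* Bad range 96..103 covers the start: nothing readable here. */
	printf("%d\n", good_sectors_before_bad(100, 8, 96, 8));
	return 0;
}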
@@ -829,12 +881,27 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        r10_bio->sector = bio->bi_sector;
        r10_bio->state = 0;
 
+       /* We might need to issue multiple reads to different
+        * devices if there are bad blocks around, so we keep
+        * track of the number of reads in bio->bi_phys_segments.
+        * If this is 0, there is only one r10_bio and no locking
+        * will be needed when the request completes.  If it is
+        * non-zero, then it is the number of not-completed requests.
+        */
+       bio->bi_phys_segments = 0;
+       clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
        if (rw == READ) {
                /*
                 * read balancing logic:
                 */
-               int disk = read_balance(conf, r10_bio);
-               int slot = r10_bio->read_slot;
+               int max_sectors;
+               int disk;
+               int slot;
+
+read_again:
+               disk = read_balance(conf, r10_bio, &max_sectors);
+               slot = r10_bio->read_slot;
                if (disk < 0) {
                        raid_end_bio_io(r10_bio);
                        return 0;
@@ -842,6 +909,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                mirror = conf->mirrors + disk;
 
                read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+               md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
+                           max_sectors);
 
                r10_bio->devs[slot].bio = read_bio;
 
@@ -852,7 +921,39 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                read_bio->bi_rw = READ | do_sync;
                read_bio->bi_private = r10_bio;
 
-               generic_make_request(read_bio);
+               if (max_sectors < r10_bio->sectors) {
+                       /* Could not read all from this device, so we will
+                        * need another r10_bio.
+                        */
+                       int sectors_handled;
+
+                       sectors_handled = (r10_bio->sector + max_sectors
+                                          - bio->bi_sector);
+                       r10_bio->sectors = max_sectors;
+                       spin_lock_irq(&conf->device_lock);
+                       if (bio->bi_phys_segments == 0)
+                               bio->bi_phys_segments = 2;
+                       else
+                               bio->bi_phys_segments++;
+                       spin_unlock_irq(&conf->device_lock);
+                       /* Cannot call generic_make_request directly
+                        * as that will be queued in __generic_make_request
+                        * and subsequent mempool_alloc might block
+                        * waiting for it, so hand the bio over to raid10d.
+                        */
+                       reschedule_retry(r10_bio);
+
+                       r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+                       r10_bio->master_bio = bio;
+                       r10_bio->sectors = ((bio->bi_size >> 9)
+                                           - sectors_handled);
+                       r10_bio->state = 0;
+                       r10_bio->mddev = mddev;
+                       r10_bio->sector = bio->bi_sector + sectors_handled;
+                       goto read_again;
+               } else
+                       generic_make_request(read_bio);
                return 0;
        }
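
When read_balance() can serve only part of the request, the code above trims the clone with md_trim_bio(), bumps bi_phys_segments (0 becomes 2: the trimmed read plus the remainder), hands the trimmed read to raid10d to avoid blocking inside a nested generic_make_request(), and loops back to read_again with a fresh r10_bio for the rest. A userspace walk-through of the sectors_handled arithmetic, where choose_max() is a purely illustrative stand-in for read_balance():

#include <stdio.h>

/* Pretend a bad block limits the first leg to 4 sectors. */
static int choose_max(unsigned long long sector, int sectors)
{
	(void)sector;
	return sectors > 4 ? 4 : sectors;
}

int main(void)
{
	const unsigned long long start = 100;  /* bio->bi_sector */
	unsigned long long sector = start;     /* r10_bio->sector */
	const int total = 10;                  /* bio size in sectors */
	int remaining = total;

	while (remaining > 0) {
		int max = choose_max(sector, remaining);
		/* Sectors handled so far, measured from the master bio:
		 * the sum uses r10_bio->sector, a position, not
		 * r10_bio->sectors, a length.
		 */
		int handled = (int)(sector + max - start);

		printf("sub-read at %llu for %d sectors (handled=%d)\n",
		       sector, max, handled);
		sector = start + handled;      /* next r10_bio->sector */
		remaining = total - handled;   /* next r10_bio->sectors */
	}
	return 0;
}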
 
@@ -1627,6 +1728,7 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
        mdk_rdev_t *rdev;
        char b[BDEVNAME_SIZE];
        unsigned long do_sync;
+       int max_sectors;
 
        /* we got a read error. Maybe the drive is bad.  Maybe just
         * the block and we can fix it.
@@ -1646,8 +1748,8 @@ static void handle_read_error(mddev_t *mddev, r10bio_t *r10_bio)
        bio = r10_bio->devs[slot].bio;
        r10_bio->devs[slot].bio =
                mddev->ro ? IO_BLOCKED : NULL;
-       mirror = read_balance(conf, r10_bio);
-       if (mirror == -1) {
+       mirror = read_balance(conf, r10_bio, &max_sectors);
+       if (mirror == -1 || max_sectors < r10_bio->sectors) {
                printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
                       " read error for block %llu\n",
                       mdname(mddev),
@@ -1712,8 +1814,15 @@ static void raid10d(mddev_t *mddev)
                        sync_request_write(mddev, r10_bio);
                else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
                        recovery_request_write(mddev, r10_bio);
-               else
+               else if (test_bit(R10BIO_ReadError, &r10_bio->state))
                        handle_read_error(mddev, r10_bio);
+               else {
+                       /* just a partial read to be scheduled from a
+                        * separate context
+                        */
+                       int slot = r10_bio->read_slot;
+                       generic_make_request(r10_bio->devs[slot].bio);
+               }
 
                cond_resched();
                if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
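
Because partially-handled reads are now queued to raid10d through reschedule_retry(), "neither sync nor recovery" no longer implies a read error, hence the new R10BIO_ReadError bit. A toy dispatch mirroring the branch order above (state names illustrative):

#include <stdio.h>

enum { IS_SYNC = 1, IS_RECOVER = 2, READ_ERROR = 4 };

static const char *dispatch(unsigned int state)
{
	if (state & IS_SYNC)
		return "sync_request_write";
	if (state & IS_RECOVER)
		return "recovery_request_write";
	if (state & READ_ERROR)
		return "handle_read_error";
	return "generic_make_request";  /* deferred partial read */
}

int main(void)
{
	printf("%s\n", dispatch(READ_ERROR));  /* handle_read_error */
	printf("%s\n", dispatch(0));           /* generic_make_request */
	return 0;
}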