diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c | 194 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 22 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-io.c | 23 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 45 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 25 | ||||
-rw-r--r-- | drivers/md/faulty.c | 2 | ||||
-rw-r--r-- | drivers/md/linear.c | 32 | ||||
-rw-r--r-- | drivers/md/md.c | 145 | ||||
-rw-r--r-- | drivers/md/md.h | 13 | ||||
-rw-r--r-- | drivers/md/multipath.c | 2 | ||||
-rw-r--r-- | drivers/md/raid0.c | 164 | ||||
-rw-r--r-- | drivers/md/raid0.h | 11 | ||||
-rw-r--r-- | drivers/md/raid1.c | 100 | ||||
-rw-r--r-- | drivers/md/raid10.c | 225 | ||||
-rw-r--r-- | drivers/md/raid5.c | 25 |
19 files changed, 605 insertions, 436 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index cdf36b1e9aa..3d0dfa7a89a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -26,6 +26,7 @@ #include <linux/file.h> #include <linux/mount.h> #include <linux/buffer_head.h> +#include <linux/seq_file.h> #include "md.h" #include "bitmap.h" @@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap) } /* - * just a placeholder - calls kmalloc for bitmap pages - */ -static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) -{ - unsigned char *page; - - page = kzalloc(PAGE_SIZE, GFP_NOIO); - if (!page) - printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); - else - pr_debug("%s: bitmap_alloc_page: allocated page at %p\n", - bmname(bitmap), page); - return page; -} - -/* - * for now just a placeholder -- just calls kfree for bitmap pages - */ -static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) -{ - pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page); - kfree(page); -} - -/* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * * 1) check to see if this page is allocated, if it's not then try to alloc @@ -96,7 +72,7 @@ __acquires(bitmap->lock) /* this page has not been allocated yet */ spin_unlock_irq(&bitmap->lock); - mappage = bitmap_alloc_page(bitmap); + mappage = kzalloc(PAGE_SIZE, GFP_NOIO); spin_lock_irq(&bitmap->lock); if (mappage == NULL) { @@ -109,7 +85,7 @@ __acquires(bitmap->lock) } else if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */ - bitmap_free_page(bitmap, mappage); + kfree(mappage); return 0; } else { @@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) ptr = bitmap->bp[page].map; bitmap->bp[page].map = NULL; bitmap->missing_pages++; - bitmap_free_page(bitmap, ptr); + kfree(ptr); } } @@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, did_alloc = 1; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (! test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; @@ -445,19 +421,14 @@ out: void bitmap_update_sb(struct bitmap *bitmap) { bitmap_super_t *sb; - unsigned long flags; if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; if (bitmap->mddev->bitmap_info.external) return; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->sb_page) { /* no superblock */ - spin_unlock_irqrestore(&bitmap->lock, flags); + if (!bitmap->sb_page) /* no superblock */ return; - } - spin_unlock_irqrestore(&bitmap->lock, flags); - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ @@ -467,7 +438,7 @@ void bitmap_update_sb(struct bitmap *bitmap) /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); write_page(bitmap, bitmap->sb_page, 1); } @@ -478,7 +449,7 @@ void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->sb_page) return; - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -497,7 +468,7 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); } /* @@ -525,7 +496,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) } bitmap->sb_page->index = 0; - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->version = cpu_to_le32(BITMAP_MAJOR_HI); @@ -533,7 +504,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) chunksize = bitmap->mddev->bitmap_info.chunksize; BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) { - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); printk(KERN_ERR "bitmap chunksize not a power of 2\n"); return -EINVAL; } @@ -571,7 +542,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) bitmap->flags |= BITMAP_HOSTENDIAN; sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); return 0; } @@ -603,7 +574,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) return err; } - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap) /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - if (!bitmap->mddev->persistent) - goto success; - - /* - * if we have a persistent array superblock, compare the - * bitmap's UUID and event counter to the mddev's - */ - if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { - printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", - bmname(bitmap)); - goto out; - } - events = le64_to_cpu(sb->events); - if (events < bitmap->mddev->events) { - printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " - "-- forcing full recovery\n", bmname(bitmap), events, - (unsigned long long) bitmap->mddev->events); - sb->state |= cpu_to_le32(BITMAP_STALE); + if (bitmap->mddev->persistent) { + /* + * We have a persistent array superblock, so compare the + * bitmap's UUID and event counter to the mddev's + */ + if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { + printk(KERN_INFO + "%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); + goto out; + } + events = le64_to_cpu(sb->events); + if (events < bitmap->mddev->events) { + printk(KERN_INFO + "%s: bitmap file is out of date (%llu < %llu) " + "-- forcing full recovery\n", + bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); + sb->state |= cpu_to_le32(BITMAP_STALE); + } } -success: + /* assign fields using values from superblock */ bitmap->mddev->bitmap_info.chunksize = chunksize; bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; @@ -664,7 +637,7 @@ success: bitmap->events_cleared = bitmap->mddev->events; err = 0; out: - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); if (err) bitmap_print_sb(bitmap); return err; @@ -680,16 +653,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, enum bitmap_mask_op op) { bitmap_super_t *sb; - unsigned long flags; int old; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->sb_page) { /* can't set the state */ - spin_unlock_irqrestore(&bitmap->lock, flags); + if (!bitmap->sb_page) /* can't set the state */ return 0; - } - spin_unlock_irqrestore(&bitmap->lock, flags); - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); old = le32_to_cpu(sb->state) & bits; switch (op) { case MASK_SET: @@ -703,7 +671,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, default: BUG(); } - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); return old; } @@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) unsigned long bit; struct page *page; void *kaddr; - unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); + unsigned long chunk = block >> bitmap->chunkshift; if (!bitmap->filemap) return; @@ -881,12 +849,12 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) bit = file_page_offset(bitmap, chunk); /* set the bit */ - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (bitmap->flags & BITMAP_HOSTENDIAN) set_bit(bit, kaddr); else __set_bit_le(bit, kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); @@ -1050,10 +1018,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) * if bitmap is out of date, dirty the * whole page and write it out */ - paddr = kmap_atomic(page, KM_USER0); + paddr = kmap_atomic(page); memset(paddr + offset, 0xff, PAGE_SIZE - offset); - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr); write_page(bitmap, page, 1); ret = -EIO; @@ -1061,18 +1029,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) goto err; } } - paddr = kmap_atomic(page, KM_USER0); + paddr = kmap_atomic(page); if (bitmap->flags & BITMAP_HOSTENDIAN) b = test_bit(bit, paddr); else b = test_bit_le(bit, paddr); - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr); if (b) { /* if the disk bit is set, set the memory bit */ - int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) + int needed = ((sector_t)(i+1) << bitmap->chunkshift >= start); bitmap_set_memory_bits(bitmap, - (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), + (sector_t)i << bitmap->chunkshift, needed); bit_cnt++; } @@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap) static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) { - sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; bitmap_checkfree(bitmap, page); @@ -1209,10 +1177,10 @@ void bitmap_daemon_work(struct mddev *mddev) mddev->bitmap_info.external == 0) { bitmap_super_t *sb; bitmap->need_sync = 0; - sb = kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->sb_page); sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); write_page(bitmap, bitmap->sb_page, 1); } spin_lock_irqsave(&bitmap->lock, flags); @@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev) bitmap->allclean = 0; } bmc = bitmap_get_counter(bitmap, - (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), + (sector_t)j << bitmap->chunkshift, &blocks, 0); if (!bmc) j |= PAGE_COUNTER_MASK; @@ -1231,11 +1199,11 @@ void bitmap_daemon_work(struct mddev *mddev) /* we can clear the bit */ *bmc = 0; bitmap_count_page(bitmap, - (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), + (sector_t)j << bitmap->chunkshift, -1); /* clear the bit */ - paddr = kmap_atomic(page, KM_USER0); + paddr = kmap_atomic(page); if (bitmap->flags & BITMAP_HOSTENDIAN) clear_bit(file_page_offset(bitmap, j), paddr); @@ -1244,7 +1212,7 @@ void bitmap_daemon_work(struct mddev *mddev) file_page_offset(bitmap, j), paddr); - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr); } else if (*bmc <= 2) { *bmc = 1; /* maybe clear the bit next time */ set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); @@ -1285,7 +1253,7 @@ __acquires(bitmap->lock) * The lock must have been taken with interrupts enabled. * If !create, we don't release the lock. */ - sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; sector_t csize; @@ -1295,10 +1263,10 @@ __acquires(bitmap->lock) if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + + csize = ((sector_t)1) << (bitmap->chunkshift + PAGE_COUNTER_SHIFT - 1); else - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); + csize = ((sector_t)1) << bitmap->chunkshift; *blocks = csize - (offset & (csize - 1)); if (err < 0) @@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto set_page_attr(bitmap, filemap_get_page( bitmap, - offset >> CHUNK_BLOCK_SHIFT(bitmap)), + offset >> bitmap->chunkshift), BITMAP_PAGE_PENDING); bitmap->allclean = 0; } @@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i else { if (*bmc <= 2) { set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), + filemap_get_page(bitmap, offset >> bitmap->chunkshift), BITMAP_PAGE_PENDING); bitmap->allclean = 0; } @@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->mddev->curr_resync_completed = sector; set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); - sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); + sector &= ~((1ULL << bitmap->chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { bitmap_end_sync(bitmap, s, &blocks, 0); @@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n struct page *page; *bmc = 2 | (needed ? NEEDED_MASK : 0); bitmap_count_page(bitmap, offset, 1); - page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); + page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); bitmap->allclean = 0; } @@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) unsigned long chunk; for (chunk = s; chunk <= e; chunk++) { - sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); + sector_t sec = (sector_t)chunk << bitmap->chunkshift; bitmap_set_memory_bits(bitmap, sec, 1); spin_lock_irq(&bitmap->lock); bitmap_file_set_bit(bitmap, sec); @@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev) goto error; bitmap->daemon_lastrun = jiffies; - bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); + bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) + - BITMAP_BLOCK_SHIFT); /* now that chunksize and chunkshift are set, we can use these macros */ - chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> - CHUNK_BLOCK_SHIFT(bitmap); + chunks = (blocks + bitmap->chunkshift - 1) >> + bitmap->chunkshift; pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; BUG_ON(!pages); @@ -1836,6 +1805,33 @@ out: } EXPORT_SYMBOL_GPL(bitmap_load); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +{ + unsigned long chunk_kb; + unsigned long flags; + + if (!bitmap) + return; + + spin_lock_irqsave(&bitmap->lock, flags); + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + bitmap->pages - bitmap->missing_pages, + bitmap->pages, + (bitmap->pages - bitmap->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + if (bitmap->file) { + seq_printf(seq, ", file: "); + seq_path(seq, &bitmap->file->f_path, " \t\n"); + } + + seq_printf(seq, "\n"); + spin_unlock_irqrestore(&bitmap->lock, flags); +} + static ssize_t location_show(struct mddev *mddev, char *page) { @@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) { mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); if (rv) { bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index a15436dd9b3..55ca5aec84e 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -13,8 +13,6 @@ #define BITMAP_MAJOR_HI 4 #define BITMAP_MAJOR_HOSTENDIAN 3 -#define BITMAP_MINOR 39 - /* * in-memory bitmap: * @@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t; /* same, except a mask value for more efficient bitops */ #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) -#define BITMAP_BLOCK_SIZE 512 #define BITMAP_BLOCK_SHIFT 9 /* how many blocks per chunk? (this is variable) */ #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) - -/* when hijacked, the counters and bits represent even larger "chunks" */ -/* there will be 1024 chunks represented by each counter in the page pointers */ -#define PAGEPTR_BLOCK_RATIO(bitmap) \ - (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) -#define PAGEPTR_BLOCK_SHIFT(bitmap) \ - (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) -#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) #endif @@ -181,12 +168,6 @@ struct bitmap_page { unsigned int count:31; }; -/* keep track of bitmap file pages that have pending writes on them */ -struct page_list { - struct list_head list; - struct page *page; -}; - /* the main bitmap structure - one per mddev */ struct bitmap { struct bitmap_page *bp; @@ -196,7 +177,7 @@ struct bitmap { struct mddev *mddev; /* the md device that the bitmap is for */ /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ unsigned long chunks; /* total number of data chunks for the array */ __u64 events_cleared; @@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev); void bitmap_print_sb(struct bitmap *bitmap); void bitmap_update_sb(struct bitmap *bitmap); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); int bitmap_setallbits(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 0a6806f80ab..b6e58c7b6df 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -12,7 +12,6 @@ #include <linux/dm-io.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/version.h> #include <linux/shrinker.h> #include <linux/module.h> diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8c2a000cf3f..db6b51639ce 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -590,9 +590,9 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, int r = 0; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { - src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); + src = kmap_atomic(sg_page(&dmreq->sg_in)); r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(src); } else memset(iv, 0, cc->iv_size); @@ -608,14 +608,14 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) return 0; - dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); + dst = kmap_atomic(sg_page(&dmreq->sg_out)); r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); /* Tweak the first block of plaintext sector */ if (!r) crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); return r; } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 9fb18c14782..b280c433e4a 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -323,7 +323,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, * Corrupt successful READs while in down state. * If flags were specified, only corrupt those that match. */ - if (!error && bio_submitted_while_down && + if (fc->corrupt_bio_byte && !error && bio_submitted_while_down && (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) corrupt_bio_data(bio, fc); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ad2eba40e31..ea5dd289fe2 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -296,6 +296,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, unsigned offset; unsigned num_bvecs; sector_t remaining = where->count; + struct request_queue *q = bdev_get_queue(where->bdev); + sector_t discard_sectors; /* * where->count may be zero if rw holds a flush and we need to @@ -305,9 +307,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, /* * Allocate a suitably sized-bio. */ - num_bvecs = dm_sector_div_up(remaining, - (PAGE_SIZE >> SECTOR_SHIFT)); - num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); + if (rw & REQ_DISCARD) + num_bvecs = 1; + else + num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), + dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); + bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; @@ -315,10 +320,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, bio->bi_destructor = dm_bio_destructor; store_io_and_region_in_bio(bio, io, region); - /* - * Try and add as many pages as possible. - */ - while (remaining) { + if (rw & REQ_DISCARD) { + discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); + bio->bi_size = discard_sectors << SECTOR_SHIFT; + remaining -= discard_sectors; + } else while (remaining) { + /* + * Try and add as many pages as possible. + */ dp->get_page(dp, &page, &len, &offset); len = min(len, to_bytes(remaining)); if (!bio_add_page(bio, page, len, offset)) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 31c2dc25886..1ce84ed0b76 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1437,7 +1437,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) if (!argc) { DMWARN("Empty message received."); - goto out; + goto out_argv; } table = dm_get_live_table(md); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c2907d836e4..c5a875d7b88 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -56,7 +56,8 @@ struct raid_dev { struct raid_set { struct dm_target *ti; - uint64_t print_flags; + uint32_t bitmap_loaded; + uint32_t print_flags; struct mddev md; struct raid_type *raid_type; @@ -614,14 +615,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size) static void super_sync(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *r, *t; + struct md_rdev *r; uint64_t failed_devices; struct dm_raid_superblock *sb; sb = page_address(rdev->sb_page); failed_devices = le64_to_cpu(sb->failed_devices); - rdev_for_each(r, t, mddev) + rdev_for_each(r, mddev) if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) failed_devices |= (1ULL << r->raid_disk); @@ -667,7 +668,14 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) return ret; sb = page_address(rdev->sb_page); - if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { + + /* + * Two cases that we want to write new superblocks and rebuild: + * 1) New device (no matching magic number) + * 2) Device specified for rebuild (!In_sync w/ offset == 0) + */ + if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) || + (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) { super_sync(rdev->mddev, rdev); set_bit(FirstUse, &rdev->flags); @@ -699,7 +707,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) struct dm_raid_superblock *sb; uint32_t new_devs = 0; uint32_t rebuilds = 0; - struct md_rdev *r, *t; + struct md_rdev *r; struct dm_raid_superblock *sb2; sb = page_address(rdev->sb_page); @@ -742,13 +750,10 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) * case the In_sync bit will /not/ be set and * recovery_cp must be MaxSector. */ - rdev_for_each(r, t, mddev) { + rdev_for_each(r, mddev) { if (!test_bit(In_sync, &r->flags)) { - if (!test_bit(FirstUse, &r->flags)) - DMERR("Superblock area of " - "rebuild device %d should have been " - "cleared.", r->raid_disk); - set_bit(FirstUse, &r->flags); + DMINFO("Device %d specified for rebuild: " + "Clearing superblock", r->raid_disk); rebuilds++; } else if (test_bit(FirstUse, &r->flags)) new_devs++; @@ -777,7 +782,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) * Now we set the Faulty bit for those devices that are * recorded in the superblock as failed. */ - rdev_for_each(r, t, mddev) { + rdev_for_each(r, mddev) { if (!r->sb_page) continue; sb2 = page_address(r->sb_page); @@ -850,11 +855,11 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) { int ret; - struct md_rdev *rdev, *freshest, *tmp; + struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { if (!rdev->meta_bdev) continue; @@ -883,7 +888,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (super_validate(mddev, freshest)) return -EINVAL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each(rdev, mddev) if ((rdev != freshest) && super_validate(mddev, rdev)) return -EINVAL; @@ -970,6 +975,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; + ti->num_flush_requests = 1; mutex_lock(&rs->md.reconfig_mutex); ret = md_run(&rs->md); @@ -1085,7 +1091,7 @@ static int raid_status(struct dm_target *ti, status_type_t type, raid_param_cnt += 2; } - raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2); + raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2); if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) raid_param_cnt--; @@ -1197,7 +1203,12 @@ static void raid_resume(struct dm_target *ti) { struct raid_set *rs = ti->private; - bitmap_load(&rs->md); + if (!rs->bitmap_loaded) { + bitmap_load(&rs->md); + rs->bitmap_loaded = 1; + } else + md_wakeup_thread(rs->md.thread); + mddev_resume(&rs->md); } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 59c4f0446ff..237571af77f 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd, data_sm = dm_sm_disk_create(tm, nr_blocks); if (IS_ERR(data_sm)) { DMERR("sm_disk_create failed"); + dm_tm_unlock(tm, sblock); r = PTR_ERR(data_sm); goto bad; } @@ -789,6 +790,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) return 0; } +/* + * __open_device: Returns @td corresponding to device with id @dev, + * creating it if @create is set and incrementing @td->open_count. + * On failure, @td is undefined. + */ static int __open_device(struct dm_pool_metadata *pmd, dm_thin_id dev, int create, struct dm_thin_device **td) @@ -799,10 +805,16 @@ static int __open_device(struct dm_pool_metadata *pmd, struct disk_device_details details_le; /* - * Check the device isn't already open. + * If the device is already open, return it. */ list_for_each_entry(td2, &pmd->thin_devices, list) if (td2->id == dev) { + /* + * May not create an already-open device. + */ + if (create) + return -EEXIST; + td2->open_count++; *td = td2; return 0; @@ -817,6 +829,9 @@ static int __open_device(struct dm_pool_metadata *pmd, if (r != -ENODATA || !create) return r; + /* + * Create new device. + */ changed = 1; details_le.mapped_blocks = 0; details_le.transaction_id = cpu_to_le64(pmd->trans_id); @@ -882,12 +897,10 @@ static int __create_thin(struct dm_pool_metadata *pmd, r = __open_device(pmd, dev, 1, &td); if (r) { - __close_device(td); dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); dm_btree_del(&pmd->bl_info, dev_root); return r; } - td->changed = 1; __close_device(td); return r; @@ -967,14 +980,14 @@ static int __create_snap(struct dm_pool_metadata *pmd, goto bad; r = __set_snapshot_details(pmd, td, origin, pmd->time); + __close_device(td); + if (r) goto bad; - __close_device(td); return 0; bad: - __close_device(td); dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); dm_btree_remove(&pmd->details_info, pmd->details_root, &key, &pmd->details_root); @@ -1211,6 +1224,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) if (r) return r; + td->mapped_blocks--; + td->changed = 1; pmd->need_commit = 1; return 0; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index feb2c3c7bb4..45135f69509 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -315,7 +315,7 @@ static int run(struct mddev *mddev) } conf->nfaults = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) conf->rdev = rdev; md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 627456542fb..b0fcc7d02ad 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q, struct dev_info *dev0; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int maxbytes = biovec->bv_len; + struct request_queue *subq; rcu_read_lock(); dev0 = which_dev(mddev, sector); maxsectors = dev0->end_sector - sector; + subq = bdev_get_queue(dev0->rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = dev0->rdev->bdev; + bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; + maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, + biovec)); + } rcu_read_unlock(); if (maxsectors < bio_sectors) @@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q, maxsectors -= bio_sectors; if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) - return biovec->bv_len; - /* The bytes available at this offset could be really big, - * so we cap at 2^31 to avoid overflow */ - if (maxsectors > (1 << (31-9))) - return 1<<31; - return maxsectors << 9; + return maxbytes; + + if (maxsectors > (maxbytes >> 9)) + return maxbytes; + else + return maxsectors << 9; } static int linear_congested(void *data, int bits) @@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) cnt = 0; conf->array_sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { int j = rdev->raid_disk; struct dev_info *disk = conf->disks + j; sector_t sectors; @@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit max_segments to 1 lying within - * a single page. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } conf->array_sectors += rdev->sectors; cnt++; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9417ae2fa0b..b572e1e386c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws) INIT_WORK(&mddev->flush_work, md_submit_flush_data); atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* Take two references, one is dropped @@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) { struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->desc_nr == nr) return rdev; @@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) { struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; @@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; int is_active = test_bit(In_sync, &rdev2->flags); @@ -1805,18 +1805,18 @@ retry: | BB_LEN(internal_bb)); *bbp++ = cpu_to_le64(store_bb); } + bb->changed = 0; if (read_seqretry(&bb->lock, seq)) goto retry; bb->sector = (rdev->sb_start + (int)le32_to_cpu(sb->bblog_offset)); bb->size = le16_to_cpu(sb->bblog_size); - bb->changed = 0; } } max_dev = 0; - list_for_each_entry(rdev2, &mddev->disks, same_set) + rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; @@ -1833,7 +1833,7 @@ retry: for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev) return 0; /* nothing to do */ if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register, or already is */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ if (test_bit(Faulty, &rdev->flags)) continue; @@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev) { struct md_rdev *rdev, *tmp; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -2307,11 +2307,11 @@ static void md_print_devices(void) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) print_rdev(rdev, mddev->major_version); } printk("md: **********************************\n"); @@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares) * with the rest of the array) */ struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && @@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change) repeat: /* First make sure individual recovery_offsets are correct */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(In_sync, &rdev->flags) && @@ -2364,8 +2364,9 @@ repeat: clear_bit(MD_CHANGE_DEVS, &mddev->flags); if (!mddev->external) { clear_bit(MD_CHANGE_PENDING, &mddev->flags); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; md_ack_all_badblocks(&rdev->badblocks); md_error(mddev, rdev); } @@ -2430,7 +2431,7 @@ repeat: mddev->events --; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) any_badblocks_changed++; if (test_bit(Faulty, &rdev->flags)) @@ -2444,7 +2445,7 @@ repeat: mdname(mddev), mddev->in_sync); bitmap_update_sb(mddev->bitmap); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; if (rdev->sb_loaded != 1) @@ -2493,7 +2494,7 @@ repeat: if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) clear_bit(Blocked, &rdev->flags); @@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) struct md_rdev *rdev2; mddev_lock(mddev); - list_for_each_entry(rdev2, &mddev->disks, same_set) + rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && overlaps(rdev->data_offset, rdev->sectors, @@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev) char b[BDEVNAME_SIZE]; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev) validate_super(mddev, freshest); i = 0; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { @@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; } - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) rdev->new_raid_disk = rdev->raid_disk; /* ->takeover must set new_* and/or delta_disks @@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->safemode = 0; } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk >= mddev->raid_disks) @@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) continue; sysfs_unlink_rdev(mddev, rdev); } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; if (rdev->new_raid_disk == rdev->raid_disk) @@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev) * the only valid external interface is through the md * device. */ - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev) struct md_rdev *rdev2; int warned = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev) mddev->in_sync = 1; smp_wmb(); mddev->ready = 1; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; @@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev) mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; + mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.chunksize = 0; @@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open) /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); @@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev) printk(KERN_INFO "md: running: "); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } @@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) struct md_rdev *rdev; nr=working=insync=failed=spare=0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) * grow, and re-add. */ return -EBUSY; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) @@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v) struct mddev *mddev = v; sector_t sectors; struct md_rdev *rdev; - struct bitmap *bitmap; if (v == (void*)1) { struct md_personality *pers; @@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - if ((bitmap = mddev->bitmap)) { - unsigned long chunk_kb; - unsigned long flags; - spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); - } - - seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); - } + bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } @@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev) max_sectors = mddev->dev_sectors; j = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && @@ -7333,7 +7314,8 @@ void md_do_sync(struct mddev *mddev) printk(KERN_INFO "md: checkpointing %s of %s.\n", desc, mdname(mddev)); - mddev->recovery_cp = mddev->curr_resync; + mddev->recovery_cp = + mddev->curr_resync_completed; } } else mddev->recovery_cp = MaxSector; @@ -7341,7 +7323,7 @@ void md_do_sync(struct mddev *mddev) if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; rcu_read_lock(); - list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && @@ -7351,9 +7333,9 @@ void md_do_sync(struct mddev *mddev) rcu_read_unlock(); } } + skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); - skip: if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) @@ -7387,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev) mddev->curr_resync_completed = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || @@ -7405,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev) "degraded"); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -7450,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev) * do the superblock for an incrementally recovered device * written out. */ - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (!mddev->degraded || test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = -1; @@ -7528,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev) * failed devices. */ struct md_rdev *rdev; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && test_bit(Faulty, &rdev->flags) && @@ -8039,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb) return; write_seqlock_irq(&bb->lock); - if (bb->changed == 0) { + if (bb->changed == 0 && bb->unacked_exist) { u64 *p = bb->page; int i; for (i = 0; i < bb->count ; i++) { @@ -8156,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this, struct mddev *mddev; int need_delay = 0; - if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { - - printk(KERN_INFO "md: stopping all md devices.\n"); - - for_each_mddev(mddev, tmp) { - if (mddev_trylock(mddev)) { - /* Force a switch to readonly even array - * appears to still be in use. Hence - * the '100'. - */ - md_set_readonly(mddev, 100); - mddev_unlock(mddev); - } - need_delay = 1; + for_each_mddev(mddev, tmp) { + if (mddev_trylock(mddev)) { + __md_stop_writes(mddev); + mddev->safemode = 2; + mddev_unlock(mddev); } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... - */ - if (need_delay) - mdelay(1000*1); + need_delay = 1; } + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + if (need_delay) + mdelay(1000*1); + return NOTIFY_DONE; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 44c63dfeeb2..1c2063ccf48 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -128,6 +128,10 @@ struct md_rdev { enum flag_bits { Faulty, /* device is known to have a fault */ In_sync, /* device is in_sync with rest of array */ + Unmerged, /* device is being added to array and should + * be considerred for bvec_merge_fn but not + * yet for actual IO + */ WriteMostly, /* Avoid reading if at all possible */ AutoDetected, /* added by auto-detect */ Blocked, /* An error occurred but has not yet @@ -345,6 +349,10 @@ struct mddev { int degraded; /* whether md should consider * adding a spare */ + int merge_check_needed; /* at least one + * member device + * has a + * merge_bvec_fn */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; @@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) /* * iterates through the 'same array disks' ringlist */ -#define rdev_for_each(rdev, tmp, mddev) \ +#define rdev_for_each(rdev, mddev) \ + list_for_each_entry(rdev, &((mddev)->disks), same_set) + +#define rdev_for_each_safe(rdev, tmp, mddev) \ list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) #define rdev_for_each_rcu(rdev, mddev) \ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index a222f516660..9339e67fcc7 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev) } working_disks = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || disk_idx >= mddev->raid_disks) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 7294bd115e3..6f31f5596e0 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!conf) return -ENOMEM; - list_for_each_entry(rdev1, &mddev->disks, same_set) { + rdev_for_each(rdev1, mddev) { pr_debug("md/raid0:%s: looking at %s\n", mdname(mddev), bdevname(rdev1->bdev, b)); @@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; - list_for_each_entry(rdev2, &mddev->disks, same_set) { + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %s(%llu)" " with %s(%llu)\n", mdname(mddev), @@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) smallest = NULL; dev = conf->devlist; err = -EINVAL; - list_for_each_entry(rdev1, &mddev->disks, same_set) { + rdev_for_each(rdev1, mddev) { int j = rdev1->raid_disk; if (mddev->level == 10) { @@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) disk_stack_limits(mddev->gendisk, rdev1->bdev, rdev1->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_segments to 1, lying within - * a single page. - */ - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } + if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) + conf->has_merge_bvec = 1; + if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; @@ -290,8 +284,64 @@ abort: return err; } +/* Find the zone which holds a particular offset + * Update *sectorp to be an offset in that zone + */ +static struct strip_zone *find_zone(struct r0conf *conf, + sector_t *sectorp) +{ + int i; + struct strip_zone *z = conf->strip_zone; + sector_t sector = *sectorp; + + for (i = 0; i < conf->nr_strip_zones; i++) + if (sector < z[i].zone_end) { + if (i) + *sectorp = sector - z[i-1].zone_end; + return z + i; + } + BUG(); +} + +/* + * remaps the bio to the target device. we separate two flows. + * power 2 flow and a general flow for the sake of perfromance +*/ +static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, + sector_t sector, sector_t *sector_offset) +{ + unsigned int sect_in_chunk; + sector_t chunk; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + unsigned int chunk_sects = mddev->chunk_sectors; + + if (is_power_of_2(chunk_sects)) { + int chunksect_bits = ffz(~chunk_sects); + /* find the sector offset inside the chunk */ + sect_in_chunk = sector & (chunk_sects - 1); + sector >>= chunksect_bits; + /* chunk in zone */ + chunk = *sector_offset; + /* quotient is the chunk in real device*/ + sector_div(chunk, zone->nb_dev << chunksect_bits); + } else{ + sect_in_chunk = sector_div(sector, chunk_sects); + chunk = *sector_offset; + sector_div(chunk, chunk_sects * zone->nb_dev); + } + /* + * position the bio over the real device + * real sector = chunk in device + starting of zone + * + the position in the chunk + */ + *sector_offset = (chunk * chunk_sects) + sect_in_chunk; + return conf->devlist[(zone - conf->strip_zone)*raid_disks + + sector_div(sector, zone->nb_dev)]; +} + /** - * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged + * raid0_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue * @bvm: properties of new bio * @biovec: the request that could be merged to it. @@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, struct bio_vec *biovec) { struct mddev *mddev = q->queuedata; + struct r0conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + sector_t sector_offset = sector; int max; unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; + struct strip_zone *zone; + struct md_rdev *rdev; + struct request_queue *subq; if (is_power_of_2(chunk_sectors)) max = (chunk_sectors - ((sector & (chunk_sectors-1)) @@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q, else max = (chunk_sectors - (sector_div(sector, chunk_sectors) + bio_sectors)) << 9; - if (max < 0) max = 0; /* bio_add cannot handle a negative return */ + if (max < 0) + max = 0; /* bio_add cannot handle a negative return */ if (max <= biovec->bv_len && bio_sectors == 0) return biovec->bv_len; - else + if (max < biovec->bv_len) + /* too small already, no need to check further */ + return max; + if (!conf->has_merge_bvec) + return max; + + /* May need to check subordinate device */ + sector = sector_offset; + zone = find_zone(mddev->private, §or_offset); + rdev = map_sector(mddev, zone, sector, §or_offset); + subq = bdev_get_queue(rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = rdev->bdev; + bvm->bi_sector = sector_offset + zone->dev_start + + rdev->data_offset; + return min(max, subq->merge_bvec_fn(subq, bvm, biovec)); + } else return max; } @@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) array_sectors += rdev->sectors; return array_sectors; @@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev) return 0; } -/* Find the zone which holds a particular offset - * Update *sectorp to be an offset in that zone - */ -static struct strip_zone *find_zone(struct r0conf *conf, - sector_t *sectorp) -{ - int i; - struct strip_zone *z = conf->strip_zone; - sector_t sector = *sectorp; - - for (i = 0; i < conf->nr_strip_zones; i++) - if (sector < z[i].zone_end) { - if (i) - *sectorp = sector - z[i-1].zone_end; - return z + i; - } - BUG(); -} - -/* - * remaps the bio to the target device. we separate two flows. - * power 2 flow and a general flow for the sake of perfromance -*/ -static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, - sector_t sector, sector_t *sector_offset) -{ - unsigned int sect_in_chunk; - sector_t chunk; - struct r0conf *conf = mddev->private; - int raid_disks = conf->strip_zone[0].nb_dev; - unsigned int chunk_sects = mddev->chunk_sectors; - - if (is_power_of_2(chunk_sects)) { - int chunksect_bits = ffz(~chunk_sects); - /* find the sector offset inside the chunk */ - sect_in_chunk = sector & (chunk_sects - 1); - sector >>= chunksect_bits; - /* chunk in zone */ - chunk = *sector_offset; - /* quotient is the chunk in real device*/ - sector_div(chunk, zone->nb_dev << chunksect_bits); - } else{ - sect_in_chunk = sector_div(sector, chunk_sects); - chunk = *sector_offset; - sector_div(chunk, chunk_sects * zone->nb_dev); - } - /* - * position the bio over the real device - * real sector = chunk in device + starting of zone - * + the position in the chunk - */ - *sector_offset = (chunk * chunk_sects) + sect_in_chunk; - return conf->devlist[(zone - conf->strip_zone)*raid_disks - + sector_div(sector, zone->nb_dev)]; -} - /* * Is io distribute over 1 or more chunks ? */ @@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) } sector_offset = bio->bi_sector; - zone = find_zone(mddev->private, §or_offset); + zone = find_zone(mddev->private, §or_offset); tmp_dev = map_sector(mddev, zone, bio->bi_sector, §or_offset); bio->bi_bdev = tmp_dev->bdev; @@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) return ERR_PTR(-EINVAL); } - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { /* check slot number for a disk */ if (rdev->raid_disk == mddev->raid_disks-1) { printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 0884bba8df4..05539d9c97f 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -4,13 +4,16 @@ struct strip_zone { sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ - int nb_dev; /* # of devices attached to the zone */ + int nb_dev; /* # of devices attached to the zone */ }; struct r0conf { - struct strip_zone *strip_zone; - struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ - int nr_strip_zones; + struct strip_zone *strip_zone; + struct md_rdev **devlist; /* lists of rdevs, pointed to + * by strip_zone->dev */ + int nr_strip_zones; + int has_merge_bvec; /* at least one member has + * a merge_bvec_fn */ }; #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a368db2431a..4a40a200d76 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL + || test_bit(Unmerged, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && @@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } +static int raid1_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + struct r1conf *conf = mddev->private; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int max = biovec->bv_len; + + if (mddev->merge_check_needed) { + int disk; + rcu_read_lock(); + for (disk = 0; disk < conf->raid_disks * 2; disk++) { + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = sector + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; + +} + int md_raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; @@ -624,7 +658,7 @@ int md_raid1_congested(struct mddev *mddev, int bits) return 1; rcu_read_lock(); - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf) spin_lock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, !conf->barrier, + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. + */ + wait_event_lock_irq(conf->wait_barrier, + !conf->barrier || + (conf->nr_pending && + current->bio_list && + !bio_list_empty(current->bio_list)), conf->resync_lock, - ); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -1002,7 +1049,8 @@ read_again: break; } r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags)) { if (i < conf->raid_disks) set_bit(R1BIO_Degraded, &r1_bio->state); continue; @@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) struct mirror_info *p; int first = 0; int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; if (!p->rdev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } p->head_position = 0; rdev->raid_disk = mirror; @@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } } + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); + raise_barrier(conf); + lower_barrier(conf); + clear_bit(Unmerged, &rdev->flags); + } md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) @@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_segments to 1 lying within - * a single page, as a one page request is never in violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } } mddev->degraded = 0; @@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; + blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); } return md_integrity_register(mddev); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e8aa213f0d..3540316886f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -67,6 +67,7 @@ static int max_queued_requests = 1024; static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); +static int enough(struct r10conf *conf, int ignore); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -347,6 +348,19 @@ static void raid10_end_read_request(struct bio *bio, int error) * wait for the 'master' bio. */ set_bit(R10BIO_Uptodate, &r10_bio->state); + } else { + /* If all other devices that store this block have + * failed, we want to return the error upwards rather + * than fail the last device. Here we redefine + * "uptodate" to mean "Don't want to retry" + */ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (!enough(conf, rdev->raid_disk)) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + if (uptodate) { raid_end_bio_io(r10_bio); rdev_dec_pending(rdev, conf->mddev); } else { @@ -572,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset - * If near_copies == raid_disk, there are no striping issues, - * but in that case, the function isn't called at all. + * This requires checking for end-of-chunk if near_copies != raid_disks, + * and for subordinate merge_bvec_fns if merge_check_needed. */ static int raid10_mergeable_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *biovec) { struct mddev *mddev = q->queuedata; + struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; + if (conf->near_copies < conf->raid_disks) { + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + + bio_sectors)) << 9; + if (max < 0) + /* bio_add cannot handle a negative return */ + max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + } else + max = biovec->bv_len; + + if (mddev->merge_check_needed) { + struct r10bio r10_bio; + int s; + r10_bio.sector = sector; + raid10_find_phys(conf, &r10_bio); + rcu_read_lock(); + for (s = 0; s < conf->copies; s++) { + int disk = r10_bio.devs[s].devnum; + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio.devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; } /* @@ -654,11 +711,12 @@ retry: disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags) || r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (rdev == NULL) - continue; - if (test_bit(Faulty, &rdev->flags)) + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) @@ -849,9 +907,22 @@ static void wait_barrier(struct r10conf *conf) spin_lock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, !conf->barrier, + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. + */ + wait_event_lock_irq(conf->wait_barrier, + !conf->barrier || + (conf->nr_pending && + current->bio_list && + !bio_list_empty(current->bio_list)), conf->resync_lock, - ); + ); conf->nr_waiting--; } conf->nr_pending++; @@ -1107,12 +1178,14 @@ retry_write: blocked_rdev = rrdev; break; } - if (rrdev && test_bit(Faulty, &rrdev->flags)) + if (rrdev && (test_bit(Faulty, &rrdev->flags) + || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } @@ -1463,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int mirror; int first = 0; int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ return -EBUSY; - if (!enough(conf, -1)) + if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) return -EINVAL; if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + if (rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; @@ -1494,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) err = 0; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } conf->fullsync = 1; rcu_assign_pointer(p->replacement, rdev); break; @@ -1506,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must - * never risk violating it, so limit - * ->max_segments to one lying with a single - * page, as a one page request is never in - * violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } p->head_position = 0; p->recovery_disabled = mddev->recovery_disabled - 1; @@ -1527,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. + */ + synchronize_sched(); + raise_barrier(conf, 0); + lower_barrier(conf); + clear_bit(Unmerged, &rdev->flags); + } md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; @@ -1668,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error) d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); if (repl) rdev = conf->mirrors[d].replacement; - if (!rdev) { - smp_mb(); + else rdev = conf->mirrors[d].rdev; - } if (!uptodate) { if (repl) @@ -2052,6 +2125,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 "md/raid10:%s: %s: Failing raid device\n", mdname(mddev), b); md_error(mddev, conf->mirrors[d].rdev); + r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; return; } @@ -2072,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && + !test_bit(Unmerged, &rdev->flags) && test_bit(In_sync, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { @@ -2105,8 +2180,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rdev, r10_bio->devs[r10_bio->read_slot].addr + sect, - s, 0)) + s, 0)) { md_error(mddev, rdev); + r10_bio->devs[r10_bio->read_slot].bio + = IO_BLOCKED; + } break; } @@ -2122,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Unmerged, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2299,17 +2378,20 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * This is all done synchronously while the array is * frozen. */ + bio = r10_bio->devs[slot].bio; + bdevname(bio->bi_bdev, b); + bio_put(bio); + r10_bio->devs[slot].bio = NULL; + if (mddev->ro == 0) { freeze_array(conf); fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); - } + } else + r10_bio->devs[slot].bio = IO_BLOCKED; + rdev_dec_pending(rdev, mddev); - bio = r10_bio->devs[slot].bio; - bdevname(bio->bi_bdev, b); - r10_bio->devs[slot].bio = - mddev->ro ? IO_BLOCKED : NULL; read_more: rdev = read_balance(conf, r10_bio, &max_sectors); if (rdev == NULL) { @@ -2318,13 +2400,10 @@ read_more: mdname(mddev), b, (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); - bio_put(bio); return; } do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); - if (bio) - bio_put(bio); slot = r10_bio->read_slot; printk_ratelimited( KERN_ERR @@ -2360,7 +2439,6 @@ read_more: mbio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); generic_make_request(bio); - bio = NULL; r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); @@ -3225,7 +3303,7 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks / conf->near_copies)); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { disk_idx = rdev->raid_disk; if (disk_idx >= conf->raid_disks @@ -3243,18 +3321,8 @@ static int run(struct mddev *mddev) disk->rdev = rdev; } - disk->rdev = rdev; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit max_segments to 1 lying - * within a single page. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { - blk_queue_max_segments(mddev->queue, 1); - blk_queue_segment_boundary(mddev->queue, - PAGE_CACHE_SIZE - 1); - } disk->head_position = 0; } @@ -3318,8 +3386,7 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - if (conf->near_copies < conf->raid_disks) - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); if (md_integrity_register(mddev)) goto out_free_conf; @@ -3369,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state) } } +static int raid10_resize(struct mddev *mddev, sector_t sectors) +{ + /* Resize of 'far' arrays is not supported. + * For 'near' and 'offset' arrays we can set the + * number of sectors used to be an appropriate multiple + * of the chunk size. + * For 'offset', this is far_copies*chunksize. + * For 'near' the multiplier is the LCM of + * near_copies and raid_disks. + * So if far_copies > 1 && !far_offset, fail. + * Else find LCM(raid_disks, near_copy)*far_copies and + * multiply by chunk_size. Then round to this number. + * This is mostly done by raid10_size() + */ + struct r10conf *conf = mddev->private; + sector_t oldsize, size; + + if (conf->far_copies > 1 && !conf->far_offset) + return -EINVAL; + + oldsize = raid10_size(mddev, 0, 0); + size = raid10_size(mddev, sectors, 0); + md_set_array_sectors(mddev, size); + if (mddev->array_sectors > size) + return -EINVAL; + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > oldsize) { + mddev->recovery_cp = oldsize; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->dev_sectors = sectors; + mddev->resync_max_sectors = size; + return 0; +} + static void *raid10_takeover_raid0(struct mddev *mddev) { struct md_rdev *rdev; @@ -3392,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) conf = setup_conf(mddev); if (!IS_ERR(conf)) { - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) rdev->new_raid_disk = rdev->raid_disk * 2; conf->barrier = 1; @@ -3438,6 +3542,7 @@ static struct md_personality raid10_personality = .sync_request = sync_request, .quiesce = raid10_quiesce, .size = raid10_size, + .resize = raid10_resize, .takeover = raid10_takeover, }; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 360f2b98f62..23ac880bba9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) md_wakeup_thread(conf->mddev->thread); } else { BUG_ON(stripe_operations_active(sh)); - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); - } atomic_dec(&conf->active_stripes); if (!test_bit(STRIPE_EXPANDING, &sh->state)) { list_add_tail(&sh->lru, &conf->inactive_list); @@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) pr_debug("raid456: run(%s) called.\n", mdname(mddev)); - list_for_each_entry(rdev, &mddev->disks, same_set) { + rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks || raid_disk < 0) @@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev) blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); } @@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; - if (has_failed(conf)) + if (rdev->saved_raid_disk < 0 && has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; @@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev) * such devices during the reshape and confusion could result. */ if (mddev->delta_disks >= 0) { - int added_devices = 0; - list_for_each_entry(rdev, &mddev->disks, same_set) + rdev_for_each(rdev, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { if (raid5_add_disk(mddev, rdev) == 0) { if (rdev->raid_disk - >= conf->previous_raid_disks) { + >= conf->previous_raid_disks) set_bit(In_sync, &rdev->flags); - added_devices++; - } else + else rdev->recovery_offset = 0; if (sysfs_link_rdev(mddev, rdev)) @@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev) && !test_bit(Faulty, &rdev->flags)) { /* This is a spare that was manually added */ set_bit(In_sync, &rdev->flags); - added_devices++; } /* When a reshape changes the number of devices, @@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev) spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; conf->reshape_progress = MaxSector; + mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; } |