diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/.gitignore | 4 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 116 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 7 | ||||
-rw-r--r-- | drivers/md/dm-io.h | 3 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 25 | ||||
-rw-r--r-- | drivers/md/dm-log.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 13 | ||||
-rw-r--r-- | drivers/md/dm-snap.c | 27 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 2 | ||||
-rw-r--r-- | drivers/md/dm.c | 98 | ||||
-rw-r--r-- | drivers/md/dm.h | 5 | ||||
-rw-r--r-- | drivers/md/faulty.c | 9 | ||||
-rw-r--r-- | drivers/md/kcopyd.c | 4 | ||||
-rw-r--r-- | drivers/md/linear.c | 14 | ||||
-rw-r--r-- | drivers/md/md.c | 927 | ||||
-rw-r--r-- | drivers/md/multipath.c | 29 | ||||
-rw-r--r-- | drivers/md/raid0.c | 32 | ||||
-rw-r--r-- | drivers/md/raid1.c | 735 | ||||
-rw-r--r-- | drivers/md/raid10.c | 548 | ||||
-rw-r--r-- | drivers/md/raid5.c | 189 | ||||
-rw-r--r-- | drivers/md/raid6main.c | 356 |
21 files changed, 2201 insertions, 944 deletions
diff --git a/drivers/md/.gitignore b/drivers/md/.gitignore new file mode 100644 index 00000000000..a7afec6b19c --- /dev/null +++ b/drivers/md/.gitignore @@ -0,0 +1,4 @@ +mktables +raid6altivec*.c +raid6int*.c +raid6tables.c diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 252d55df964..eae4473eadd 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -200,7 +200,7 @@ out: /* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ -static inline void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) +static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) { char *ptr; @@ -315,6 +315,8 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait) if (bitmap->file == NULL) return write_sb_page(bitmap->mddev, bitmap->offset, page, wait); + flush_dcache_page(page); /* make sure visible to anyone reading the file */ + if (wait) lock_page(page); else { @@ -341,7 +343,7 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait) /* add to list to be waited for by daemon */ struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO); item->page = page; - page_cache_get(page); + get_page(page); spin_lock(&bitmap->write_lock); list_add(&item->list, &bitmap->complete_pages); spin_unlock(&bitmap->write_lock); @@ -357,10 +359,10 @@ static struct page *read_page(struct file *file, unsigned long index, struct inode *inode = file->f_mapping->host; struct page *page = NULL; loff_t isize = i_size_read(inode); - unsigned long end_index = isize >> PAGE_CACHE_SHIFT; + unsigned long end_index = isize >> PAGE_SHIFT; - PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE, - (unsigned long long)index << PAGE_CACHE_SHIFT); + PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT); page = read_cache_page(inode->i_mapping, index, (filler_t *)inode->i_mapping->a_ops->readpage, file); @@ -368,7 +370,7 @@ static struct page *read_page(struct file *file, unsigned long index, goto out; wait_on_page_locked(page); if (!PageUptodate(page) || PageError(page)) { - page_cache_release(page); + put_page(page); page = ERR_PTR(-EIO); goto out; } @@ -376,14 +378,14 @@ static struct page *read_page(struct file *file, unsigned long index, if (index > end_index) /* we have read beyond EOF */ *bytes_read = 0; else if (index == end_index) /* possible short read */ - *bytes_read = isize & ~PAGE_CACHE_MASK; + *bytes_read = isize & ~PAGE_MASK; else - *bytes_read = PAGE_CACHE_SIZE; /* got a full page */ + *bytes_read = PAGE_SIZE; /* got a full page */ out: if (IS_ERR(page)) printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", - (int)PAGE_CACHE_SIZE, - (unsigned long long)index << PAGE_CACHE_SHIFT, + (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT, PTR_ERR(page)); return page; } @@ -406,11 +408,11 @@ int bitmap_update_sb(struct bitmap *bitmap) return 0; } spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap(bitmap->sb_page); + sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); sb->events = cpu_to_le64(bitmap->mddev->events); if (!bitmap->mddev->degraded) sb->events_cleared = cpu_to_le64(bitmap->mddev->events); - kunmap(bitmap->sb_page); + kunmap_atomic(sb, KM_USER0); return write_page(bitmap, bitmap->sb_page, 1); } @@ -421,7 +423,7 @@ void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->sb_page) return; - sb = (bitmap_super_t *)kmap(bitmap->sb_page); + sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -440,7 +442,7 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); - kunmap(bitmap->sb_page); + kunmap_atomic(sb, KM_USER0); } /* read the superblock from the bitmap file and initialize some bitmap fields */ @@ -466,7 +468,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) return err; } - sb = (bitmap_super_t *)kmap(bitmap->sb_page); + sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); if (bytes_read < sizeof(*sb)) { /* short read */ printk(KERN_INFO "%s: bitmap file superblock truncated\n", @@ -485,12 +487,12 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) reason = "unrecognized superblock version"; - else if (chunksize < 512 || chunksize > (1024 * 1024 * 4)) - reason = "bitmap chunksize out of range (512B - 4MB)"; + else if (chunksize < PAGE_SIZE) + reason = "bitmap chunksize too small"; else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; - else if (daemon_sleep < 1 || daemon_sleep > 15) - reason = "daemon sleep period out of range (1-15s)"; + else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) + reason = "daemon sleep period out of range"; else if (write_behind > COUNTER_MAX) reason = "write-behind limit out of range (0 - 16383)"; if (reason) { @@ -535,7 +537,7 @@ success: bitmap->events_cleared = bitmap->mddev->events; err = 0; out: - kunmap(bitmap->sb_page); + kunmap_atomic(sb, KM_USER0); if (err) bitmap_print_sb(bitmap); return err; @@ -558,9 +560,9 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, spin_unlock_irqrestore(&bitmap->lock, flags); return; } - page_cache_get(bitmap->sb_page); + get_page(bitmap->sb_page); spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap(bitmap->sb_page); + sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); switch (op) { case MASK_SET: sb->state |= bits; break; @@ -568,8 +570,8 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, break; default: BUG(); } - kunmap(bitmap->sb_page); - page_cache_release(bitmap->sb_page); + kunmap_atomic(sb, KM_USER0); + put_page(bitmap->sb_page); } /* @@ -622,12 +624,11 @@ static void bitmap_file_unmap(struct bitmap *bitmap) while (pages--) if (map[pages]->index != 0) /* 0 is sb_page, release it below */ - page_cache_release(map[pages]); + put_page(map[pages]); kfree(map); kfree(attr); - if (sb_page) - page_cache_release(sb_page); + safe_put_page(sb_page); } static void bitmap_stop_daemon(struct bitmap *bitmap); @@ -654,7 +655,7 @@ static void drain_write_queues(struct bitmap *bitmap) while ((item = dequeue_page(bitmap))) { /* don't bother to wait */ - page_cache_release(item->page); + put_page(item->page); mempool_free(item, bitmap->write_pool); } @@ -763,7 +764,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) /* make sure the page stays cached until it gets written out */ if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY)) - page_cache_get(page); + get_page(page); /* set the bit */ kaddr = kmap_atomic(page, KM_USER0); @@ -854,6 +855,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) unsigned long bytes, offset, dummy; int outofdate; int ret = -ENOSPC; + void *paddr; chunks = bitmap->chunks; file = bitmap->file; @@ -887,12 +889,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (!bitmap->filemap) goto out; - bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL); + bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL); if (!bitmap->filemap_attr) goto out; - memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages); - oldindex = ~0L; for (i = 0; i < chunks; i++) { @@ -901,8 +901,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) bit = file_page_offset(i); if (index != oldindex) { /* this is a new page, read it in */ /* unmap the old page, we're done with it */ - if (oldpage != NULL) - kunmap(oldpage); if (index == 0) { /* * if we're here then the superblock page @@ -925,30 +923,32 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) oldindex = index; oldpage = page; - kmap(page); if (outofdate) { /* * if bitmap is out of date, dirty the * whole page and write it out */ - memset(page_address(page) + offset, 0xff, + paddr = kmap_atomic(page, KM_USER0); + memset(paddr + offset, 0xff, PAGE_SIZE - offset); + kunmap_atomic(paddr, KM_USER0); ret = write_page(bitmap, page, 1); if (ret) { - kunmap(page); /* release, page not in filemap yet */ - page_cache_release(page); + put_page(page); goto out; } } bitmap->filemap[bitmap->file_pages++] = page; } + paddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) - b = test_bit(bit, page_address(page)); + b = test_bit(bit, paddr); else - b = ext2_test_bit(bit, page_address(page)); + b = ext2_test_bit(bit, paddr); + kunmap_atomic(paddr, KM_USER0); if (b) { /* if the disk bit is set, set the memory bit */ bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap), @@ -963,9 +963,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) ret = 0; bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); - if (page) /* unmap the last page */ - kunmap(page); - if (bit_cnt) { /* Kick recovery if any bits were set */ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); md_wakeup_thread(bitmap->mddev->thread); @@ -1021,6 +1018,7 @@ int bitmap_daemon_work(struct bitmap *bitmap) int err = 0; int blocks; int attr; + void *paddr; if (bitmap == NULL) return 0; @@ -1043,7 +1041,7 @@ int bitmap_daemon_work(struct bitmap *bitmap) /* skip this page unless it's marked as needing cleaning */ if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) { if (attr & BITMAP_PAGE_NEEDWRITE) { - page_cache_get(page); + get_page(page); clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); } spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1057,13 +1055,13 @@ int bitmap_daemon_work(struct bitmap *bitmap) default: bitmap_file_kick(bitmap); } - page_cache_release(page); + put_page(page); } continue; } /* grab the new page, sync and release the old */ - page_cache_get(page); + get_page(page); if (lastpage != NULL) { if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); @@ -1077,14 +1075,12 @@ int bitmap_daemon_work(struct bitmap *bitmap) set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); spin_unlock_irqrestore(&bitmap->lock, flags); } - kunmap(lastpage); - page_cache_release(lastpage); + put_page(lastpage); if (err) bitmap_file_kick(bitmap); } else spin_unlock_irqrestore(&bitmap->lock, flags); lastpage = page; - kmap(page); /* printk("bitmap clean at page %lu\n", j); */ @@ -1107,10 +1103,12 @@ int bitmap_daemon_work(struct bitmap *bitmap) -1); /* clear the bit */ + paddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(j), page_address(page)); + clear_bit(file_page_offset(j), paddr); else - ext2_clear_bit(file_page_offset(j), page_address(page)); + ext2_clear_bit(file_page_offset(j), paddr); + kunmap_atomic(paddr, KM_USER0); } } spin_unlock_irqrestore(&bitmap->lock, flags); @@ -1118,7 +1116,6 @@ int bitmap_daemon_work(struct bitmap *bitmap) /* now sync the final page */ if (lastpage != NULL) { - kunmap(lastpage); spin_lock_irqsave(&bitmap->lock, flags); if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) { clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); @@ -1133,7 +1130,7 @@ int bitmap_daemon_work(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); } - page_cache_release(lastpage); + put_page(lastpage); } return err; @@ -1184,7 +1181,7 @@ static void bitmap_writeback_daemon(mddev_t *mddev) PRINTK("finished page writeback: %p\n", page); err = PageError(page); - page_cache_release(page); + put_page(page); if (err) { printk(KERN_WARNING "%s: bitmap file writeback " "failed (page %lu): %d\n", @@ -1530,6 +1527,8 @@ void bitmap_destroy(mddev_t *mddev) return; mddev->bitmap = NULL; /* disconnect from the md device */ + if (mddev->thread) + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; bitmap_free(bitmap); } @@ -1555,12 +1554,10 @@ int bitmap_create(mddev_t *mddev) BUG_ON(file && mddev->bitmap_offset); - bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL); + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return -ENOMEM; - memset(bitmap, 0, sizeof(*bitmap)); - spin_lock_init(&bitmap->lock); bitmap->mddev = mddev; @@ -1601,12 +1598,11 @@ int bitmap_create(mddev_t *mddev) #ifdef INJECT_FATAL_FAULT_1 bitmap->bp = NULL; #else - bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); + bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); #endif err = -ENOMEM; if (!bitmap->bp) goto error; - memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp)); bitmap->flags |= BITMAP_ACTIVE; @@ -1636,6 +1632,8 @@ int bitmap_create(mddev_t *mddev) if (IS_ERR(bitmap->writeback_daemon)) return PTR_ERR(bitmap->writeback_daemon); + mddev->thread->timeout = bitmap->daemon_sleep * HZ; + return bitmap_update_sb(bitmap); error: diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index cf663105668..e7a650f9ca0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -228,7 +228,7 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = { }; -static inline int +static int crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out, struct scatterlist *in, unsigned int length, int write, sector_t sector) @@ -690,6 +690,8 @@ bad3: bad2: crypto_free_tfm(tfm); bad1: + /* Must zero key material before freeing */ + memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); kfree(cc); return -EINVAL; } @@ -706,6 +708,9 @@ static void crypt_dtr(struct dm_target *ti) cc->iv_gen_ops->dtr(cc); crypto_free_tfm(cc->tfm); dm_put_device(ti, cc->dev); + + /* Must zero key material before freeing */ + memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); kfree(cc); } diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h index 1a77f326570..f9035bfd1a9 100644 --- a/drivers/md/dm-io.h +++ b/drivers/md/dm-io.h @@ -9,9 +9,6 @@ #include "dm.h" -/* FIXME make this configurable */ -#define DM_MAX_IO_REGIONS 8 - struct io_region { struct block_device *bdev; sector_t sector; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 07d44e19536..1235135b384 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -270,6 +270,7 @@ static int dm_hash_rename(const char *old, const char *new) { char *new_name, *old_name; struct hash_cell *hc; + struct dm_table *table; /* * duplicate new. @@ -317,6 +318,15 @@ static int dm_hash_rename(const char *old, const char *new) /* rename the device node in devfs */ register_with_devfs(hc); + /* + * Wake up any dm event waiters. + */ + table = dm_get_table(hc->md); + if (table) { + dm_table_event(table); + dm_table_put(table); + } + up_write(&_hash_lock); kfree(old_name); return 0; @@ -588,7 +598,7 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) /* * Always use UUID for lookups if it's present, otherwise use name or dev. */ -static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) +static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { if (*param->uuid) return __get_uuid_cell(param->uuid); @@ -598,7 +608,7 @@ static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) return dm_get_mdptr(huge_decode_dev(param->dev)); } -static inline struct mapped_device *find_device(struct dm_ioctl *param) +static struct mapped_device *find_device(struct dm_ioctl *param) { struct hash_cell *hc; struct mapped_device *md = NULL; @@ -683,14 +693,18 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) static int do_suspend(struct dm_ioctl *param) { int r = 0; + int do_lockfs = 1; struct mapped_device *md; md = find_device(param); if (!md) return -ENXIO; + if (param->flags & DM_SKIP_LOCKFS_FLAG) + do_lockfs = 0; + if (!dm_suspended(md)) - r = dm_suspend(md); + r = dm_suspend(md, do_lockfs); if (!r) r = __dev_status(md, param); @@ -702,6 +716,7 @@ static int do_suspend(struct dm_ioctl *param) static int do_resume(struct dm_ioctl *param) { int r = 0; + int do_lockfs = 1; struct hash_cell *hc; struct mapped_device *md; struct dm_table *new_map; @@ -727,8 +742,10 @@ static int do_resume(struct dm_ioctl *param) /* Do we need to load a new map ? */ if (new_map) { /* Suspend if it isn't already suspended */ + if (param->flags & DM_SKIP_LOCKFS_FLAG) + do_lockfs = 0; if (!dm_suspended(md)) - dm_suspend(md); + dm_suspend(md, do_lockfs); r = dm_swap_table(md, new_map); if (r) { diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index a76349cb10a..efe4adf7853 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -573,7 +573,7 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region) lc->sync_search); lc->sync_search = *region + 1; - if (*region == lc->region_count) + if (*region >= lc->region_count) return 0; } while (log_test_bit(lc->recovering_bits, *region)); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 6b0fc167092..6cfa8d435d5 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -562,6 +562,8 @@ struct mirror_set { region_t nr_regions; int in_sync; + struct mirror *default_mirror; /* Default mirror */ + unsigned int nr_mirrors; struct mirror mirror[0]; }; @@ -611,7 +613,7 @@ static int recover(struct mirror_set *ms, struct region *reg) unsigned long flags = 0; /* fill in the source */ - m = ms->mirror + DEFAULT_MIRROR; + m = ms->default_mirror; from.bdev = m->dev->bdev; from.sector = m->offset + region_to_sector(reg->rh, reg->key); if (reg->key == (ms->nr_regions - 1)) { @@ -627,7 +629,7 @@ static int recover(struct mirror_set *ms, struct region *reg) /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (i == DEFAULT_MIRROR) + if (&ms->mirror[i] == ms->default_mirror) continue; m = ms->mirror + i; @@ -682,7 +684,7 @@ static void do_recovery(struct mirror_set *ms) static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) { /* FIXME: add read balancing */ - return ms->mirror + DEFAULT_MIRROR; + return ms->default_mirror; } /* @@ -709,7 +711,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) if (rh_in_sync(&ms->rh, region, 0)) m = choose_mirror(ms, bio->bi_sector); else - m = ms->mirror + DEFAULT_MIRROR; + m = ms->default_mirror; map_bio(ms, m, bio); generic_make_request(bio); @@ -833,7 +835,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) rh_delay(&ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); + map_bio(ms, ms->default_mirror, bio); generic_make_request(bio); } } @@ -900,6 +902,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, ms->nr_mirrors = nr_mirrors; ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; + ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { ti->error = "dm-mirror: Error creating dirty region hash"; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ab54f99b7c3..87727d84dbb 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -371,6 +371,20 @@ static inline ulong round_up(ulong n, ulong size) return (n + size) & ~size; } +static void read_snapshot_metadata(struct dm_snapshot *s) +{ + if (s->have_metadata) + return; + + if (s->store.read_metadata(&s->store)) { + down_write(&s->lock); + s->valid = 0; + up_write(&s->lock); + } + + s->have_metadata = 1; +} + /* * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> */ @@ -677,7 +691,7 @@ static void copy_callback(int read_err, unsigned int write_err, void *context) /* * Dispatches the copy operation to kcopyd. */ -static inline void start_copy(struct pending_exception *pe) +static void start_copy(struct pending_exception *pe) { struct dm_snapshot *s = pe->snap; struct io_region src, dest; @@ -848,16 +862,7 @@ static void snapshot_resume(struct dm_target *ti) { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - if (s->have_metadata) - return; - - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); - } - - s->have_metadata = 1; + read_snapshot_metadata(s); } static int snapshot_status(struct dm_target *ti, status_type_t type, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a6d3baa46f6..a6f2dc66c3d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -638,7 +638,7 @@ int dm_split_args(int *argc, char ***argvp, char *input) static void check_for_valid_limits(struct io_restrictions *rs) { if (!rs->max_sectors) - rs->max_sectors = MAX_SECTORS; + rs->max_sectors = SAFE_MAX_SECTORS; if (!rs->max_phys_segments) rs->max_phys_segments = MAX_PHYS_SEGMENTS; if (!rs->max_hw_segments) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 930b9fc2795..8c16359f8b0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -55,6 +55,7 @@ union map_info *dm_get_mapinfo(struct bio *bio) */ #define DMF_BLOCK_IO 0 #define DMF_SUSPENDED 1 +#define DMF_FROZEN 2 struct mapped_device { struct rw_semaphore io_lock; @@ -97,7 +98,7 @@ struct mapped_device { * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *frozen_bdev; + struct block_device *suspended_bdev; }; #define MIN_IOS 256 @@ -292,7 +293,7 @@ struct dm_table *dm_get_table(struct mapped_device *md) * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -static inline void dec_pending(struct dm_io *io, int error) +static void dec_pending(struct dm_io *io, int error) { if (error) io->error = error; @@ -767,6 +768,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent) md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; blk_queue_make_request(md->queue, dm_request); + blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); md->queue->unplug_fn = dm_unplug_all; md->queue->issue_flush_fn = dm_flush_all; @@ -836,9 +838,9 @@ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - down(&md->frozen_bdev->bd_inode->i_sem); - i_size_write(md->frozen_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - up(&md->frozen_bdev->bd_inode->i_sem); + mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); + i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); + mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); } static int __bind(struct mapped_device *md, struct dm_table *t) @@ -902,10 +904,9 @@ int dm_create_with_minor(unsigned int minor, struct mapped_device **result) return create_aux(minor, 1, result); } -void *dm_get_mdptr(dev_t dev) +static struct mapped_device *dm_find_md(dev_t dev) { struct mapped_device *md; - void *mdptr = NULL; unsigned minor = MINOR(dev); if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) @@ -914,12 +915,32 @@ void *dm_get_mdptr(dev_t dev) down(&_minor_lock); md = idr_find(&_minor_idr, minor); - - if (md && (dm_disk(md)->first_minor == minor)) - mdptr = md->interface_ptr; + if (!md || (dm_disk(md)->first_minor != minor)) + md = NULL; up(&_minor_lock); + return md; +} + +struct mapped_device *dm_get_md(dev_t dev) +{ + struct mapped_device *md = dm_find_md(dev); + + if (md) + dm_get(md); + + return md; +} + +void *dm_get_mdptr(dev_t dev) +{ + struct mapped_device *md; + void *mdptr = NULL; + + md = dm_find_md(dev); + if (md) + mdptr = md->interface_ptr; return mdptr; } @@ -991,43 +1012,33 @@ out: */ static int lock_fs(struct mapped_device *md) { - int r = -ENOMEM; - - md->frozen_bdev = bdget_disk(md->disk, 0); - if (!md->frozen_bdev) { - DMWARN("bdget failed in lock_fs"); - goto out; - } + int r; WARN_ON(md->frozen_sb); - md->frozen_sb = freeze_bdev(md->frozen_bdev); + md->frozen_sb = freeze_bdev(md->suspended_bdev); if (IS_ERR(md->frozen_sb)) { r = PTR_ERR(md->frozen_sb); - goto out_bdput; + md->frozen_sb = NULL; + return r; } + set_bit(DMF_FROZEN, &md->flags); + /* don't bdput right now, we don't want the bdev - * to go away while it is locked. We'll bdput - * in unlock_fs + * to go away while it is locked. */ return 0; - -out_bdput: - bdput(md->frozen_bdev); - md->frozen_sb = NULL; - md->frozen_bdev = NULL; -out: - return r; } static void unlock_fs(struct mapped_device *md) { - thaw_bdev(md->frozen_bdev, md->frozen_sb); - bdput(md->frozen_bdev); + if (!test_bit(DMF_FROZEN, &md->flags)) + return; + thaw_bdev(md->suspended_bdev, md->frozen_sb); md->frozen_sb = NULL; - md->frozen_bdev = NULL; + clear_bit(DMF_FROZEN, &md->flags); } /* @@ -1037,7 +1048,7 @@ static void unlock_fs(struct mapped_device *md) * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ -int dm_suspend(struct mapped_device *md) +int dm_suspend(struct mapped_device *md, int do_lockfs) { struct dm_table *map = NULL; DECLARE_WAITQUEUE(wait, current); @@ -1053,10 +1064,19 @@ int dm_suspend(struct mapped_device *md) /* This does not get reverted if there's an error later. */ dm_table_presuspend_targets(map); - /* Flush I/O to the device. */ - r = lock_fs(md); - if (r) + md->suspended_bdev = bdget_disk(md->disk, 0); + if (!md->suspended_bdev) { + DMWARN("bdget failed in dm_suspend"); + r = -ENOMEM; goto out; + } + + /* Flush I/O to the device. */ + if (do_lockfs) { + r = lock_fs(md); + if (r) + goto out; + } /* * First we set the BLOCK_IO flag so no more ios will be mapped. @@ -1105,6 +1125,11 @@ int dm_suspend(struct mapped_device *md) r = 0; out: + if (r && md->suspended_bdev) { + bdput(md->suspended_bdev); + md->suspended_bdev = NULL; + } + dm_table_put(map); up(&md->suspend_lock); return r; @@ -1135,6 +1160,9 @@ int dm_resume(struct mapped_device *md) unlock_fs(md); + bdput(md->suspended_bdev); + md->suspended_bdev = NULL; + clear_bit(DMF_SUSPENDED, &md->flags); dm_table_unplug_all(map); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e38c3fc1a1d..4eaf075da21 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -28,7 +28,7 @@ * in types.h. */ #ifdef CONFIG_LBD -#define SECTOR_FORMAT "%Lu" +#define SECTOR_FORMAT "%llu" #else #define SECTOR_FORMAT "%lu" #endif @@ -58,6 +58,7 @@ int dm_create(struct mapped_device **md); int dm_create_with_minor(unsigned int minor, struct mapped_device **md); void dm_set_mdptr(struct mapped_device *md, void *ptr); void *dm_get_mdptr(dev_t dev); +struct mapped_device *dm_get_md(dev_t dev); /* * Reference counting for md. @@ -68,7 +69,7 @@ void dm_put(struct mapped_device *md); /* * A device can still be used while suspended, but I/O is deferred. */ -int dm_suspend(struct mapped_device *md); +int dm_suspend(struct mapped_device *md, int with_lockfs); int dm_resume(struct mapped_device *md); /* diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 0248f8e7eac..a7a5ab55433 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -316,9 +316,10 @@ static int stop(mddev_t *mddev) return 0; } -static mdk_personality_t faulty_personality = +static struct mdk_personality faulty_personality = { .name = "faulty", + .level = LEVEL_FAULTY, .owner = THIS_MODULE, .make_request = make_request, .run = run, @@ -329,15 +330,17 @@ static mdk_personality_t faulty_personality = static int __init raid_init(void) { - return register_md_personality(FAULTY, &faulty_personality); + return register_md_personality(&faulty_personality); } static void raid_exit(void) { - unregister_md_personality(FAULTY); + unregister_md_personality(&faulty_personality); } module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-10"); /* faulty */ +MODULE_ALIAS("md-faulty"); +MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index eb703648597..8b3515f394a 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -8,6 +8,7 @@ * completion notification. */ +#include <asm/types.h> #include <asm/atomic.h> #include <linux/blkdev.h> @@ -561,11 +562,13 @@ int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, * Cancels a kcopyd job, eg. someone might be deactivating a * mirror. */ +#if 0 int kcopyd_cancel(struct kcopyd_job *job, int block) { /* FIXME: finish */ return -1; } +#endif /* 0 */ /*----------------------------------------------------------------- * Unit setup @@ -684,4 +687,3 @@ void kcopyd_client_destroy(struct kcopyd_client *kc) EXPORT_SYMBOL(kcopyd_client_create); EXPORT_SYMBOL(kcopyd_client_destroy); EXPORT_SYMBOL(kcopyd_copy); -EXPORT_SYMBOL(kcopyd_cancel); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 946efef3a8f..777585458c8 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -121,11 +121,10 @@ static int linear_run (mddev_t *mddev) sector_t curr_offset; struct list_head *tmp; - conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), + conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), GFP_KERNEL); if (!conf) goto out; - memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); mddev->private = conf; cnt = 0; @@ -352,9 +351,10 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev) } -static mdk_personality_t linear_personality= +static struct mdk_personality linear_personality = { .name = "linear", + .level = LEVEL_LINEAR, .owner = THIS_MODULE, .make_request = linear_make_request, .run = linear_run, @@ -364,16 +364,18 @@ static mdk_personality_t linear_personality= static int __init linear_init (void) { - return register_md_personality (LINEAR, &linear_personality); + return register_md_personality (&linear_personality); } static void linear_exit (void) { - unregister_md_personality (LINEAR); + unregister_md_personality (&linear_personality); } module_init(linear_init); module_exit(linear_exit); MODULE_LICENSE("GPL"); -MODULE_ALIAS("md-personality-1"); /* LINEAR */ +MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ +MODULE_ALIAS("md-linear"); +MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md.c b/drivers/md/md.c index cd12fca73b0..7145cd150f7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -42,6 +42,7 @@ #include <linux/devfs_fs_kernel.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/suspend.h> +#include <linux/poll.h> #include <linux/init.h> @@ -67,7 +68,7 @@ static void autostart_arrays (int part); #endif -static mdk_personality_t *pers[MAX_PERSONALITY]; +static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); /* @@ -80,10 +81,22 @@ static DEFINE_SPINLOCK(pers_lock); * idle IO detection. * * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + * or /sys/block/mdX/md/sync_speed_{min,max} */ static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; +static inline int speed_min(mddev_t *mddev) +{ + return mddev->sync_speed_min ? + mddev->sync_speed_min : sysctl_speed_limit_min; +} + +static inline int speed_max(mddev_t *mddev) +{ + return mddev->sync_speed_max ? + mddev->sync_speed_max : sysctl_speed_limit_max; +} static struct ctl_table_header *raid_table_header; @@ -134,6 +147,24 @@ static struct block_device_operations md_fops; static int start_readonly; /* + * We have a system wide 'event count' that is incremented + * on any 'interesting' event, and readers of /proc/mdstat + * can use 'poll' or 'select' to find out when the event + * count increases. + * + * Events are: + * start array, stop array, error, add device, remove device, + * start build, activate spare + */ +static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); +static atomic_t md_event_count; +static void md_new_event(mddev_t *mddev) +{ + atomic_inc(&md_event_count); + wake_up(&md_event_waiters); +} + +/* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. */ @@ -209,12 +240,10 @@ static mddev_t * mddev_find(dev_t unit) } spin_unlock(&all_mddevs_lock); - new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); + new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; - memset(new, 0, sizeof(*new)); - new->unit = unit; if (MAJOR(unit) == MD_MAJOR) new->md_minor = MINOR(unit); @@ -262,7 +291,7 @@ static inline void mddev_unlock(mddev_t * mddev) md_wakeup_thread(mddev->thread); } -mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) { mdk_rdev_t * rdev; struct list_head *tmp; @@ -286,6 +315,18 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) return NULL; } +static struct mdk_personality *find_pers(int level, char *clevel) +{ + struct mdk_personality *pers; + list_for_each_entry(pers, &pers_list, list) { + if (level != LEVEL_NONE && pers->level == level) + return pers; + if (strcmp(pers->name, clevel)==0) + return pers; + } + return NULL; +} + static inline sector_t calc_dev_sboffset(struct block_device *bdev) { sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; @@ -320,7 +361,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) static void free_disk_sb(mdk_rdev_t * rdev) { if (rdev->sb_page) { - page_cache_release(rdev->sb_page); + put_page(rdev->sb_page); rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_offset = 0; @@ -461,6 +502,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, bio_put(bio); return ret; } +EXPORT_SYMBOL_GPL(sync_page_io); static int read_disk_sb(mdk_rdev_t * rdev, int size) { @@ -665,6 +707,10 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version } rdev->size = calc_dev_size(rdev, sb->chunk_size); + if (rdev->size < sb->size && sb->level > 1) + /* "this cannot possibly happen" ... */ + ret = -EINVAL; + abort: return ret; } @@ -688,6 +734,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; + mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; mddev->size = sb->size; @@ -714,9 +761,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_file == NULL) { - if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { + if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 + && mddev->level != 10) { /* FIXME use a better test */ - printk(KERN_WARNING "md: bitmaps only support for raid1\n"); + printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); return -EINVAL; } mddev->bitmap_offset = mddev->default_bitmap_offset; @@ -968,6 +1016,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; @@ -1006,6 +1055,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->size = le64_to_cpu(sb->data_size)/2; if (le32_to_cpu(sb->chunksize)) rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); + + if (le32_to_cpu(sb->size) > rdev->size*2) + return -EINVAL; return 0; } @@ -1023,6 +1075,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); mddev->level = le32_to_cpu(sb->level); + mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->size = le64_to_cpu(sb->size)/2; @@ -1037,8 +1090,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_file == NULL ) { - if (mddev->level != 1) { - printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); + if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 + && mddev->level != 10) { + printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); return -EINVAL; } mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); @@ -1105,6 +1159,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) else sb->resync_offset = cpu_to_le64(0); + sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); + if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); @@ -1182,11 +1238,20 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) mdk_rdev_t *same_pdev; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; struct kobject *ko; + char *s; if (rdev->mddev) { MD_BUG(); return -EINVAL; } + /* make sure rdev->size exceeds mddev->size */ + if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { + if (mddev->pers) + /* Cannot change size, so fail */ + return -ENOSPC; + else + mddev->size = rdev->size; + } same_pdev = match_dev_unit(mddev, rdev); if (same_pdev) printk(KERN_WARNING @@ -1213,6 +1278,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) bdevname(rdev->bdev,b); if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) return -ENOMEM; + while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) + *s = '!'; list_add(&rdev->same_set, &mddev->disks); rdev->mddev = mddev; @@ -1496,6 +1563,26 @@ repeat: } +/* words written to sysfs files may, or my not, be \n terminated. + * We want to accept with case. For this we use cmd_match. + */ +static int cmd_match(const char *cmd, const char *str) +{ + /* See if cmd, written into a sysfs file, matches + * str. They must either be the same, or cmd can + * have a trailing newline + */ + while (*cmd && *str && *cmd == *str) { + cmd++; + str++; + } + if (*cmd == '\n') + cmd++; + if (*str || *cmd) + return 0; + return 1; +} + struct rdev_sysfs_entry { struct attribute attr; ssize_t (*show)(mdk_rdev_t *, char *); @@ -1538,9 +1625,113 @@ super_show(mdk_rdev_t *rdev, char *page) } static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); +static ssize_t +errors_show(mdk_rdev_t *rdev, char *page) +{ + return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); +} + +static ssize_t +errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + if (*buf && (*e == 0 || *e == '\n')) { + atomic_set(&rdev->corrected_errors, n); + return len; + } + return -EINVAL; +} +static struct rdev_sysfs_entry rdev_errors = +__ATTR(errors, 0644, errors_show, errors_store); + +static ssize_t +slot_show(mdk_rdev_t *rdev, char *page) +{ + if (rdev->raid_disk < 0) + return sprintf(page, "none\n"); + else + return sprintf(page, "%d\n", rdev->raid_disk); +} + +static ssize_t +slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + char *e; + int slot = simple_strtoul(buf, &e, 10); + if (strncmp(buf, "none", 4)==0) + slot = -1; + else if (e==buf || (*e && *e!= '\n')) + return -EINVAL; + if (rdev->mddev->pers) + /* Cannot set slot in active array (yet) */ + return -EBUSY; + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; + rdev->raid_disk = slot; + /* assume it is working */ + rdev->flags = 0; + set_bit(In_sync, &rdev->flags); + return len; +} + + +static struct rdev_sysfs_entry rdev_slot = +__ATTR(slot, 0644, slot_show, slot_store); + +static ssize_t +offset_show(mdk_rdev_t *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); +} + +static ssize_t +offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + char *e; + unsigned long long offset = simple_strtoull(buf, &e, 10); + if (e==buf || (*e && *e != '\n')) + return -EINVAL; + if (rdev->mddev->pers) + return -EBUSY; + rdev->data_offset = offset; + return len; +} + +static struct rdev_sysfs_entry rdev_offset = +__ATTR(offset, 0644, offset_show, offset_store); + +static ssize_t +rdev_size_show(mdk_rdev_t *rdev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)rdev->size); +} + +static ssize_t +rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + char *e; + unsigned long long size = simple_strtoull(buf, &e, 10); + if (e==buf || (*e && *e != '\n')) + return -EINVAL; + if (rdev->mddev->pers) + return -EBUSY; + rdev->size = size; + if (size < rdev->mddev->size || rdev->mddev->size == 0) + rdev->mddev->size = size; + return len; +} + +static struct rdev_sysfs_entry rdev_size = +__ATTR(size, 0644, rdev_size_show, rdev_size_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_super.attr, + &rdev_errors.attr, + &rdev_slot.attr, + &rdev_offset.attr, + &rdev_size.attr, NULL, }; static ssize_t @@ -1598,12 +1789,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi mdk_rdev_t *rdev; sector_t size; - rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) { printk(KERN_ERR "md: could not alloc mem for new device!\n"); return ERR_PTR(-ENOMEM); } - memset(rdev, 0, sizeof(*rdev)); if ((err = alloc_disk_sb(rdev))) goto abort_free; @@ -1621,6 +1811,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev->data_offset = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; if (!size) { @@ -1725,16 +1916,37 @@ static void analyze_sbs(mddev_t * mddev) static ssize_t level_show(mddev_t *mddev, char *page) { - mdk_personality_t *p = mddev->pers; - if (p == NULL && mddev->raid_disks == 0) - return 0; - if (mddev->level >= 0) - return sprintf(page, "RAID-%d\n", mddev->level); - else + struct mdk_personality *p = mddev->pers; + if (p) return sprintf(page, "%s\n", p->name); + else if (mddev->clevel[0]) + return sprintf(page, "%s\n", mddev->clevel); + else if (mddev->level != LEVEL_NONE) + return sprintf(page, "%d\n", mddev->level); + else + return 0; } -static struct md_sysfs_entry md_level = __ATTR_RO(level); +static ssize_t +level_store(mddev_t *mddev, const char *buf, size_t len) +{ + int rv = len; + if (mddev->pers) + return -EBUSY; + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; +} + +static struct md_sysfs_entry md_level = +__ATTR(level, 0644, level_show, level_store); static ssize_t raid_disks_show(mddev_t *mddev, char *page) @@ -1744,7 +1956,197 @@ raid_disks_show(mddev_t *mddev, char *page) return sprintf(page, "%d\n", mddev->raid_disks); } -static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks); +static int update_raid_disks(mddev_t *mddev, int raid_disks); + +static ssize_t +raid_disks_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* can only set raid_disks if array is not yet active */ + char *e; + int rv = 0; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + if (mddev->pers) + rv = update_raid_disks(mddev, n); + else + mddev->raid_disks = n; + return rv ? rv : len; +} +static struct md_sysfs_entry md_raid_disks = +__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store); + +static ssize_t +chunk_size_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->chunk_size); +} + +static ssize_t +chunk_size_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* can only set chunk_size if array is not yet active */ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (mddev->pers) + return -EBUSY; + if (!*buf || (*e && *e != '\n')) + return -EINVAL; + + mddev->chunk_size = n; + return len; +} +static struct md_sysfs_entry md_chunk_size = +__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store); + +static ssize_t +null_show(mddev_t *mddev, char *page) +{ + return -EINVAL; +} + +static ssize_t +new_dev_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* buf must be %d:%d\n? giving major and minor numbers */ + /* The new device is added to the array. + * If the array has a persistent superblock, we read the + * superblock to initialise info and check validity. + * Otherwise, only checking done is that in bind_rdev_to_array, + * which mainly checks size. + */ + char *e; + int major = simple_strtoul(buf, &e, 10); + int minor; + dev_t dev; + mdk_rdev_t *rdev; + int err; + + if (!*buf || *e != ':' || !e[1] || e[1] == '\n') + return -EINVAL; + minor = simple_strtoul(e+1, &e, 10); + if (*e && *e != '\n') + return -EINVAL; + dev = MKDEV(major, minor); + if (major != MAJOR(dev) || + minor != MINOR(dev)) + return -EOVERFLOW; + + + if (mddev->persistent) { + rdev = md_import_device(dev, mddev->major_version, + mddev->minor_version); + if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { + mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + err = super_types[mddev->major_version] + .load_super(rdev, rdev0, mddev->minor_version); + if (err < 0) + goto out; + } + } else + rdev = md_import_device(dev, -1, -1); + + if (IS_ERR(rdev)) + return PTR_ERR(rdev); + err = bind_rdev_to_array(rdev, mddev); + out: + if (err) + export_rdev(rdev); + return err ? err : len; +} + +static struct md_sysfs_entry md_new_device = +__ATTR(new_dev, 0200, null_show, new_dev_store); + +static ssize_t +size_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->size); +} + +static int update_size(mddev_t *mddev, unsigned long size); + +static ssize_t +size_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* If array is inactive, we can reduce the component size, but + * not increase it (except from 0). + * If array is active, we can try an on-line resize + */ + char *e; + int err = 0; + unsigned long long size = simple_strtoull(buf, &e, 10); + if (!*buf || *buf == '\n' || + (*e && *e != '\n')) + return -EINVAL; + + if (mddev->pers) { + err = update_size(mddev, size); + md_update_sb(mddev); + } else { + if (mddev->size == 0 || + mddev->size > size) + mddev->size = size; + else + err = -ENOSPC; + } + return err ? err : len; +} + +static struct md_sysfs_entry md_size = +__ATTR(component_size, 0644, size_show, size_store); + + +/* Metdata version. + * This is either 'none' for arrays with externally managed metadata, + * or N.M for internally known formats + */ +static ssize_t +metadata_show(mddev_t *mddev, char *page) +{ + if (mddev->persistent) + return sprintf(page, "%d.%d\n", + mddev->major_version, mddev->minor_version); + else + return sprintf(page, "none\n"); +} + +static ssize_t +metadata_store(mddev_t *mddev, const char *buf, size_t len) +{ + int major, minor; + char *e; + if (!list_empty(&mddev->disks)) + return -EBUSY; + + if (cmd_match(buf, "none")) { + mddev->persistent = 0; + mddev->major_version = 0; + mddev->minor_version = 90; + return len; + } + major = simple_strtoul(buf, &e, 10); + if (e==buf || *e != '.') + return -EINVAL; + buf = e+1; + minor = simple_strtoul(buf, &e, 10); + if (e==buf || *e != '\n') + return -EINVAL; + if (major >= sizeof(super_types)/sizeof(super_types[0]) || + super_types[major].name == NULL) + return -ENOENT; + mddev->major_version = major; + mddev->minor_version = minor; + mddev->persistent = 1; + return len; +} + +static struct md_sysfs_entry md_metadata = +__ATTR(metadata_version, 0644, metadata_show, metadata_store); static ssize_t action_show(mddev_t *mddev, char *page) @@ -1771,31 +2173,27 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) { + if (cmd_match(page, "idle")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); mddev->sync_thread = NULL; mddev->recovery = 0; } - return len; - } - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; - if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 || - strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0) + else if (cmd_match(page, "resync") || cmd_match(page, "recover")) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); else { - if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) + if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) + else if (cmd_match(page, "repair")) return -EINVAL; set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); return len; } @@ -1814,15 +2212,107 @@ md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); +static ssize_t +sync_min_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", speed_min(mddev), + mddev->sync_speed_min ? "local": "system"); +} + +static ssize_t +sync_min_store(mddev_t *mddev, const char *buf, size_t len) +{ + int min; + char *e; + if (strncmp(buf, "system", 6)==0) { + mddev->sync_speed_min = 0; + return len; + } + min = simple_strtoul(buf, &e, 10); + if (buf == e || (*e && *e != '\n') || min <= 0) + return -EINVAL; + mddev->sync_speed_min = min; + return len; +} + +static struct md_sysfs_entry md_sync_min = +__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); + +static ssize_t +sync_max_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", speed_max(mddev), + mddev->sync_speed_max ? "local": "system"); +} + +static ssize_t +sync_max_store(mddev_t *mddev, const char *buf, size_t len) +{ + int max; + char *e; + if (strncmp(buf, "system", 6)==0) { + mddev->sync_speed_max = 0; + return len; + } + max = simple_strtoul(buf, &e, 10); + if (buf == e || (*e && *e != '\n') || max <= 0) + return -EINVAL; + mddev->sync_speed_max = max; + return len; +} + +static struct md_sysfs_entry md_sync_max = +__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); + + +static ssize_t +sync_speed_show(mddev_t *mddev, char *page) +{ + unsigned long resync, dt, db; + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt); + return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ +} + +static struct md_sysfs_entry +md_sync_speed = __ATTR_RO(sync_speed); + +static ssize_t +sync_completed_show(mddev_t *mddev, char *page) +{ + unsigned long max_blocks, resync; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_blocks = mddev->resync_max_sectors; + else + max_blocks = mddev->size << 1; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); + return sprintf(page, "%lu / %lu\n", resync, max_blocks); +} + +static struct md_sysfs_entry +md_sync_completed = __ATTR_RO(sync_completed); + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_raid_disks.attr, + &md_chunk_size.attr, + &md_size.attr, + &md_metadata.attr, + &md_new_device.attr, NULL, }; static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, &md_mismatches.attr, + &md_sync_min.attr, + &md_sync_max.attr, + &md_sync_speed.attr, + &md_sync_completed.attr, NULL, }; static struct attribute_group md_redundancy_group = { @@ -1937,14 +2427,16 @@ static void md_safemode_timeout(unsigned long data) md_wakeup_thread(mddev->thread); } +static int start_dirty_degraded; static int do_md_run(mddev_t * mddev) { - int pnum, err; + int err; int chunk_size; struct list_head *tmp; mdk_rdev_t *rdev; struct gendisk *disk; + struct mdk_personality *pers; char b[BDEVNAME_SIZE]; if (list_empty(&mddev->disks)) @@ -1961,20 +2453,8 @@ static int do_md_run(mddev_t * mddev) analyze_sbs(mddev); chunk_size = mddev->chunk_size; - pnum = level_to_pers(mddev->level); - if ((pnum != MULTIPATH) && (pnum != RAID1)) { - if (!chunk_size) { - /* - * 'default chunksize' in the old md code used to - * be PAGE_SIZE, baaad. - * we abort here to be on the safe side. We don't - * want to continue the bad practice. - */ - printk(KERN_ERR - "no chunksize specified, see 'man raidtab'\n"); - return -EINVAL; - } + if (chunk_size) { if (chunk_size > MAX_CHUNK_SIZE) { printk(KERN_ERR "too big chunk_size: %d > %d\n", chunk_size, MAX_CHUNK_SIZE); @@ -2010,10 +2490,10 @@ static int do_md_run(mddev_t * mddev) } #ifdef CONFIG_KMOD - if (!pers[pnum]) - { - request_module("md-personality-%d", pnum); - } + if (mddev->level != LEVEL_NONE) + request_module("md-level-%d", mddev->level); + else if (mddev->clevel[0]) + request_module("md-%s", mddev->clevel); #endif /* @@ -2035,30 +2515,39 @@ static int do_md_run(mddev_t * mddev) return -ENOMEM; spin_lock(&pers_lock); - if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { + pers = find_pers(mddev->level, mddev->clevel); + if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); - printk(KERN_WARNING "md: personality %d is not loaded!\n", - pnum); + if (mddev->level != LEVEL_NONE) + printk(KERN_WARNING "md: personality for level %d is not loaded!\n", + mddev->level); + else + printk(KERN_WARNING "md: personality for level %s is not loaded!\n", + mddev->clevel); return -EINVAL; } - - mddev->pers = pers[pnum]; + mddev->pers = pers; spin_unlock(&pers_lock); + mddev->level = pers->level; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); mddev->recovery = 0; mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ mddev->barriers_work = 1; + mddev->ok_start_degraded = start_dirty_degraded; if (start_readonly) mddev->ro = 2; /* read-only, but switch on first write */ - /* before we start the array running, initialise the bitmap */ - err = bitmap_create(mddev); - if (err) - printk(KERN_ERR "%s: failed to create bitmap (%d)\n", - mdname(mddev), err); - else - err = mddev->pers->run(mddev); + err = mddev->pers->run(mddev); + if (!err && mddev->pers->sync_request) { + err = bitmap_create(mddev); + if (err) { + printk(KERN_ERR "%s: failed to create bitmap (%d)\n", + mdname(mddev), err); + mddev->pers->stop(mddev); + } + } if (err) { printk(KERN_ERR "md: pers->run() failed ...\n"); module_put(mddev->pers->owner); @@ -2104,6 +2593,7 @@ static int do_md_run(mddev_t * mddev) mddev->queue->make_request_fn = mddev->pers->make_request; mddev->changed = 1; + md_new_event(mddev); return 0; } @@ -2231,6 +2721,7 @@ static int do_md_stop(mddev_t * mddev, int ro) printk(KERN_INFO "md: %s switched to read-only mode.\n", mdname(mddev)); err = 0; + md_new_event(mddev); out: return err; } @@ -2668,12 +3159,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); - err = bind_rdev_to_array(rdev, mddev); - if (err) { - export_rdev(rdev); - return err; - } - if (!mddev->persistent) { printk(KERN_INFO "md: nonpersistent superblock ...\n"); rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; @@ -2681,8 +3166,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) rdev->sb_offset = calc_dev_sboffset(rdev->bdev); rdev->size = calc_dev_size(rdev, mddev->chunk_size); - if (!mddev->size || (mddev->size > rdev->size)) - mddev->size = rdev->size; + err = bind_rdev_to_array(rdev, mddev); + if (err) { + export_rdev(rdev); + return err; + } } return 0; @@ -2705,6 +3193,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) kick_rdev_from_array(rdev); md_update_sb(mddev); + md_new_event(mddev); return 0; busy: @@ -2753,15 +3242,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) size = calc_dev_size(rdev, mddev->chunk_size); rdev->size = size; - if (size < mddev->size) { - printk(KERN_WARNING - "%s: disk size %llu blocks < array size %llu\n", - mdname(mddev), (unsigned long long)size, - (unsigned long long)mddev->size); - err = -ENOSPC; - goto abort_export; - } - if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING "md: can not hot-add faulty %s disk to %s!\n", @@ -2771,7 +3251,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; - bind_rdev_to_array(rdev, mddev); + err = bind_rdev_to_array(rdev, mddev); + if (err) + goto abort_export; /* * The rest should better be atomic, we can have disk failures @@ -2795,7 +3277,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - + md_new_event(mddev); return 0; abort_unbind_export: @@ -2913,6 +3395,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->ctime = get_seconds(); mddev->level = info->level; + mddev->clevel[0] = 0; mddev->size = info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was @@ -2942,6 +3425,81 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) return 0; } +static int update_size(mddev_t *mddev, unsigned long size) +{ + mdk_rdev_t * rdev; + int rv; + struct list_head *tmp; + + if (mddev->pers->resize == NULL) + return -EINVAL; + /* The "size" is the amount of each device that is used. + * This can only make sense for arrays with redundancy. + * linear and raid0 always use whatever space is available + * We can only consider changing the size if no resync + * or reconstruction is happening, and if the new size + * is acceptable. It must fit before the sb_offset or, + * if that is <data_offset, it must fit before the + * size of each device. + * If size is zero, we find the largest size that fits. + */ + if (mddev->sync_thread) + return -EBUSY; + ITERATE_RDEV(mddev,rdev,tmp) { + sector_t avail; + int fit = (size == 0); + if (rdev->sb_offset > rdev->data_offset) + avail = (rdev->sb_offset*2) - rdev->data_offset; + else + avail = get_capacity(rdev->bdev->bd_disk) + - rdev->data_offset; + if (fit && (size == 0 || size > avail/2)) + size = avail/2; + if (avail < ((sector_t)size << 1)) + return -ENOSPC; + } + rv = mddev->pers->resize(mddev, (sector_t)size *2); + if (!rv) { + struct block_device *bdev; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } + return rv; +} + +static int update_raid_disks(mddev_t *mddev, int raid_disks) +{ + int rv; + /* change the number of raid disks */ + if (mddev->pers->reshape == NULL) + return -EINVAL; + if (raid_disks <= 0 || + raid_disks >= mddev->max_disks) + return -EINVAL; + if (mddev->sync_thread) + return -EBUSY; + rv = mddev->pers->reshape(mddev, raid_disks); + if (!rv) { + struct block_device *bdev; + + bdev = bdget_disk(mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + } + return rv; +} + + /* * update_array_info is used to change the configuration of an * on-line array. @@ -2990,71 +3548,12 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) else return mddev->pers->reconfig(mddev, info->layout, -1); } - if (mddev->size != info->size) { - mdk_rdev_t * rdev; - struct list_head *tmp; - if (mddev->pers->resize == NULL) - return -EINVAL; - /* The "size" is the amount of each device that is used. - * This can only make sense for arrays with redundancy. - * linear and raid0 always use whatever space is available - * We can only consider changing the size if no resync - * or reconstruction is happening, and if the new size - * is acceptable. It must fit before the sb_offset or, - * if that is <data_offset, it must fit before the - * size of each device. - * If size is zero, we find the largest size that fits. - */ - if (mddev->sync_thread) - return -EBUSY; - ITERATE_RDEV(mddev,rdev,tmp) { - sector_t avail; - int fit = (info->size == 0); - if (rdev->sb_offset > rdev->data_offset) - avail = (rdev->sb_offset*2) - rdev->data_offset; - else - avail = get_capacity(rdev->bdev->bd_disk) - - rdev->data_offset; - if (fit && (info->size == 0 || info->size > avail/2)) - info->size = avail/2; - if (avail < ((sector_t)info->size << 1)) - return -ENOSPC; - } - rv = mddev->pers->resize(mddev, (sector_t)info->size *2); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - down(&bdev->bd_inode->i_sem); - i_size_write(bdev->bd_inode, mddev->array_size << 10); - up(&bdev->bd_inode->i_sem); - bdput(bdev); - } - } - } - if (mddev->raid_disks != info->raid_disks) { - /* change the number of raid disks */ - if (mddev->pers->reshape == NULL) - return -EINVAL; - if (info->raid_disks <= 0 || - info->raid_disks >= mddev->max_disks) - return -EINVAL; - if (mddev->sync_thread) - return -EBUSY; - rv = mddev->pers->reshape(mddev, info->raid_disks); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - down(&bdev->bd_inode->i_sem); - i_size_write(bdev->bd_inode, mddev->array_size << 10); - up(&bdev->bd_inode->i_sem); - bdput(bdev); - } - } - } + if (mddev->size != info->size) + rv = update_size(mddev, info->size); + + if (mddev->raid_disks != info->raid_disks) + rv = update_raid_disks(mddev, info->raid_disks); + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { if (mddev->pers->quiesce == NULL) return -EINVAL; @@ -3103,12 +3602,21 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) return 0; } +static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + mddev_t *mddev = bdev->bd_disk->private_data; + + geo->heads = 2; + geo->sectors = 4; + geo->cylinders = get_capacity(mddev->gendisk) / 8; + return 0; +} + static int md_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { int err = 0; void __user *argp = (void __user *)arg; - struct hd_geometry __user *loc = argp; mddev_t *mddev = NULL; if (!capable(CAP_SYS_ADMIN)) @@ -3270,24 +3778,6 @@ static int md_ioctl(struct inode *inode, struct file *file, * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ - case HDIO_GETGEO: - if (!loc) { - err = -EINVAL; - goto abort_unlock; - } - err = put_user (2, (char __user *) &loc->heads); - if (err) - goto abort_unlock; - err = put_user (4, (char __user *) &loc->sectors); - if (err) - goto abort_unlock; - err = put_user(get_capacity(mddev->gendisk)/8, - (short __user *) &loc->cylinders); - if (err) - goto abort_unlock; - err = put_user (get_start_sect(inode->i_bdev), - (long __user *) &loc->start); - goto done_unlock; } /* @@ -3416,6 +3906,7 @@ static struct block_device_operations md_fops = .open = md_open, .release = md_release, .ioctl = md_ioctl, + .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, }; @@ -3476,11 +3967,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, { mdk_thread_t *thread; - thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); + thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); if (!thread) return NULL; - memset(thread, 0, sizeof(mdk_thread_t)); init_waitqueue_head(&thread->wqueue); thread->run = run; @@ -3524,6 +4014,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + md_new_event(mddev); } /* seq_file implementation /proc/mdstat */ @@ -3664,24 +4155,29 @@ static void md_seq_stop(struct seq_file *seq, void *v) mddev_put(mddev); } +struct mdstat_info { + int event; +}; + static int md_seq_show(struct seq_file *seq, void *v) { mddev_t *mddev = v; sector_t size; struct list_head *tmp2; mdk_rdev_t *rdev; - int i; + struct mdstat_info *mi = seq->private; struct bitmap *bitmap; if (v == (void*)1) { + struct mdk_personality *pers; seq_printf(seq, "Personalities : "); spin_lock(&pers_lock); - for (i = 0; i < MAX_PERSONALITY; i++) - if (pers[i]) - seq_printf(seq, "[%s] ", pers[i]->name); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); spin_unlock(&pers_lock); seq_printf(seq, "\n"); + mi->event = atomic_read(&md_event_count); return 0; } if (v == (void*)2) { @@ -3790,47 +4286,68 @@ static struct seq_operations md_seq_ops = { static int md_seq_open(struct inode *inode, struct file *file) { int error; + struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); + if (mi == NULL) + return -ENOMEM; error = seq_open(file, &md_seq_ops); + if (error) + kfree(mi); + else { + struct seq_file *p = file->private_data; + p->private = mi; + mi->event = atomic_read(&md_event_count); + } return error; } +static int md_seq_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct mdstat_info *mi = m->private; + m->private = NULL; + kfree(mi); + return seq_release(inode, file); +} + +static unsigned int mdstat_poll(struct file *filp, poll_table *wait) +{ + struct seq_file *m = filp->private_data; + struct mdstat_info *mi = m->private; + int mask; + + poll_wait(filp, &md_event_waiters, wait); + + /* always allow read */ + mask = POLLIN | POLLRDNORM; + + if (mi->event != atomic_read(&md_event_count)) + mask |= POLLERR | POLLPRI; + return mask; +} + static struct file_operations md_seq_fops = { .open = md_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = md_seq_release, + .poll = mdstat_poll, }; -int register_md_personality(int pnum, mdk_personality_t *p) +int register_md_personality(struct mdk_personality *p) { - if (pnum >= MAX_PERSONALITY) { - printk(KERN_ERR - "md: tried to install personality %s as nr %d, but max is %lu\n", - p->name, pnum, MAX_PERSONALITY-1); - return -EINVAL; - } - spin_lock(&pers_lock); - if (pers[pnum]) { - spin_unlock(&pers_lock); - return -EBUSY; - } - - pers[pnum] = p; - printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); + list_add_tail(&p->list, &pers_list); + printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); spin_unlock(&pers_lock); return 0; } -int unregister_md_personality(int pnum) +int unregister_md_personality(struct mdk_personality *p) { - if (pnum >= MAX_PERSONALITY) - return -EINVAL; - - printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); + printk(KERN_INFO "md: %s personality unregistered\n", p->name); spin_lock(&pers_lock); - pers[pnum] = NULL; + list_del_init(&p->list); spin_unlock(&pers_lock); return 0; } @@ -4012,10 +4529,10 @@ static void md_do_sync(mddev_t *mddev) printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" - " %d KB/sec/disc.\n", sysctl_speed_limit_min); + " %d KB/sec/disc.\n", speed_min(mddev)); printk(KERN_INFO "md: using maximum available idle IO bandwidth " "(but not more than %d KB/sec) for reconstruction.\n", - sysctl_speed_limit_max); + speed_max(mddev)); is_mddev_idle(mddev); /* this also initializes IO event counters */ /* we don't use the checkpoint if there's a bitmap */ @@ -4056,7 +4573,7 @@ static void md_do_sync(mddev_t *mddev) skipped = 0; sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < sysctl_speed_limit_min); + currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_ERR, &mddev->recovery); goto out; @@ -4069,7 +4586,11 @@ static void md_do_sync(mddev_t *mddev) j += sectors; if (j>1) mddev->curr_resync = j; - + if (last_check == 0) + /* this is the earliers that rebuilt will be + * visible in /proc/mdstat + */ + md_new_event(mddev); if (last_check + window > io_sectors || j == max_sectors) continue; @@ -4117,8 +4638,8 @@ static void md_do_sync(mddev_t *mddev) currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 /((jiffies-mddev->resync_mark)/HZ +1) +1; - if (currspeed > sysctl_speed_limit_min) { - if ((currspeed > sysctl_speed_limit_max) || + if (currspeed > speed_min(mddev)) { + if ((currspeed > speed_max(mddev)) || !is_mddev_idle(mddev)) { msleep(500); goto repeat; @@ -4255,6 +4776,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_new_event(mddev); goto unlock; } /* Clear some bits that don't mean anything, but @@ -4292,6 +4814,7 @@ void md_check_recovery(mddev_t *mddev) sprintf(nm, "rd%d", rdev->raid_disk); sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); spares++; + md_new_event(mddev); } else break; } @@ -4324,9 +4847,9 @@ void md_check_recovery(mddev_t *mddev) mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ mddev->recovery = 0; - } else { + } else md_wakeup_thread(mddev->sync_thread); - } + md_new_event(mddev); } unlock: mddev_unlock(mddev); @@ -4503,12 +5026,14 @@ static int set_ro(const char *val, struct kernel_param *kp) int num = simple_strtoul(val, &e, 10); if (*val && (*e == '\0' || *e == '\n')) { start_readonly = num; - return 0;; + return 0; } return -EINVAL; } module_param_call(start_ro, set_ro, get_ro, NULL, 0600); +module_param(start_dirty_degraded, int, 0644); + EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 145cdc5ad00..96f7af4ae40 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -35,15 +35,10 @@ #define NR_RESERVED_BUFS 32 -static mdk_personality_t multipath_personality; - - static void *mp_pool_alloc(gfp_t gfp_flags, void *data) { struct multipath_bh *mpb; - mpb = kmalloc(sizeof(*mpb), gfp_flags); - if (mpb) - memset(mpb, 0, sizeof(*mpb)); + mpb = kzalloc(sizeof(*mpb), gfp_flags); return mpb; } @@ -308,6 +303,7 @@ static void print_multipath_conf (multipath_conf_t *conf) static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { multipath_conf_t *conf = mddev->private; + struct request_queue *q; int found = 0; int path; struct multipath_info *p; @@ -316,8 +312,8 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (path=0; path<mddev->raid_disks; path++) if ((p=conf->multipaths+path)->rdev == NULL) { - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); + q = rdev->bdev->bd_disk->queue; + blk_queue_stack_limits(mddev->queue, q); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as @@ -325,7 +321,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * (Note: it is very unlikely that a device with * merge_bvec_fn will be involved in multipath.) */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && + if (q->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); @@ -444,7 +440,7 @@ static int multipath_run (mddev_t *mddev) * should be freed in multipath_stop()] */ - conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); + conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); mddev->private = conf; if (!conf) { printk(KERN_ERR @@ -452,9 +448,8 @@ static int multipath_run (mddev_t *mddev) mdname(mddev)); goto out; } - memset(conf, 0, sizeof(*conf)); - conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, + conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, GFP_KERNEL); if (!conf->multipaths) { printk(KERN_ERR @@ -462,7 +457,6 @@ static int multipath_run (mddev_t *mddev) mdname(mddev)); goto out_free_conf; } - memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); conf->working_disks = 0; ITERATE_RDEV(mddev,rdev,tmp) { @@ -557,9 +551,10 @@ static int multipath_stop (mddev_t *mddev) return 0; } -static mdk_personality_t multipath_personality= +static struct mdk_personality multipath_personality = { .name = "multipath", + .level = LEVEL_MULTIPATH, .owner = THIS_MODULE, .make_request = multipath_make_request, .run = multipath_run, @@ -572,15 +567,17 @@ static mdk_personality_t multipath_personality= static int __init multipath_init (void) { - return register_md_personality (MULTIPATH, &multipath_personality); + return register_md_personality (&multipath_personality); } static void __exit multipath_exit (void) { - unregister_md_personality (MULTIPATH); + unregister_md_personality (&multipath_personality); } module_init(multipath_init); module_exit(multipath_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ +MODULE_ALIAS("md-multipath"); +MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index fece3277c2a..d03f99cf4b7 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -113,21 +113,16 @@ static int create_strip_zones (mddev_t *mddev) } printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); - conf->strip_zone = kmalloc(sizeof(struct strip_zone)* + conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); if (!conf->strip_zone) return 1; - conf->devlist = kmalloc(sizeof(mdk_rdev_t*)* + conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* conf->nr_strip_zones*mddev->raid_disks, GFP_KERNEL); if (!conf->devlist) return 1; - memset(conf->strip_zone, 0,sizeof(struct strip_zone)* - conf->nr_strip_zones); - memset(conf->devlist, 0, - sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks); - /* The first zone must contain all devices, so here we check that * there is a proper alignment of slots to devices and find them all */ @@ -280,7 +275,11 @@ static int raid0_run (mddev_t *mddev) mdk_rdev_t *rdev; struct list_head *tmp; - printk("%s: setting max_sectors to %d, segment boundary to %d\n", + if (mddev->chunk_size == 0) { + printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); + return -EINVAL; + } + printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", mdname(mddev), mddev->chunk_size >> 9, (mddev->chunk_size>>1)-1); @@ -307,9 +306,6 @@ static int raid0_run (mddev_t *mddev) printk("raid0 : conf->hash_spacing is %llu blocks.\n", (unsigned long long)conf->hash_spacing); { -#if __GNUC__ < 3 - volatile -#endif sector_t s = mddev->array_size; sector_t space = conf->hash_spacing; int round; @@ -361,7 +357,7 @@ static int raid0_run (mddev_t *mddev) * chunksize should be used in that case. */ { - int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; + int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } @@ -440,9 +436,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio) { -#if __GNUC__ < 3 - volatile -#endif sector_t x = block >> conf->preshift; sector_div(x, (u32)conf->hash_spacing); zone = conf->hash_table[x]; @@ -512,9 +505,10 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev) return; } -static mdk_personality_t raid0_personality= +static struct mdk_personality raid0_personality= { .name = "raid0", + .level = 0, .owner = THIS_MODULE, .make_request = raid0_make_request, .run = raid0_run, @@ -524,15 +518,17 @@ static mdk_personality_t raid0_personality= static int __init raid0_init (void) { - return register_md_personality (RAID0, &raid0_personality); + return register_md_personality (&raid0_personality); } static void raid0_exit (void) { - unregister_md_personality (RAID0); + unregister_md_personality (&raid0_personality); } module_init(raid0_init); module_exit(raid0_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-2"); /* RAID0 */ +MODULE_ALIAS("md-raid0"); +MODULE_ALIAS("md-level-0"); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3066c587b53..d39f584cd8b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -47,10 +47,11 @@ */ #define NR_RAID1_BIOS 256 -static mdk_personality_t raid1_personality; static void unplug_slaves(mddev_t *mddev); +static void allow_barrier(conf_t *conf); +static void lower_barrier(conf_t *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -59,10 +60,8 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) int size = offsetof(r1bio_t, bios[pi->raid_disks]); /* allocate a r1bio with room for raid_disks entries in the bios array */ - r1_bio = kmalloc(size, gfp_flags); - if (r1_bio) - memset(r1_bio, 0, size); - else + r1_bio = kzalloc(size, gfp_flags); + if (!r1_bio) unplug_slaves(pi->mddev); return r1_bio; @@ -104,15 +103,30 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) } /* * Allocate RESYNC_PAGES data pages and attach them to - * the first bio; + * the first bio. + * If this is a user-requested check/repair, allocate + * RESYNC_PAGES for each bio. */ - bio = r1_bio->bios[0]; - for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; - - bio->bi_io_vec[i].bv_page = page; + if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) + j = pi->raid_disks; + else + j = 1; + while(j--) { + bio = r1_bio->bios[j]; + for (i = 0; i < RESYNC_PAGES; i++) { + page = alloc_page(gfp_flags); + if (unlikely(!page)) + goto out_free_pages; + + bio->bi_io_vec[i].bv_page = page; + } + } + /* If not user-requests, copy the page pointers to all bios */ + if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { + for (i=0; i<RESYNC_PAGES ; i++) + for (j=1; j<pi->raid_disks; j++) + r1_bio->bios[j]->bi_io_vec[i].bv_page = + r1_bio->bios[0]->bi_io_vec[i].bv_page; } r1_bio->master_bio = NULL; @@ -120,8 +134,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - for ( ; i > 0 ; i--) - __free_page(bio->bi_io_vec[i-1].bv_page); + for (i=0; i < RESYNC_PAGES ; i++) + for (j=0 ; j < pi->raid_disks; j++) + safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); + j = -1; out_free_bio: while ( ++j < pi->raid_disks ) bio_put(r1_bio->bios[j]); @@ -132,14 +148,16 @@ out_free_bio: static void r1buf_pool_free(void *__r1_bio, void *data) { struct pool_info *pi = data; - int i; + int i,j; r1bio_t *r1bio = __r1_bio; - struct bio *bio = r1bio->bios[0]; - for (i = 0; i < RESYNC_PAGES; i++) { - __free_page(bio->bi_io_vec[i].bv_page); - bio->bi_io_vec[i].bv_page = NULL; - } + for (i = 0; i < RESYNC_PAGES; i++) + for (j = pi->raid_disks; j-- ;) { + if (j == 0 || + r1bio->bios[j]->bi_io_vec[i].bv_page != + r1bio->bios[0]->bi_io_vec[i].bv_page) + safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); + } for (i=0 ; i < pi->raid_disks; i++) bio_put(r1bio->bios[i]); @@ -152,52 +170,40 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) for (i = 0; i < conf->raid_disks; i++) { struct bio **bio = r1_bio->bios + i; - if (*bio) + if (*bio && *bio != IO_BLOCKED) bio_put(*bio); *bio = NULL; } } -static inline void free_r1bio(r1bio_t *r1_bio) +static void free_r1bio(r1bio_t *r1_bio) { - unsigned long flags; - conf_t *conf = mddev_to_conf(r1_bio->mddev); /* * Wake up any possible resync thread that waits for the device * to go idle. */ - spin_lock_irqsave(&conf->resync_lock, flags); - if (!--conf->nr_pending) { - wake_up(&conf->wait_idle); - wake_up(&conf->wait_resume); - } - spin_unlock_irqrestore(&conf->resync_lock, flags); + allow_barrier(conf); put_all_bios(conf, r1_bio); mempool_free(r1_bio, conf->r1bio_pool); } -static inline void put_buf(r1bio_t *r1_bio) +static void put_buf(r1bio_t *r1_bio) { conf_t *conf = mddev_to_conf(r1_bio->mddev); - unsigned long flags; + int i; - mempool_free(r1_bio, conf->r1buf_pool); + for (i=0; i<conf->raid_disks; i++) { + struct bio *bio = r1_bio->bios[i]; + if (bio->bi_end_io) + rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); + } - spin_lock_irqsave(&conf->resync_lock, flags); - if (!conf->barrier) - BUG(); - --conf->barrier; - wake_up(&conf->wait_resume); - wake_up(&conf->wait_idle); + mempool_free(r1_bio, conf->r1buf_pool); - if (!--conf->nr_pending) { - wake_up(&conf->wait_idle); - wake_up(&conf->wait_resume); - } - spin_unlock_irqrestore(&conf->resync_lock, flags); + lower_barrier(conf); } static void reschedule_retry(r1bio_t *r1_bio) @@ -208,8 +214,10 @@ static void reschedule_retry(r1bio_t *r1_bio) spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); + conf->nr_queued ++; spin_unlock_irqrestore(&conf->device_lock, flags); + wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); } @@ -261,9 +269,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - else + update_head_pos(mirror, r1_bio); + + if (uptodate || conf->working_disks <= 1) { /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -273,16 +281,11 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int * user-side. So if something waits for IO, then it will * wait for the 'master' bio. */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - - update_head_pos(mirror, r1_bio); + if (uptodate) + set_bit(R1BIO_Uptodate, &r1_bio->state); - /* - * we have only one bio on the read side - */ - if (uptodate) raid_end_bio_io(r1_bio); - else { + } else { /* * oops, read error: */ @@ -320,7 +323,6 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int * this branch is our 'one mirror IO has finished' event handler: */ r1_bio->bios[mirror] = NULL; - bio_put(bio); if (!uptodate) { md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); /* an I/O failed, we can't clear the bitmap */ @@ -377,10 +379,9 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int } if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { /* free extra copy of the data pages */ -/* FIXME bio has been freed!!! */ int i = bio->bi_vcnt; while (i--) - __free_page(bio->bi_io_vec[i].bv_page); + safe_put_page(bio->bi_io_vec[i].bv_page); } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, @@ -391,6 +392,9 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int raid_end_bio_io(r1_bio); } + if (r1_bio->bios[mirror]==NULL) + bio_put(bio); + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); return 0; } @@ -432,11 +436,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) new_disk = 0; for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); + r1_bio->bios[new_disk] == IO_BLOCKED || !rdev || !test_bit(In_sync, &rdev->flags) || test_bit(WriteMostly, &rdev->flags); rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { - if (rdev && test_bit(In_sync, &rdev->flags)) + if (rdev && test_bit(In_sync, &rdev->flags) && + r1_bio->bios[new_disk] != IO_BLOCKED) wonly_disk = new_disk; if (new_disk == conf->raid_disks - 1) { @@ -450,11 +456,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); + r1_bio->bios[new_disk] == IO_BLOCKED || !rdev || !test_bit(In_sync, &rdev->flags) || test_bit(WriteMostly, &rdev->flags); rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { - if (rdev && test_bit(In_sync, &rdev->flags)) + if (rdev && test_bit(In_sync, &rdev->flags) && + r1_bio->bios[new_disk] != IO_BLOCKED) wonly_disk = new_disk; if (new_disk <= 0) @@ -491,7 +499,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) rdev = rcu_dereference(conf->mirrors[disk].rdev); - if (!rdev || + if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || !test_bit(In_sync, &rdev->flags) || test_bit(WriteMostly, &rdev->flags)) continue; @@ -519,7 +527,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* cannot risk returning a device that failed * before we inc'ed nr_pending */ - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, conf->mddev); goto retry; } conf->next_seq_sect = this_sector + sectors; @@ -592,42 +600,119 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } -/* - * Throttle resync depth, so that we can both get proper overlapping of - * requests, but are still able to handle normal requests quickly. +/* Barriers.... + * Sometimes we need to suspend IO while we do something else, + * either some resync/recovery, or reconfigure the array. + * To do this we raise a 'barrier'. + * The 'barrier' is a counter that can be raised multiple times + * to count how many activities are happening which preclude + * normal IO. + * We can only raise the barrier if there is no pending IO. + * i.e. if nr_pending == 0. + * We choose only to raise the barrier if no-one is waiting for the + * barrier to go down. This means that as soon as an IO request + * is ready, no other operations which require a barrier will start + * until the IO request has had a chance. + * + * So: regular IO calls 'wait_barrier'. When that returns there + * is no backgroup IO happening, It must arrange to call + * allow_barrier when it has finished its IO. + * backgroup IO calls must call raise_barrier. Once that returns + * there is no normal IO happeing. It must arrange to call + * lower_barrier when the particular background IO completes. */ #define RESYNC_DEPTH 32 -static void device_barrier(conf_t *conf, sector_t sect) +static void raise_barrier(conf_t *conf) { spin_lock_irq(&conf->resync_lock); - wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), - conf->resync_lock, raid1_unplug(conf->mddev->queue)); - - if (!conf->barrier++) { - wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, raid1_unplug(conf->mddev->queue)); - if (conf->nr_pending) - BUG(); + + /* Wait until no block IO is waiting */ + wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, + conf->resync_lock, + raid1_unplug(conf->mddev->queue)); + + /* block any new IO from starting */ + conf->barrier++; + + /* No wait for all pending IO to complete */ + wait_event_lock_irq(conf->wait_barrier, + !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + conf->resync_lock, + raid1_unplug(conf->mddev->queue)); + + spin_unlock_irq(&conf->resync_lock); +} + +static void lower_barrier(conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); + conf->barrier--; + spin_unlock_irqrestore(&conf->resync_lock, flags); + wake_up(&conf->wait_barrier); +} + +static void wait_barrier(conf_t *conf) +{ + spin_lock_irq(&conf->resync_lock); + if (conf->barrier) { + conf->nr_waiting++; + wait_event_lock_irq(conf->wait_barrier, !conf->barrier, + conf->resync_lock, + raid1_unplug(conf->mddev->queue)); + conf->nr_waiting--; } - wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, - conf->resync_lock, raid1_unplug(conf->mddev->queue)); - conf->next_resync = sect; + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); } +static void allow_barrier(conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); + conf->nr_pending--; + spin_unlock_irqrestore(&conf->resync_lock, flags); + wake_up(&conf->wait_barrier); +} + +static void freeze_array(conf_t *conf) +{ + /* stop syncio and normal IO and wait for everything to + * go quite. + * We increment barrier and nr_waiting, and then + * wait until barrier+nr_pending match nr_queued+2 + */ + spin_lock_irq(&conf->resync_lock); + conf->barrier++; + conf->nr_waiting++; + wait_event_lock_irq(conf->wait_barrier, + conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->resync_lock, + raid1_unplug(conf->mddev->queue)); + spin_unlock_irq(&conf->resync_lock); +} +static void unfreeze_array(conf_t *conf) +{ + /* reverse the effect of the freeze */ + spin_lock_irq(&conf->resync_lock); + conf->barrier--; + conf->nr_waiting--; + wake_up(&conf->wait_barrier); + spin_unlock_irq(&conf->resync_lock); +} + + /* duplicate the data pages for behind I/O */ static struct page **alloc_behind_pages(struct bio *bio) { int i; struct bio_vec *bvec; - struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), + struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), GFP_NOIO); if (unlikely(!pages)) goto do_sync_io; - memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); - bio_for_each_segment(bvec, bio, i) { pages[i] = alloc_page(GFP_NOIO); if (unlikely(!pages[i])) @@ -643,7 +728,7 @@ static struct page **alloc_behind_pages(struct bio *bio) do_sync_io: if (pages) for (i = 0; i < bio->bi_vcnt && pages[i]; i++) - __free_page(pages[i]); + put_page(pages[i]); kfree(pages); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); return NULL; @@ -677,10 +762,7 @@ static int make_request(request_queue_t *q, struct bio * bio) */ md_write_start(mddev, bio); /* wait on superblock update early */ - spin_lock_irq(&conf->resync_lock); - wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); - conf->nr_pending++; - spin_unlock_irq(&conf->resync_lock); + wait_barrier(conf); disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); @@ -748,7 +830,7 @@ static int make_request(request_queue_t *q, struct bio * bio) !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (test_bit(Faulty, &rdev->flags)) { - atomic_dec(&rdev->nr_pending); + rdev_dec_pending(rdev, mddev); r1_bio->bios[i] = NULL; } else r1_bio->bios[i] = bio; @@ -908,13 +990,8 @@ static void print_conf(conf_t *conf) static void close_sync(conf_t *conf) { - spin_lock_irq(&conf->resync_lock); - wait_event_lock_irq(conf->wait_resume, !conf->barrier, - conf->resync_lock, raid1_unplug(conf->mddev->queue)); - spin_unlock_irq(&conf->resync_lock); - - if (conf->barrier) BUG(); - if (waitqueue_active(&conf->wait_idle)) BUG(); + wait_barrier(conf); + allow_barrier(conf); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; @@ -1014,28 +1091,27 @@ abort: static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - conf_t *conf = mddev_to_conf(r1_bio->mddev); + int i; if (bio->bi_size) return 1; - if (r1_bio->bios[r1_bio->read_disk] != bio) - BUG(); - update_head_pos(r1_bio->read_disk, r1_bio); + for (i=r1_bio->mddev->raid_disks; i--; ) + if (r1_bio->bios[i] == bio) + break; + BUG_ON(i < 0); + update_head_pos(i, r1_bio); /* * we have read a block, now it needs to be re-written, * or re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (!uptodate) { - md_error(r1_bio->mddev, - conf->mirrors[r1_bio->read_disk].rdev); - } else + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R1BIO_Uptodate, &r1_bio->state); - rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); - reschedule_retry(r1_bio); + + if (atomic_dec_and_test(&r1_bio->remaining)) + reschedule_retry(r1_bio); return 0; } @@ -1065,7 +1141,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) md_done_sync(mddev, r1_bio->sectors, uptodate); put_buf(r1_bio); } - rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); return 0; } @@ -1078,34 +1153,173 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) bio = r1_bio->bios[r1_bio->read_disk]; -/* - if (r1_bio->sector == 0) printk("First sync write startss\n"); -*/ - /* - * schedule writes - */ + + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* We have read all readable devices. If we haven't + * got the block, then there is no hope left. + * If we have, then we want to do a comparison + * and skip the write if everything is the same. + * If any blocks failed to read, then we need to + * attempt an over-write + */ + int primary; + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { + for (i=0; i<mddev->raid_disks; i++) + if (r1_bio->bios[i]->bi_end_io == end_sync_read) + md_error(mddev, conf->mirrors[i].rdev); + + md_done_sync(mddev, r1_bio->sectors, 1); + put_buf(r1_bio); + return; + } + for (primary=0; primary<mddev->raid_disks; primary++) + if (r1_bio->bios[primary]->bi_end_io == end_sync_read && + test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + r1_bio->bios[primary]->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[primary].rdev, mddev); + break; + } + r1_bio->read_disk = primary; + for (i=0; i<mddev->raid_disks; i++) + if (r1_bio->bios[i]->bi_end_io == end_sync_read && + test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) { + int j; + int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; + for (j = vcnt; j-- ; ) + if (memcmp(page_address(pbio->bi_io_vec[j].bv_page), + page_address(sbio->bi_io_vec[j].bv_page), + PAGE_SIZE)) + break; + if (j >= 0) + mddev->resync_mismatches += r1_bio->sectors; + if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + sbio->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[i].rdev, mddev); + } else { + /* fixup the bio for reuse */ + sbio->bi_vcnt = vcnt; + sbio->bi_size = r1_bio->sectors << 9; + sbio->bi_idx = 0; + sbio->bi_phys_segments = 0; + sbio->bi_hw_segments = 0; + sbio->bi_hw_front_size = 0; + sbio->bi_hw_back_size = 0; + sbio->bi_flags &= ~(BIO_POOL_MASK - 1); + sbio->bi_flags |= 1 << BIO_UPTODATE; + sbio->bi_next = NULL; + sbio->bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + } + } + } if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* - * There is no point trying a read-for-reconstruct as - * reconstruct is about to be aborted + /* ouch - failed to read all of that. + * Try some synchronous reads of other devices to get + * good data, much like with normal read errors. Only + * read into the pages we already have so they we don't + * need to re-issue the read request. + * We don't need to freeze the array, because being in an + * active sync request, there is no normal IO, and + * no overlapping syncs. */ - char b[BDEVNAME_SIZE]; - printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" - " for block %llu\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)r1_bio->sector); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); - return; + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int idx = 0; + + while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + mdk_rdev_t *rdev; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + do { + if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ)) { + success = 1; + break; + } + } + d++; + if (d == conf->raid_disks) + d = 0; + } while (!success && d != r1_bio->read_disk); + + if (success) { + int start = d; + /* write it back and re-read */ + set_bit(R1BIO_Uptodate, &r1_bio->state); + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + bio->bi_io_vec[idx].bv_page, + WRITE) == 0) + md_error(mddev, rdev); + } + d = start; + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + bio->bi_io_vec[idx].bv_page, + READ) == 0) + md_error(mddev, rdev); + } + } else { + char b[BDEVNAME_SIZE]; + /* Cannot read from anywhere, array is toast */ + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" + " for block %llu\n", + bdevname(bio->bi_bdev,b), + (unsigned long long)r1_bio->sector); + md_done_sync(mddev, r1_bio->sectors, 0); + put_buf(r1_bio); + return; + } + sectors -= s; + sect += s; + idx ++; + } } + /* + * schedule writes + */ atomic_set(&r1_bio->remaining, 1); for (i = 0; i < disks ; i++) { wbio = r1_bio->bios[i]; - if (wbio->bi_end_io != end_sync_write) + if (wbio->bi_end_io == NULL || + (wbio->bi_end_io == end_sync_read && + (i == r1_bio->read_disk || + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) continue; - atomic_inc(&conf->mirrors[i].rdev->nr_pending); + wbio->bi_rw = WRITE; + wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); @@ -1166,6 +1380,7 @@ static void raid1d(mddev_t *mddev) break; r1_bio = list_entry(head->prev, r1bio_t, retry_list); list_del(head->prev); + conf->nr_queued--; spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; @@ -1205,6 +1420,86 @@ static void raid1d(mddev_t *mddev) } } else { int disk; + + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen + */ + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + freeze_array(conf); + if (mddev->ro == 0) while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + d++; + if (d == conf->raid_disks) + d = 0; + } + } while (!success && d != r1_bio->read_disk); + + if (success) { + /* write it back and re-read */ + int start = d; + while (d != r1_bio->read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + atomic_add(s, &rdev->corrected_errors); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + d = start; + while (d != r1_bio->read_disk) { + if (d==0) + d = conf->raid_disks; + d--; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + } else { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + break; + } + sectors -= s; + sect += s; + } + + unfreeze_array(conf); + bio = r1_bio->bios[r1_bio->read_disk]; if ((disk=read_balance(conf, r1_bio)) == -1) { printk(KERN_ALERT "raid1: %s: unrecoverable I/O" @@ -1213,7 +1508,8 @@ static void raid1d(mddev_t *mddev) (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { - r1_bio->bios[r1_bio->read_disk] = NULL; + r1_bio->bios[r1_bio->read_disk] = + mddev->ro ? IO_BLOCKED : NULL; r1_bio->read_disk = disk; bio_put(bio); bio = bio_clone(r1_bio->master_bio, GFP_NOIO); @@ -1268,14 +1564,13 @@ static int init_resync(conf_t *conf) static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { conf_t *conf = mddev_to_conf(mddev); - mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *bio; sector_t max_sector, nr_sectors; - int disk; + int disk = -1; int i; - int wonly; - int write_targets = 0; + int wonly = -1; + int write_targets = 0, read_targets = 0; int sync_blocks; int still_degraded = 0; @@ -1316,55 +1611,35 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return sync_blocks; } /* - * If there is non-resync activity waiting for us then - * put in a delay to throttle resync. + * If there is non-resync activity waiting for a turn, + * and resync is going fast enough, + * then let it though before starting on this new sync request. */ - if (!go_faster && waitqueue_active(&conf->wait_resume)) + if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); - device_barrier(conf, sector_nr + RESYNC_SECTORS); - - /* - * If reconstructing, and >1 working disc, - * could dedicate one to rebuild and others to - * service read requests .. - */ - disk = conf->last_used; - /* make sure disk is operational */ - wonly = disk; - while (conf->mirrors[disk].rdev == NULL || - !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) || - test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) - ) { - if (conf->mirrors[disk].rdev && - test_bit(In_sync, &conf->mirrors[disk].rdev->flags)) - wonly = disk; - if (disk <= 0) - disk = conf->raid_disks; - disk--; - if (disk == conf->last_used) { - disk = wonly; - break; - } - } - conf->last_used = disk; - atomic_inc(&conf->mirrors[disk].rdev->nr_pending); + raise_barrier(conf); - mirror = conf->mirrors + disk; + conf->next_resync = sector_nr; r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); - - spin_lock_irq(&conf->resync_lock); - conf->nr_pending++; - spin_unlock_irq(&conf->resync_lock); + rcu_read_lock(); + /* + * If we get a correctably read error during resync or recovery, + * we might want to read from a different device. So we + * flag all drives that could conceivably be read from for READ, + * and any others (which will be non-In_sync devices) for WRITE. + * If a read fails, we try reading from something else for which READ + * is OK. + */ r1_bio->mddev = mddev; r1_bio->sector = sector_nr; r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); - r1_bio->read_disk = disk; for (i=0; i < conf->raid_disks; i++) { + mdk_rdev_t *rdev; bio = r1_bio->bios[i]; /* take from bio_init */ @@ -1379,35 +1654,49 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_end_io = NULL; bio->bi_private = NULL; - if (i == disk) { - bio->bi_rw = READ; - bio->bi_end_io = end_sync_read; - } else if (conf->mirrors[i].rdev == NULL || - test_bit(Faulty, &conf->mirrors[i].rdev->flags)) { + rdev = rcu_dereference(conf->mirrors[i].rdev); + if (rdev == NULL || + test_bit(Faulty, &rdev->flags)) { still_degraded = 1; continue; - } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) || - sector_nr + RESYNC_SECTORS > mddev->recovery_cp || - test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_rw = WRITE; bio->bi_end_io = end_sync_write; write_targets ++; - } else - /* no need to read or write here */ - continue; - bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; + } else { + /* may need to read from here */ + bio->bi_rw = READ; + bio->bi_end_io = end_sync_read; + if (test_bit(WriteMostly, &rdev->flags)) { + if (wonly < 0) + wonly = i; + } else { + if (disk < 0) + disk = i; + } + read_targets++; + } + atomic_inc(&rdev->nr_pending); + bio->bi_sector = sector_nr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; bio->bi_private = r1_bio; } + rcu_read_unlock(); + if (disk < 0) + disk = wonly; + r1_bio->read_disk = disk; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) + /* extra read targets are also write targets */ + write_targets += read_targets-1; - if (write_targets == 0) { + if (write_targets == 0 || read_targets == 0) { /* There is nowhere to write, so all non-sync * drives must be failed - so we are finished */ sector_t rv = max_sector - sector_nr; *skipped = 1; put_buf(r1_bio); - rdev_dec_pending(conf->mirrors[disk].rdev, mddev); return rv; } @@ -1435,10 +1724,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0 ; i < conf->raid_disks; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io) { - page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; + page = bio->bi_io_vec[bio->bi_vcnt].bv_page; if (bio_add_page(bio, page, len, 0) == 0) { /* stop here */ - r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; + bio->bi_io_vec[bio->bi_vcnt].bv_page = page; while (i > 0) { i--; bio = r1_bio->bios[i]; @@ -1458,12 +1747,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sync_blocks -= (len>>9); } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); bio_full: - bio = r1_bio->bios[disk]; r1_bio->sectors = nr_sectors; - md_sync_acct(mirror->rdev->bdev, nr_sectors); + /* For a user-requested sync, we read all readable devices and do a + * compare + */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + atomic_set(&r1_bio->remaining, read_targets); + for (i=0; i<conf->raid_disks; i++) { + bio = r1_bio->bios[i]; + if (bio->bi_end_io == end_sync_read) { + md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); + generic_make_request(bio); + } + } + } else { + atomic_set(&r1_bio->remaining, 1); + bio = r1_bio->bios[r1_bio->read_disk]; + md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, + nr_sectors); + generic_make_request(bio); - generic_make_request(bio); + } return nr_sectors; } @@ -1486,18 +1791,19 @@ static int run(mddev_t *mddev) * bookkeeping area. [whatever we allocate in run(), * should be freed in stop()] */ - conf = kmalloc(sizeof(conf_t), GFP_KERNEL); + conf = kzalloc(sizeof(conf_t), GFP_KERNEL); mddev->private = conf; if (!conf) goto out_no_mem; - memset(conf, 0, sizeof(*conf)); - conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, + conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, GFP_KERNEL); if (!conf->mirrors) goto out_no_mem; - memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); + conf->tmppage = alloc_page(GFP_KERNEL); + if (!conf->tmppage) + goto out_no_mem; conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); if (!conf->poolinfo) @@ -1541,8 +1847,7 @@ static int run(mddev_t *mddev) mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); - init_waitqueue_head(&conf->wait_idle); - init_waitqueue_head(&conf->wait_resume); + init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); @@ -1582,7 +1887,6 @@ static int run(mddev_t *mddev) mdname(mddev)); goto out_free_conf; } - if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; printk(KERN_INFO "raid1: raid set %s active with %d out of %d mirrors\n", @@ -1607,6 +1911,7 @@ out_free_conf: if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); + safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); mddev->private = NULL; @@ -1705,19 +2010,14 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) kfree(newpoolinfo); return -ENOMEM; } - newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); + newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); if (!newmirrors) { kfree(newpoolinfo); mempool_destroy(newpool); return -ENOMEM; } - memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks); - spin_lock_irq(&conf->resync_lock); - conf->barrier++; - wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, raid1_unplug(mddev->queue)); - spin_unlock_irq(&conf->resync_lock); + raise_barrier(conf); /* ok, everything is stopped */ oldpool = conf->r1bio_pool; @@ -1737,12 +2037,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) conf->raid_disks = mddev->raid_disks = raid_disks; conf->last_used = 0; /* just make sure it is in-range */ - spin_lock_irq(&conf->resync_lock); - conf->barrier--; - spin_unlock_irq(&conf->resync_lock); - wake_up(&conf->wait_resume); - wake_up(&conf->wait_idle); - + lower_barrier(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -1757,33 +2052,19 @@ static void raid1_quiesce(mddev_t *mddev, int state) switch(state) { case 1: - spin_lock_irq(&conf->resync_lock); - conf->barrier++; - wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, raid1_unplug(mddev->queue)); - spin_unlock_irq(&conf->resync_lock); + raise_barrier(conf); break; case 0: - spin_lock_irq(&conf->resync_lock); - conf->barrier--; - spin_unlock_irq(&conf->resync_lock); - wake_up(&conf->wait_resume); - wake_up(&conf->wait_idle); + lower_barrier(conf); break; } - if (mddev->thread) { - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - else - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - md_wakeup_thread(mddev->thread); - } } -static mdk_personality_t raid1_personality = +static struct mdk_personality raid1_personality = { .name = "raid1", + .level = 1, .owner = THIS_MODULE, .make_request = make_request, .run = run, @@ -1801,15 +2082,17 @@ static mdk_personality_t raid1_personality = static int __init raid_init(void) { - return register_md_personality(RAID1, &raid1_personality); + return register_md_personality(&raid1_personality); } static void raid_exit(void) { - unregister_md_personality(RAID1); + unregister_md_personality(&raid1_personality); } module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ +MODULE_ALIAS("md-raid1"); +MODULE_ALIAS("md-level-1"); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 713dc9c2c73..9130d051b47 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,7 +18,9 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include "dm-bio-list.h" #include <linux/raid/raid10.h> +#include <linux/raid/bitmap.h> /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -47,6 +49,9 @@ static void unplug_slaves(mddev_t *mddev); +static void allow_barrier(conf_t *conf); +static void lower_barrier(conf_t *conf); + static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { conf_t *conf = data; @@ -54,10 +59,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) int size = offsetof(struct r10bio_s, devs[conf->copies]); /* allocate a r10bio with room for raid_disks entries in the bios array */ - r10_bio = kmalloc(size, gfp_flags); - if (r10_bio) - memset(r10_bio, 0, size); - else + r10_bio = kzalloc(size, gfp_flags); + if (!r10_bio) unplug_slaves(conf->mddev); return r10_bio; @@ -129,10 +132,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) out_free_pages: for ( ; i > 0 ; i--) - __free_page(bio->bi_io_vec[i-1].bv_page); + safe_put_page(bio->bi_io_vec[i-1].bv_page); while (j--) for (i = 0; i < RESYNC_PAGES ; i++) - __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); + safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); j = -1; out_free_bio: while ( ++j < nalloc ) @@ -152,7 +155,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data) struct bio *bio = r10bio->devs[j].bio; if (bio) { for (i = 0; i < RESYNC_PAGES; i++) { - __free_page(bio->bi_io_vec[i].bv_page); + safe_put_page(bio->bi_io_vec[i].bv_page); bio->bi_io_vec[i].bv_page = NULL; } bio_put(bio); @@ -167,52 +170,33 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) for (i = 0; i < conf->copies; i++) { struct bio **bio = & r10_bio->devs[i].bio; - if (*bio) + if (*bio && *bio != IO_BLOCKED) bio_put(*bio); *bio = NULL; } } -static inline void free_r10bio(r10bio_t *r10_bio) +static void free_r10bio(r10bio_t *r10_bio) { - unsigned long flags; - conf_t *conf = mddev_to_conf(r10_bio->mddev); /* * Wake up any possible resync thread that waits for the device * to go idle. */ - spin_lock_irqsave(&conf->resync_lock, flags); - if (!--conf->nr_pending) { - wake_up(&conf->wait_idle); - wake_up(&conf->wait_resume); - } - spin_unlock_irqrestore(&conf->resync_lock, flags); + allow_barrier(conf); put_all_bios(conf, r10_bio); mempool_free(r10_bio, conf->r10bio_pool); } -static inline void put_buf(r10bio_t *r10_bio) +static void put_buf(r10bio_t *r10_bio) { conf_t *conf = mddev_to_conf(r10_bio->mddev); - unsigned long flags; mempool_free(r10_bio, conf->r10buf_pool); - spin_lock_irqsave(&conf->resync_lock, flags); - if (!conf->barrier) - BUG(); - --conf->barrier; - wake_up(&conf->wait_resume); - wake_up(&conf->wait_idle); - - if (!--conf->nr_pending) { - wake_up(&conf->wait_idle); - wake_up(&conf->wait_resume); - } - spin_unlock_irqrestore(&conf->resync_lock, flags); + lower_barrier(conf); } static void reschedule_retry(r10bio_t *r10_bio) @@ -223,6 +207,7 @@ static void reschedule_retry(r10bio_t *r10_bio) spin_lock_irqsave(&conf->device_lock, flags); list_add(&r10_bio->retry_list, &conf->retry_list); + conf->nr_queued ++; spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); @@ -268,9 +253,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) - md_error(r10_bio->mddev, conf->mirrors[dev].rdev); - else + update_head_pos(slot, r10_bio); + + if (uptodate) { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code to the higher @@ -281,15 +266,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int * wait for the 'master' bio. */ set_bit(R10BIO_Uptodate, &r10_bio->state); - - update_head_pos(slot, r10_bio); - - /* - * we have only one bio on the read side - */ - if (uptodate) raid_end_bio_io(r10_bio); - else { + } else { /* * oops, read error: */ @@ -322,9 +300,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) + if (!uptodate) { md_error(r10_bio->mddev, conf->mirrors[dev].rdev); - else + /* an I/O failed, we can't clear the bitmap */ + set_bit(R10BIO_Degraded, &r10_bio->state); + } else /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -344,6 +324,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in * already. */ if (atomic_dec_and_test(&r10_bio->remaining)) { + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); md_write_end(r10_bio->mddev); raid_end_bio_io(r10_bio); } @@ -502,8 +487,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) rcu_read_lock(); /* * Check if we can balance. We can balance on the whole - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. + * device if no resync is going on (recovery is ok), or below + * the resync window. We take the first readable disk when + * above the resync window. */ if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { @@ -512,6 +498,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) disk = r10_bio->devs[slot].devnum; while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || + r10_bio->devs[slot].bio == IO_BLOCKED || !test_bit(In_sync, &rdev->flags)) { slot++; if (slot == conf->copies) { @@ -529,6 +516,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) slot = 0; disk = r10_bio->devs[slot].devnum; while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || + r10_bio->devs[slot].bio == IO_BLOCKED || !test_bit(In_sync, &rdev->flags)) { slot ++; if (slot == conf->copies) { @@ -549,6 +537,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || + r10_bio->devs[nslot].bio == IO_BLOCKED || !test_bit(In_sync, &rdev->flags)) continue; @@ -607,7 +596,10 @@ static void unplug_slaves(mddev_t *mddev) static void raid10_unplug(request_queue_t *q) { + mddev_t *mddev = q->queuedata; + unplug_slaves(q->queuedata); + md_wakeup_thread(mddev->thread); } static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, @@ -640,27 +632,107 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } -/* - * Throttle resync depth, so that we can both get proper overlapping of - * requests, but are still able to handle normal requests quickly. +/* Barriers.... + * Sometimes we need to suspend IO while we do something else, + * either some resync/recovery, or reconfigure the array. + * To do this we raise a 'barrier'. + * The 'barrier' is a counter that can be raised multiple times + * to count how many activities are happening which preclude + * normal IO. + * We can only raise the barrier if there is no pending IO. + * i.e. if nr_pending == 0. + * We choose only to raise the barrier if no-one is waiting for the + * barrier to go down. This means that as soon as an IO request + * is ready, no other operations which require a barrier will start + * until the IO request has had a chance. + * + * So: regular IO calls 'wait_barrier'. When that returns there + * is no backgroup IO happening, It must arrange to call + * allow_barrier when it has finished its IO. + * backgroup IO calls must call raise_barrier. Once that returns + * there is no normal IO happeing. It must arrange to call + * lower_barrier when the particular background IO completes. */ #define RESYNC_DEPTH 32 -static void device_barrier(conf_t *conf, sector_t sect) +static void raise_barrier(conf_t *conf, int force) +{ + BUG_ON(force && !conf->barrier); + spin_lock_irq(&conf->resync_lock); + + /* Wait until no block IO is waiting (unless 'force') */ + wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, + conf->resync_lock, + raid10_unplug(conf->mddev->queue)); + + /* block any new IO from starting */ + conf->barrier++; + + /* No wait for all pending IO to complete */ + wait_event_lock_irq(conf->wait_barrier, + !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + conf->resync_lock, + raid10_unplug(conf->mddev->queue)); + + spin_unlock_irq(&conf->resync_lock); +} + +static void lower_barrier(conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); + conf->barrier--; + spin_unlock_irqrestore(&conf->resync_lock, flags); + wake_up(&conf->wait_barrier); +} + +static void wait_barrier(conf_t *conf) { spin_lock_irq(&conf->resync_lock); - wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), - conf->resync_lock, unplug_slaves(conf->mddev)); - - if (!conf->barrier++) { - wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, unplug_slaves(conf->mddev)); - if (conf->nr_pending) - BUG(); + if (conf->barrier) { + conf->nr_waiting++; + wait_event_lock_irq(conf->wait_barrier, !conf->barrier, + conf->resync_lock, + raid10_unplug(conf->mddev->queue)); + conf->nr_waiting--; } - wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, - conf->resync_lock, unplug_slaves(conf->mddev)); - conf->next_resync = sect; + conf->nr_pending++; + spin_unlock_irq(&conf->resync_lock); +} + +static void allow_barrier(conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); + conf->nr_pending--; + spin_unlock_irqrestore(&conf->resync_lock, flags); + wake_up(&conf->wait_barrier); +} + +static void freeze_array(conf_t *conf) +{ + /* stop syncio and normal IO and wait for everything to + * go quiet. + * We increment barrier and nr_waiting, and then + * wait until barrier+nr_pending match nr_queued+2 + */ + spin_lock_irq(&conf->resync_lock); + conf->barrier++; + conf->nr_waiting++; + wait_event_lock_irq(conf->wait_barrier, + conf->barrier+conf->nr_pending == conf->nr_queued+2, + conf->resync_lock, + raid10_unplug(conf->mddev->queue)); + spin_unlock_irq(&conf->resync_lock); +} + +static void unfreeze_array(conf_t *conf) +{ + /* reverse the effect of the freeze */ + spin_lock_irq(&conf->resync_lock); + conf->barrier--; + conf->nr_waiting--; + wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); } @@ -674,6 +746,8 @@ static int make_request(request_queue_t *q, struct bio * bio) int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); + struct bio_list bl; + unsigned long flags; if (unlikely(bio_barrier(bio))) { bio_endio(bio, bio->bi_size, -EOPNOTSUPP); @@ -719,10 +793,7 @@ static int make_request(request_queue_t *q, struct bio * bio) * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. */ - spin_lock_irq(&conf->resync_lock); - wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); - conf->nr_pending++; - spin_unlock_irq(&conf->resync_lock); + wait_barrier(conf); disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); @@ -734,6 +805,7 @@ static int make_request(request_queue_t *q, struct bio * bio) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_sector; + r10_bio->state = 0; if (rw == READ) { /* @@ -778,13 +850,16 @@ static int make_request(request_queue_t *q, struct bio * bio) !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r10_bio->devs[i].bio = bio; - } else + } else { r10_bio->devs[i].bio = NULL; + set_bit(R10BIO_Degraded, &r10_bio->state); + } } rcu_read_unlock(); - atomic_set(&r10_bio->remaining, 1); + atomic_set(&r10_bio->remaining, 0); + bio_list_init(&bl); for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; @@ -802,13 +877,14 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); - generic_make_request(mbio); + bio_list_add(&bl, mbio); } - if (atomic_dec_and_test(&r10_bio->remaining)) { - md_write_end(mddev); - raid_end_bio_io(r10_bio); - } + bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_merge(&conf->pending_bio_list, &bl); + blk_plug_device(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); return 0; } @@ -897,13 +973,8 @@ static void print_conf(conf_t *conf) static void close_sync(conf_t *conf) { - spin_lock_irq(&conf->resync_lock); - wait_event_lock_irq(conf->wait_resume, !conf->barrier, - conf->resync_lock, unplug_slaves(conf->mddev)); - spin_unlock_irq(&conf->resync_lock); - - if (conf->barrier) BUG(); - if (waitqueue_active(&conf->wait_idle)) BUG(); + wait_barrier(conf); + allow_barrier(conf); mempool_destroy(conf->r10buf_pool); conf->r10buf_pool = NULL; @@ -971,7 +1042,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if (!enough(conf)) return 0; - for (mirror=0; mirror < mddev->raid_disks; mirror++) + if (rdev->saved_raid_disk >= 0 && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; + else + mirror = 0; + for ( ; mirror < mddev->raid_disks; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, @@ -987,6 +1063,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; found = 1; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } @@ -1027,7 +1105,6 @@ abort: static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); conf_t *conf = mddev_to_conf(r10_bio->mddev); int i,d; @@ -1042,9 +1119,16 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) BUG(); update_head_pos(i, r10_bio); d = r10_bio->devs[i].devnum; - if (!uptodate) - md_error(r10_bio->mddev, - conf->mirrors[d].rdev); + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + else { + atomic_add(r10_bio->sectors, + &conf->mirrors[d].rdev->corrected_errors); + if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + md_error(r10_bio->mddev, + conf->mirrors[d].rdev); + } /* for reconstruct, we always reschedule after a read. * for resync, only after all reads @@ -1132,23 +1216,32 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) fbio = r10_bio->devs[i].bio; /* now find blocks with errors */ - for (i=first+1 ; i < conf->copies ; i++) { - int vcnt, j, d; + for (i=0 ; i < conf->copies ; i++) { + int j, d; + int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); - if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) - continue; - /* We know that the bi_io_vec layout is the same for - * both 'first' and 'i', so we just compare them. - * All vec entries are PAGE_SIZE; - */ tbio = r10_bio->devs[i].bio; - vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); - for (j = 0; j < vcnt; j++) - if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), - page_address(tbio->bi_io_vec[j].bv_page), - PAGE_SIZE)) - break; - if (j == vcnt) + + if (tbio->bi_end_io != end_sync_read) + continue; + if (i == first) + continue; + if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) { + /* We know that the bi_io_vec layout is the same for + * both 'first' and 'i', so we just compare them. + * All vec entries are PAGE_SIZE; + */ + for (j = 0; j < vcnt; j++) + if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), + page_address(tbio->bi_io_vec[j].bv_page), + PAGE_SIZE)) + break; + if (j == vcnt) + continue; + mddev->resync_mismatches += r10_bio->sectors; + } + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + /* Don't fix anything. */ continue; /* Ok, we need to write this bio * First we need to fixup bv_offset, bv_len and @@ -1227,7 +1320,10 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); - generic_make_request(wbio); + if (test_bit(R10BIO_Uptodate, &r10_bio->state)) + generic_make_request(wbio); + else + bio_endio(wbio, wbio->bi_size, -EIO); } @@ -1254,10 +1350,31 @@ static void raid10d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; spin_lock_irqsave(&conf->device_lock, flags); + + if (conf->pending_bio_list.head) { + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); + /* flush any pending bitmap writes to disk before proceeding w/ I/O */ + if (bitmap_unplug(mddev->bitmap) != 0) + printk("%s: bitmap file write failed!\n", mdname(mddev)); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + unplug = 1; + + continue; + } + if (list_empty(head)) break; r10_bio = list_entry(head->prev, r10bio_t, retry_list); list_del(head->prev); + conf->nr_queued--; spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r10_bio->mddev; @@ -1270,8 +1387,96 @@ static void raid10d(mddev_t *mddev) unplug = 1; } else { int mirror; + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen. + */ + int sect = 0; /* Offset from r10_bio->sector */ + int sectors = r10_bio->sectors; + freeze_array(conf); + if (mddev->ro == 0) while(sectors) { + int s = sectors; + int sl = r10_bio->read_slot; + int success = 0; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + do { + int d = r10_bio->devs[sl].devnum; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags) && + sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, + conf->tmppage, READ)) + success = 1; + else { + sl++; + if (sl == conf->copies) + sl = 0; + } + } while (!success && sl != r10_bio->read_slot); + + if (success) { + int start = sl; + /* write it back and re-read */ + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = conf->mirrors[d].rdev; + atomic_add(s, &rdev->corrected_errors); + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, WRITE) == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + sl = start; + while (sl != r10_bio->read_slot) { + int d; + if (sl==0) + sl = conf->copies; + sl--; + d = r10_bio->devs[sl].devnum; + rdev = conf->mirrors[d].rdev; + if (rdev && + test_bit(In_sync, &rdev->flags)) { + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, + s<<9, conf->tmppage, READ) == 0) + /* Well, this device is dead */ + md_error(mddev, rdev); + } + } + } else { + /* Cannot read from anywhere -- bye bye array */ + md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev); + break; + } + sectors -= s; + sect += s; + } + + unfreeze_array(conf); + bio = r10_bio->devs[r10_bio->read_slot].bio; - r10_bio->devs[r10_bio->read_slot].bio = NULL; + r10_bio->devs[r10_bio->read_slot].bio = + mddev->ro ? IO_BLOCKED : NULL; bio_put(bio); mirror = read_balance(conf, r10_bio); if (mirror == -1) { @@ -1360,6 +1565,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t max_sector, nr_sectors; int disk; int i; + int max_sync; + int sync_blocks; sector_t sectors_skipped = 0; int chunks_skipped = 0; @@ -1373,6 +1580,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { + /* If we aborted, we need to abort the + * sync on the 'current' bitmap chucks (there can + * be several when recovering multiple devices). + * as we may have started syncing it but not finished. + * We can find the current address in + * mddev->curr_resync, but for recovery, + * we need to convert that to several + * virtual addresses. + */ + if (mddev->curr_resync < max_sector) { /* aborted */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else for (i=0; i<conf->raid_disks; i++) { + sector_t sect = + raid10_find_virt(conf, mddev->curr_resync, i); + bitmap_end_sync(mddev->bitmap, sect, + &sync_blocks, 1); + } + } else /* completed sync */ + conf->fullsync = 0; + + bitmap_close_sync(mddev->bitmap); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -1395,9 +1625,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i * If there is non-resync activity waiting for us then * put in a delay to throttle resync. */ - if (!go_faster && waitqueue_active(&conf->wait_resume)) + if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); - device_barrier(conf, sector_nr + RESYNC_SECTORS); /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that @@ -1414,6 +1643,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i * end_sync_write if we will want to write. */ + max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* recovery... the complicated one */ int i, j, k; @@ -1422,14 +1652,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i for (i=0 ; i<conf->raid_disks; i++) if (conf->mirrors[i].rdev && !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { + int still_degraded = 0; /* want to reconstruct this device */ r10bio_t *rb2 = r10_bio; + sector_t sect = raid10_find_virt(conf, sector_nr, i); + int must_sync; + /* Unless we are doing a full sync, we only need + * to recover the block if it is set in the bitmap + */ + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, 1); + if (sync_blocks < max_sync) + max_sync = sync_blocks; + if (!must_sync && + !conf->fullsync) { + /* yep, skip the sync_blocks here, but don't assume + * that there will never be anything to do here + */ + chunks_skipped = -1; + continue; + } r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); - spin_lock_irq(&conf->resync_lock); - conf->nr_pending++; - if (rb2) conf->barrier++; - spin_unlock_irq(&conf->resync_lock); + raise_barrier(conf, rb2 != NULL); atomic_set(&r10_bio->remaining, 0); r10_bio->master_bio = (struct bio*)rb2; @@ -1437,8 +1682,23 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i atomic_inc(&rb2->remaining); r10_bio->mddev = mddev; set_bit(R10BIO_IsRecover, &r10_bio->state); - r10_bio->sector = raid10_find_virt(conf, sector_nr, i); + r10_bio->sector = sect; + raid10_find_phys(conf, r10_bio); + /* Need to check if this section will still be + * degraded + */ + for (j=0; j<conf->copies;j++) { + int d = r10_bio->devs[j].devnum; + if (conf->mirrors[d].rdev == NULL || + test_bit(Faulty, &conf->mirrors[d].rdev->flags)) { + still_degraded = 1; + break; + } + } + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, still_degraded); + for (j=0; j<conf->copies;j++) { int d = r10_bio->devs[j].devnum; if (conf->mirrors[d].rdev && @@ -1498,14 +1758,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i } else { /* resync. Schedule a read for every block at this virt offset */ int count = 0; - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); - spin_lock_irq(&conf->resync_lock); - conf->nr_pending++; - spin_unlock_irq(&conf->resync_lock); + if (!bitmap_start_sync(mddev->bitmap, sector_nr, + &sync_blocks, mddev->degraded) && + !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* We can skip this block */ + *skipped = 1; + return sync_blocks + sectors_skipped; + } + if (sync_blocks < max_sync) + max_sync = sync_blocks; + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); r10_bio->mddev = mddev; atomic_set(&r10_bio->remaining, 0); + raise_barrier(conf, 0); + conf->next_resync = sector_nr; r10_bio->master_bio = NULL; r10_bio->sector = sector_nr; @@ -1558,6 +1826,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i } nr_sectors = 0; + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; do { struct page *page; int len = PAGE_SIZE; @@ -1632,11 +1902,11 @@ static int run(mddev_t *mddev) int nc, fc; sector_t stride, size; - if (mddev->level != 10) { - printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n", - mdname(mddev), mddev->level); - goto out; + if (mddev->chunk_size == 0) { + printk(KERN_ERR "md/raid10: non-zero chunk size required.\n"); + return -EINVAL; } + nc = mddev->layout & 255; fc = (mddev->layout >> 8) & 255; if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || @@ -1650,22 +1920,24 @@ static int run(mddev_t *mddev) * bookkeeping area. [whatever we allocate in run(), * should be freed in stop()] */ - conf = kmalloc(sizeof(conf_t), GFP_KERNEL); + conf = kzalloc(sizeof(conf_t), GFP_KERNEL); mddev->private = conf; if (!conf) { printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", mdname(mddev)); goto out; } - memset(conf, 0, sizeof(*conf)); - conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, + conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, GFP_KERNEL); if (!conf->mirrors) { printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", mdname(mddev)); goto out_free_conf; } - memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); + + conf->tmppage = alloc_page(GFP_KERNEL); + if (!conf->tmppage) + goto out_free_conf; conf->near_copies = nc; conf->far_copies = fc; @@ -1713,8 +1985,7 @@ static int run(mddev_t *mddev) INIT_LIST_HEAD(&conf->retry_list); spin_lock_init(&conf->resync_lock); - init_waitqueue_head(&conf->wait_idle); - init_waitqueue_head(&conf->wait_resume); + init_waitqueue_head(&conf->wait_barrier); /* need to check that every block has at least one working mirror */ if (!enough(conf)) { @@ -1763,7 +2034,7 @@ static int run(mddev_t *mddev) * maybe... */ { - int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; + int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE; stripe /= conf->near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; @@ -1776,6 +2047,7 @@ static int run(mddev_t *mddev) out_free_conf: if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); + safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); mddev->private = NULL; @@ -1798,10 +2070,31 @@ static int stop(mddev_t *mddev) return 0; } +static void raid10_quiesce(mddev_t *mddev, int state) +{ + conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 1: + raise_barrier(conf, 0); + break; + case 0: + lower_barrier(conf); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} -static mdk_personality_t raid10_personality = +static struct mdk_personality raid10_personality = { .name = "raid10", + .level = 10, .owner = THIS_MODULE, .make_request = make_request, .run = run, @@ -1812,19 +2105,22 @@ static mdk_personality_t raid10_personality = .hot_remove_disk= raid10_remove_disk, .spare_active = raid10_spare_active, .sync_request = sync_request, + .quiesce = raid10_quiesce, }; static int __init raid_init(void) { - return register_md_personality(RAID10, &raid10_personality); + return register_md_personality(&raid10_personality); } static void raid_exit(void) { - unregister_md_personality(RAID10); + unregister_md_personality(&raid10_personality); } module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ +MODULE_ALIAS("md-raid10"); +MODULE_ALIAS("md-level-10"); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 36d5f8ac826..25976bfb6f9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -35,12 +35,10 @@ #define STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) #define IO_THRESHOLD 1 -#define HASH_PAGES 1 -#define HASH_PAGES_ORDER 0 -#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) +#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and @@ -71,7 +69,7 @@ static void print_raid5_conf (raid5_conf_t *conf); -static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) +static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { if (!list_empty(&sh->lru)) @@ -98,7 +96,7 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) list_add_tail(&sh->lru, &conf->inactive_list); atomic_dec(&conf->active_stripes); if (!conf->inactive_blocked || - atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4)) + atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) wake_up(&conf->wait_for_stripe); } } @@ -113,29 +111,21 @@ static void release_stripe(struct stripe_head *sh) spin_unlock_irqrestore(&conf->device_lock, flags); } -static void remove_hash(struct stripe_head *sh) +static inline void remove_hash(struct stripe_head *sh) { PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); - if (sh->hash_pprev) { - if (sh->hash_next) - sh->hash_next->hash_pprev = sh->hash_pprev; - *sh->hash_pprev = sh->hash_next; - sh->hash_pprev = NULL; - } + hlist_del_init(&sh->hash); } -static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) +static void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(conf, sh->sector); + struct hlist_head *hp = stripe_hash(conf, sh->sector); PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); CHECK_DEVLOCK(); - if ((sh->hash_next = *shp) != NULL) - (*shp)->hash_pprev = &sh->hash_next; - *shp = sh; - sh->hash_pprev = shp; + hlist_add_head(&sh->hash, hp); } @@ -167,7 +157,7 @@ static void shrink_buffers(struct stripe_head *sh, int num) if (!p) continue; sh->dev[i].page = NULL; - page_cache_release(p); + put_page(p); } } @@ -188,7 +178,7 @@ static int grow_buffers(struct stripe_head *sh, int num) static void raid5_build_block (struct stripe_head *sh, int i); -static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) +static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) { raid5_conf_t *conf = sh->raid_conf; int disks = conf->raid_disks, i; @@ -228,10 +218,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) { struct stripe_head *sh; + struct hlist_node *hn; CHECK_DEVLOCK(); PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); - for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) + hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) if (sh->sector == sector) return sh; PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); @@ -264,7 +255,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector conf->inactive_blocked = 1; wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4) + (atomic_read(&conf->active_stripes) + < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, unplug_slaves(conf->mddev); @@ -416,7 +408,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, set_bit(R5_UPTODATE, &sh->dev[i].flags); #endif if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - printk("R5: read error corrected!!\n"); + printk(KERN_INFO "raid5: read error corrected!!\n"); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -427,13 +419,14 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&conf->disks[i].rdev->read_errors); if (conf->mddev->degraded) - printk("R5: read error not correctable.\n"); + printk(KERN_WARNING "raid5: read error not correctable.\n"); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk("R5: read error NOT corrected!!\n"); + printk(KERN_WARNING "raid5: read error NOT corrected!!\n"); else if (atomic_read(&conf->disks[i].rdev->read_errors) > conf->max_nr_stripes) - printk("raid5: Too many read errors, failing device.\n"); + printk(KERN_WARNING + "raid5: Too many read errors, failing device.\n"); else retry = 1; if (retry) @@ -603,7 +596,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; break; default: - printk("raid5: unsupported algorithm %d\n", + printk(KERN_ERR "raid5: unsupported algorithm %d\n", conf->algorithm); } @@ -644,7 +637,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i -= (sh->pd_idx + 1); break; default: - printk("raid5: unsupported algorithm %d\n", + printk(KERN_ERR "raid5: unsupported algorithm %d\n", conf->algorithm); } @@ -653,7 +646,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { - printk("compute_blocknr: map not correct\n"); + printk(KERN_ERR "compute_blocknr: map not correct\n"); return 0; } return r_sector; @@ -736,7 +729,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx) if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) ptr[count++] = p; else - printk("compute_block() %d, stripe %llu, %d" + printk(KERN_ERR "compute_block() %d, stripe %llu, %d" " not present\n", dd_idx, (unsigned long long)sh->sector, i); @@ -959,11 +952,11 @@ static void handle_stripe(struct stripe_head *sh) syncing = test_bit(STRIPE_SYNCING, &sh->state); /* Now to look around and see what can be done */ + rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - clear_bit(R5_Syncio, &dev->flags); PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -1002,9 +995,9 @@ static void handle_stripe(struct stripe_head *sh) non_overwrite++; } if (dev->written) written++; - rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ + rdev = rcu_dereference(conf->disks[i].rdev); if (!rdev || !test_bit(In_sync, &rdev->flags)) { - /* The ReadError flag wil just be confusing now */ + /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } @@ -1015,6 +1008,7 @@ static void handle_stripe(struct stripe_head *sh) } else set_bit(R5_Insync, &dev->flags); } + rcu_read_unlock(); PRINTK("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", locked, uptodate, to_read, to_write, failed, failed_num); @@ -1026,10 +1020,13 @@ static void handle_stripe(struct stripe_head *sh) int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - mdk_rdev_t *rdev = conf->disks[i].rdev; + mdk_rdev_t *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) /* multiple read failures in one stripe */ md_error(conf->mddev, rdev); + rcu_read_unlock(); } spin_lock_irq(&conf->device_lock); @@ -1178,9 +1175,6 @@ static void handle_stripe(struct stripe_head *sh) locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); - if (syncing) - md_sync_acct(conf->disks[i].rdev->bdev, - STRIPE_SECTORS); } } } @@ -1287,7 +1281,7 @@ static void handle_stripe(struct stripe_head *sh) * is available */ if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { + !test_bit(STRIPE_INSYNC, &sh->state)) { set_bit(STRIPE_HANDLE, &sh->state); if (failed == 0) { char *pagea; @@ -1305,27 +1299,25 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); + else { + compute_block(sh, sh->pd_idx); + uptodate++; + } } } if (!test_bit(STRIPE_INSYNC, &sh->state)) { + /* either failed parity check, or recovery is happening */ if (failed==0) failed_num = sh->pd_idx; - /* should be able to compute the missing block and write it to spare */ - if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) { - if (uptodate+1 != disks) - BUG(); - compute_block(sh, failed_num); - uptodate++; - } - if (uptodate != disks) - BUG(); dev = &sh->dev[failed_num]; + BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); + BUG_ON(uptodate != disks); + set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); - set_bit(R5_Syncio, &dev->flags); } } if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@ -1391,7 +1383,7 @@ static void handle_stripe(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(R5_Syncio, &sh->dev[i].flags)) + if (syncing) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; @@ -1408,6 +1400,9 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); generic_make_request(bi); } else { if (rw == 1) @@ -1420,7 +1415,7 @@ static void handle_stripe(struct stripe_head *sh) } } -static inline void raid5_activate_delayed(raid5_conf_t *conf) +static void raid5_activate_delayed(raid5_conf_t *conf) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { while (!list_empty(&conf->delayed_list)) { @@ -1436,7 +1431,7 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) } } -static inline void activate_bit_delay(raid5_conf_t *conf) +static void activate_bit_delay(raid5_conf_t *conf) { /* device_lock is held */ struct list_head head; @@ -1821,21 +1816,21 @@ static int run(mddev_t *mddev) struct list_head *tmp; if (mddev->level != 5 && mddev->level != 4) { - printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level); + printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n", + mdname(mddev), mddev->level); return -EIO; } - mddev->private = kmalloc (sizeof (raid5_conf_t) - + mddev->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); + mddev->private = kzalloc(sizeof (raid5_conf_t) + + mddev->raid_disks * sizeof(struct disk_info), + GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; - memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); + conf->mddev = mddev; - if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; - memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); @@ -1902,10 +1897,17 @@ static int run(mddev_t *mddev) if (mddev->degraded == 1 && mddev->recovery_cp != MaxSector) { - printk(KERN_ERR - "raid5: cannot start dirty degraded array for %s\n", - mdname(mddev)); - goto abort; + if (mddev->ok_start_degraded) + printk(KERN_WARNING + "raid5: starting dirty degraded array: %s" + "- data corruption possible.\n", + mdname(mddev)); + else { + printk(KERN_ERR + "raid5: cannot start dirty degraded array for %s\n", + mdname(mddev)); + goto abort; + } } { @@ -1917,7 +1919,7 @@ static int run(mddev_t *mddev) goto abort; } } -memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; if (grow_stripes(conf, conf->max_nr_stripes)) { printk(KERN_ERR @@ -1947,7 +1949,7 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + */ { int stripe = (mddev->raid_disks-1) * mddev->chunk_size - / PAGE_CACHE_SIZE; + / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } @@ -1955,9 +1957,6 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + /* Ok, everything is just fine now */ sysfs_create_group(&mddev->kobj, &raid5_attrs_group); - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; @@ -1966,9 +1965,7 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + abort: if (conf) { print_raid5_conf(conf); - if (conf->stripe_hashtbl) - free_pages((unsigned long) conf->stripe_hashtbl, - HASH_PAGES_ORDER); + kfree(conf->stripe_hashtbl); kfree(conf); } mddev->private = NULL; @@ -1985,7 +1982,7 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; shrink_stripes(conf); - free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf->stripe_hashtbl); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); kfree(conf); @@ -2013,12 +2010,12 @@ static void print_sh (struct stripe_head *sh) static void printall (raid5_conf_t *conf) { struct stripe_head *sh; + struct hlist_node *hn; int i; spin_lock_irq(&conf->device_lock); for (i = 0; i < NR_HASH; i++) { - sh = conf->stripe_hashtbl[i]; - for (; sh; sh = sh->hash_next) { + hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { if (sh->raid_conf != conf) continue; print_sh(sh); @@ -2191,17 +2188,12 @@ static void raid5_quiesce(mddev_t *mddev, int state) spin_unlock_irq(&conf->device_lock); break; } - if (mddev->thread) { - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - else - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - md_wakeup_thread(mddev->thread); - } } -static mdk_personality_t raid5_personality= + +static struct mdk_personality raid5_personality = { .name = "raid5", + .level = 5, .owner = THIS_MODULE, .make_request = make_request, .run = run, @@ -2216,17 +2208,42 @@ static mdk_personality_t raid5_personality= .quiesce = raid5_quiesce, }; -static int __init raid5_init (void) +static struct mdk_personality raid4_personality = { - return register_md_personality (RAID5, &raid5_personality); + .name = "raid4", + .level = 4, + .owner = THIS_MODULE, + .make_request = make_request, + .run = run, + .stop = stop, + .status = status, + .error_handler = error, + .hot_add_disk = raid5_add_disk, + .hot_remove_disk= raid5_remove_disk, + .spare_active = raid5_spare_active, + .sync_request = sync_request, + .resize = raid5_resize, + .quiesce = raid5_quiesce, +}; + +static int __init raid5_init(void) +{ + register_md_personality(&raid5_personality); + register_md_personality(&raid4_personality); + return 0; } -static void raid5_exit (void) +static void raid5_exit(void) { - unregister_md_personality (RAID5); + unregister_md_personality(&raid5_personality); + unregister_md_personality(&raid4_personality); } module_init(raid5_init); module_exit(raid5_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-4"); /* RAID5 */ +MODULE_ALIAS("md-raid5"); +MODULE_ALIAS("md-raid4"); +MODULE_ALIAS("md-level-5"); +MODULE_ALIAS("md-level-4"); diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 0000d162d19..f618a53b98b 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -40,12 +40,10 @@ #define STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) #define IO_THRESHOLD 1 -#define HASH_PAGES 1 -#define HASH_PAGES_ORDER 0 -#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) +#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and @@ -90,7 +88,7 @@ static inline int raid6_next_disk(int disk, int raid_disks) static void print_raid6_conf (raid6_conf_t *conf); -static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) +static void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { if (!list_empty(&sh->lru)) @@ -132,29 +130,21 @@ static void release_stripe(struct stripe_head *sh) spin_unlock_irqrestore(&conf->device_lock, flags); } -static void remove_hash(struct stripe_head *sh) +static inline void remove_hash(struct stripe_head *sh) { PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); - if (sh->hash_pprev) { - if (sh->hash_next) - sh->hash_next->hash_pprev = sh->hash_pprev; - *sh->hash_pprev = sh->hash_next; - sh->hash_pprev = NULL; - } + hlist_del_init(&sh->hash); } -static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) +static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(conf, sh->sector); + struct hlist_head *hp = stripe_hash(conf, sh->sector); PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); CHECK_DEVLOCK(); - if ((sh->hash_next = *shp) != NULL) - (*shp)->hash_pprev = &sh->hash_next; - *shp = sh; - sh->hash_pprev = shp; + hlist_add_head(&sh->hash, hp); } @@ -186,7 +176,7 @@ static void shrink_buffers(struct stripe_head *sh, int num) if (!p) continue; sh->dev[i].page = NULL; - page_cache_release(p); + put_page(p); } } @@ -207,7 +197,7 @@ static int grow_buffers(struct stripe_head *sh, int num) static void raid6_build_block (struct stripe_head *sh, int i); -static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) +static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx) { raid6_conf_t *conf = sh->raid_conf; int disks = conf->raid_disks, i; @@ -247,10 +237,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) { struct stripe_head *sh; + struct hlist_node *hn; CHECK_DEVLOCK(); PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); - for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) + hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash) if (sh->sector == sector) return sh; PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); @@ -367,8 +358,8 @@ static void shrink_stripes(raid6_conf_t *conf) conf->slab_cache = NULL; } -static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, - int error) +static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done, + int error) { struct stripe_head *sh = bi->bi_private; raid6_conf_t *conf = sh->raid_conf; @@ -420,9 +411,35 @@ static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, #else set_bit(R5_UPTODATE, &sh->dev[i].flags); #endif + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + printk(KERN_INFO "raid6: read error corrected!!\n"); + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + } + if (atomic_read(&conf->disks[i].rdev->read_errors)) + atomic_set(&conf->disks[i].rdev->read_errors, 0); } else { - md_error(conf->mddev, conf->disks[i].rdev); + int retry = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); + atomic_inc(&conf->disks[i].rdev->read_errors); + if (conf->mddev->degraded) + printk(KERN_WARNING "raid6: read error not correctable.\n"); + else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + /* Oh, no!!! */ + printk(KERN_WARNING "raid6: read error NOT corrected!!\n"); + else if (atomic_read(&conf->disks[i].rdev->read_errors) + > conf->max_nr_stripes) + printk(KERN_WARNING + "raid6: Too many read errors, failing device.\n"); + else + retry = 1; + if (retry) + set_bit(R5_ReadError, &sh->dev[i].flags); + else { + clear_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReWrite, &sh->dev[i].flags); + md_error(conf->mddev, conf->disks[i].rdev); + } } rdev_dec_pending(conf->disks[i].rdev, conf->mddev); #if 0 @@ -805,7 +822,7 @@ static void compute_parity(struct stripe_head *sh, int method) } /* Compute one missing block */ -static void compute_block_1(struct stripe_head *sh, int dd_idx) +static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { raid6_conf_t *conf = sh->raid_conf; int i, count, disks = conf->raid_disks; @@ -821,7 +838,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) compute_parity(sh, UPDATE_PARITY); } else { ptr[0] = page_address(sh->dev[dd_idx].page); - memset(ptr[0], 0, STRIPE_SIZE); + if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); count = 1; for (i = disks ; i--; ) { if (i == dd_idx || i == qd_idx) @@ -838,7 +855,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx) } if (count != 1) xor_block(count, STRIPE_SIZE, ptr); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); + else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); } } @@ -871,7 +889,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) return; } else { /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); + compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ return; } @@ -982,6 +1000,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in } +static int page_is_zero(struct page *p) +{ + char *a = page_address(p); + return ((*(u32*)a) == 0 && + memcmp(a, a+4, STRIPE_SIZE-4)==0); +} /* * handle_stripe - do things to a stripe. * @@ -1000,7 +1024,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in * */ -static void handle_stripe(struct stripe_head *sh) +static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) { raid6_conf_t *conf = sh->raid_conf; int disks = conf->raid_disks; @@ -1027,11 +1051,11 @@ static void handle_stripe(struct stripe_head *sh) syncing = test_bit(STRIPE_SYNCING, &sh->state); /* Now to look around and see what can be done */ + rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - clear_bit(R5_Syncio, &dev->flags); PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); @@ -1070,14 +1094,21 @@ static void handle_stripe(struct stripe_head *sh) non_overwrite++; } if (dev->written) written++; - rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ + rdev = rcu_dereference(conf->disks[i].rdev); if (!rdev || !test_bit(In_sync, &rdev->flags)) { + /* The ReadError flag will just be confusing now */ + clear_bit(R5_ReadError, &dev->flags); + clear_bit(R5_ReWrite, &dev->flags); + } + if (!rdev || !test_bit(In_sync, &rdev->flags) + || test_bit(R5_ReadError, &dev->flags)) { if ( failed < 2 ) failed_num[failed] = i; failed++; } else set_bit(R5_Insync, &dev->flags); } + rcu_read_unlock(); PRINTK("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", locked, uptodate, to_read, to_write, failed, @@ -1088,6 +1119,17 @@ static void handle_stripe(struct stripe_head *sh) if (failed > 2 && to_read+to_write+written) { for (i=disks; i--; ) { int bitmap_end = 0; + + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + mdk_rdev_t *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) + /* multiple read failures in one stripe */ + md_error(conf->mddev, rdev); + rcu_read_unlock(); + } + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; @@ -1123,7 +1165,8 @@ static void handle_stripe(struct stripe_head *sh) } /* fail any reads if this device is non-operational */ - if (!test_bit(R5_Insync, &sh->dev[i].flags)) { + if (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags)) { bi = sh->dev[i].toread; sh->dev[i].toread = NULL; if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) @@ -1228,7 +1271,7 @@ static void handle_stripe(struct stripe_head *sh) if (uptodate == disks-1) { PRINTK("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); - compute_block_1(sh, i); + compute_block_1(sh, i, 0); uptodate++; } else if ( uptodate == disks-2 && failed >= 2 ) { /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ @@ -1259,9 +1302,6 @@ static void handle_stripe(struct stripe_head *sh) locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); - if (syncing) - md_sync_acct(conf->disks[i].rdev->bdev, - STRIPE_SECTORS); } } } @@ -1323,7 +1363,7 @@ static void handle_stripe(struct stripe_head *sh) /* We have failed blocks and need to compute them */ switch ( failed ) { case 0: BUG(); - case 1: compute_block_1(sh, failed_num[0]); break; + case 1: compute_block_1(sh, failed_num[0], 0); break; case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; default: BUG(); /* This request should have been failed? */ } @@ -1338,12 +1378,10 @@ static void handle_stripe(struct stripe_head *sh) (unsigned long long)sh->sector, i); locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); -#if 0 /**** FIX: I don't understand the logic here... ****/ - if (!test_bit(R5_Insync, &sh->dev[i].flags) - || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ - set_bit(STRIPE_INSYNC, &sh->state); -#endif } + /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ + set_bit(STRIPE_INSYNC, &sh->state); + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -1356,84 +1394,119 @@ static void handle_stripe(struct stripe_head *sh) * Any reads will already have been scheduled, so we just see if enough data * is available */ - if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { - set_bit(STRIPE_HANDLE, &sh->state); -#if 0 /* RAID-6: Don't support CHECK PARITY yet */ - if (failed == 0) { - char *pagea; - if (uptodate != disks) - BUG(); - compute_parity(sh, CHECK_PARITY); - uptodate--; - pagea = page_address(sh->dev[pd_idx].page); - if ((*(u32*)pagea) == 0 && - !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { - /* parity is correct (on disc, not in buffer any more) */ - set_bit(STRIPE_INSYNC, &sh->state); - } - } -#endif - if (!test_bit(STRIPE_INSYNC, &sh->state)) { - int failed_needupdate[2]; - struct r5dev *adev, *bdev; - - if ( failed < 1 ) - failed_num[0] = pd_idx; - if ( failed < 2 ) - failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx; + if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { + int update_p = 0, update_q = 0; + struct r5dev *dev; - failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); - failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); + set_bit(STRIPE_HANDLE, &sh->state); - PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", - failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); + BUG_ON(failed>2); + BUG_ON(uptodate < disks); + /* Want to check and possibly repair P and Q. + * However there could be one 'failed' device, in which + * case we can only check one of them, possibly using the + * other to generate missing data + */ -#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ - /* should be able to compute the missing block(s) and write to spare */ - if ( failed_needupdate[0] ^ failed_needupdate[1] ) { - if (uptodate+1 != disks) - BUG(); - compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); - uptodate++; - } else if ( failed_needupdate[0] & failed_needupdate[1] ) { - if (uptodate+2 != disks) - BUG(); - compute_block_2(sh, failed_num[0], failed_num[1]); - uptodate += 2; + /* If !tmp_page, we cannot do the calculations, + * but as we have set STRIPE_HANDLE, we will soon be called + * by stripe_handle with a tmp_page - just wait until then. + */ + if (tmp_page) { + if (failed == q_failed) { + /* The only possible failed device holds 'Q', so it makes + * sense to check P (If anything else were failed, we would + * have used P to recreate it). + */ + compute_block_1(sh, pd_idx, 1); + if (!page_is_zero(sh->dev[pd_idx].page)) { + compute_block_1(sh,pd_idx,0); + update_p = 1; + } + } + if (!q_failed && failed < 2) { + /* q is not failed, and we didn't use it to generate + * anything, so it makes sense to check it + */ + memcpy(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE); + compute_parity(sh, UPDATE_PARITY); + if (memcmp(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE)!= 0) { + clear_bit(STRIPE_INSYNC, &sh->state); + update_q = 1; + } + } + if (update_p || update_q) { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + update_p = update_q = 0; } -#else - compute_block_2(sh, failed_num[0], failed_num[1]); - uptodate += failed_needupdate[0] + failed_needupdate[1]; -#endif - if (uptodate != disks) - BUG(); + /* now write out any block on a failed drive, + * or P or Q if they need it + */ - PRINTK("Marking for sync stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, failed_num[0], failed_num[1]); + if (failed == 2) { + dev = &sh->dev[failed_num[1]]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (failed >= 1) { + dev = &sh->dev[failed_num[0]]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } - /**** FIX: Should we really do both of these unconditionally? ****/ - adev = &sh->dev[failed_num[0]]; - locked += !test_bit(R5_LOCKED, &adev->flags); - set_bit(R5_LOCKED, &adev->flags); - set_bit(R5_Wantwrite, &adev->flags); - bdev = &sh->dev[failed_num[1]]; - locked += !test_bit(R5_LOCKED, &bdev->flags); - set_bit(R5_LOCKED, &bdev->flags); + if (update_p) { + dev = &sh->dev[pd_idx]; + locked ++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } + if (update_q) { + dev = &sh->dev[qd_idx]; + locked++; + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + } clear_bit(STRIPE_DEGRADED, &sh->state); - set_bit(R5_Wantwrite, &bdev->flags); set_bit(STRIPE_INSYNC, &sh->state); - set_bit(R5_Syncio, &adev->flags); - set_bit(R5_Syncio, &bdev->flags); } } + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state); } + /* If the failed drives are just a ReadError, then we might need + * to progress the repair/check process + */ + if (failed <= 2 && ! conf->mddev->ro) + for (i=0; i<failed;i++) { + dev = &sh->dev[failed_num[i]]; + if (test_bit(R5_ReadError, &dev->flags) + && !test_bit(R5_LOCKED, &dev->flags) + && test_bit(R5_UPTODATE, &dev->flags) + ) { + if (!test_bit(R5_ReWrite, &dev->flags)) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_ReWrite, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } else { + /* let's read it back */ + set_bit(R5_Wantread, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + } + } + } spin_unlock(&sh->lock); while ((bi=return_bi)) { @@ -1472,7 +1545,7 @@ static void handle_stripe(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(R5_Syncio, &sh->dev[i].flags)) + if (syncing) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; @@ -1489,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); generic_make_request(bi); } else { if (rw == 1) @@ -1501,7 +1577,7 @@ static void handle_stripe(struct stripe_head *sh) } } -static inline void raid6_activate_delayed(raid6_conf_t *conf) +static void raid6_activate_delayed(raid6_conf_t *conf) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { while (!list_empty(&conf->delayed_list)) { @@ -1517,7 +1593,7 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf) } } -static inline void activate_bit_delay(raid6_conf_t *conf) +static void activate_bit_delay(raid6_conf_t *conf) { /* device_lock is held */ struct list_head head; @@ -1664,7 +1740,7 @@ static int make_request (request_queue_t *q, struct bio * bi) } finish_wait(&conf->wait_for_overlap, &w); raid6_plug_device(conf); - handle_stripe(sh); + handle_stripe(sh, NULL); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -1728,6 +1804,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return rv; } if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { /* we can skip this block, and probably more */ sync_blocks /= STRIPE_SECTORS; @@ -1765,7 +1842,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i clear_bit(STRIPE_INSYNC, &sh->state); spin_unlock(&sh->lock); - handle_stripe(sh); + handle_stripe(sh, NULL); release_stripe(sh); return STRIPE_SECTORS; @@ -1821,7 +1898,7 @@ static void raid6d (mddev_t *mddev) spin_unlock_irq(&conf->device_lock); handled++; - handle_stripe(sh); + handle_stripe(sh, conf->spare_page); release_stripe(sh); spin_lock_irq(&conf->device_lock); @@ -1848,17 +1925,19 @@ static int run(mddev_t *mddev) return -EIO; } - mddev->private = kmalloc (sizeof (raid6_conf_t) - + mddev->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); + mddev->private = kzalloc(sizeof (raid6_conf_t) + + mddev->raid_disks * sizeof(struct disk_info), + GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; - memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); conf->mddev = mddev; - if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) + goto abort; + + conf->spare_page = alloc_page(GFP_KERNEL); + if (!conf->spare_page) goto abort; - memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); @@ -1929,13 +2008,18 @@ static int run(mddev_t *mddev) goto abort; } -#if 0 /* FIX: For now */ if (mddev->degraded > 0 && mddev->recovery_cp != MaxSector) { - printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); - goto abort; + if (mddev->ok_start_degraded) + printk(KERN_WARNING "raid6: starting dirty degraded array:%s" + "- data corruption possible.\n", + mdname(mddev)); + else { + printk(KERN_ERR "raid6: cannot start dirty degraded array" + " for %s\n", mdname(mddev)); + goto abort; + } } -#endif { mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); @@ -1977,7 +2061,7 @@ static int run(mddev_t *mddev) */ { int stripe = (mddev->raid_disks-2) * mddev->chunk_size - / PAGE_CACHE_SIZE; + / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } @@ -1985,18 +2069,14 @@ static int run(mddev_t *mddev) /* Ok, everything is just fine now */ mddev->array_size = mddev->size * (mddev->raid_disks - 2); - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - mddev->queue->unplug_fn = raid6_unplug_device; mddev->queue->issue_flush_fn = raid6_issue_flush; return 0; abort: if (conf) { print_raid6_conf(conf); - if (conf->stripe_hashtbl) - free_pages((unsigned long) conf->stripe_hashtbl, - HASH_PAGES_ORDER); + safe_put_page(conf->spare_page); + kfree(conf->stripe_hashtbl); kfree(conf); } mddev->private = NULL; @@ -2013,7 +2093,7 @@ static int stop (mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; shrink_stripes(conf); - free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf->stripe_hashtbl); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); mddev->private = NULL; @@ -2040,12 +2120,13 @@ static void print_sh (struct seq_file *seq, struct stripe_head *sh) static void printall (struct seq_file *seq, raid6_conf_t *conf) { struct stripe_head *sh; + struct hlist_node *hn; int i; spin_lock_irq(&conf->device_lock); for (i = 0; i < NR_HASH; i++) { sh = conf->stripe_hashtbl[i]; - for (; sh; sh = sh->hash_next) { + hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { if (sh->raid_conf != conf) continue; print_sh(seq, sh); @@ -2223,17 +2304,12 @@ static void raid6_quiesce(mddev_t *mddev, int state) spin_unlock_irq(&conf->device_lock); break; } - if (mddev->thread) { - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - else - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - md_wakeup_thread(mddev->thread); - } } -static mdk_personality_t raid6_personality= + +static struct mdk_personality raid6_personality = { .name = "raid6", + .level = 6, .owner = THIS_MODULE, .make_request = make_request, .run = run, @@ -2248,7 +2324,7 @@ static mdk_personality_t raid6_personality= .quiesce = raid6_quiesce, }; -static int __init raid6_init (void) +static int __init raid6_init(void) { int e; @@ -2256,15 +2332,17 @@ static int __init raid6_init (void) if ( e ) return e; - return register_md_personality (RAID6, &raid6_personality); + return register_md_personality(&raid6_personality); } static void raid6_exit (void) { - unregister_md_personality (RAID6); + unregister_md_personality(&raid6_personality); } module_init(raid6_init); module_exit(raid6_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-8"); /* RAID6 */ +MODULE_ALIAS("md-raid6"); +MODULE_ALIAS("md-level-6"); |