From eea1bf384e05b5ab747f8530c4fba9e9e6907fff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:02 +1100 Subject: md: Fix is_mddev_idle test (again). There are two problems with is_mddev_idle. 1/ sync_io is 'atomic_t' and hence 'int'. curr_events and all the rest are 'long'. So if sync_io were to wrap on a 64bit host, the value of curr_events would go very negative suddenly, and take a very long time to return to positive. So do all calculations as 'int'. That gives us plenty of precision for what we need. 2/ To initialise rdev->last_events we simply call is_mddev_idle, on the assumption that it will make sure that last_events is in a suitable range. It used to do this, but now it does not. So now we need to be more explicit about initialisation. Signed-off-by: NeilBrown --- include/linux/raid/md_k.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 9743e4dbc91..4aedb9fe2bd 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -51,7 +51,7 @@ struct mdk_rdev_s sector_t size; /* Device size (in blocks) */ mddev_t *mddev; /* RAID array if running */ - long last_events; /* IO event timestamp */ + int last_events; /* IO event timestamp */ struct block_device *bdev; /* block device handle */ -- cgit v1.2.3-70-g09d2 From ef740c372dfd80e706dbf955d4e4aedda6c0c148 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move headers out of include/linux/raid/ Move the headers with the local structures for the disciplines and bitmap.h into drivers/md/ so that they are more easily grepable for hacking and not far away. md.h is left where it is for now as there are some uses from the outside. Signed-off-by: Christoph Hellwig Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/bitmap.h | 288 +++++++++++++++++++++++++++++ drivers/md/linear.c | 2 +- drivers/md/linear.h | 31 ++++ drivers/md/md.c | 2 +- drivers/md/multipath.c | 2 +- drivers/md/multipath.h | 42 +++++ drivers/md/raid0.c | 2 +- drivers/md/raid0.h | 30 +++ drivers/md/raid1.c | 4 +- drivers/md/raid1.h | 134 ++++++++++++++ drivers/md/raid10.c | 4 +- drivers/md/raid10.h | 123 +++++++++++++ drivers/md/raid5.c | 5 +- drivers/md/raid5.h | 402 +++++++++++++++++++++++++++++++++++++++++ drivers/md/raid6.h | 2 +- include/linux/raid/bitmap.h | 288 ----------------------------- include/linux/raid/linear.h | 31 ---- include/linux/raid/multipath.h | 42 ----- include/linux/raid/raid0.h | 30 --- include/linux/raid/raid1.h | 134 -------------- include/linux/raid/raid10.h | 123 ------------- include/linux/raid/raid5.h | 402 ----------------------------------------- 23 files changed, 1062 insertions(+), 1063 deletions(-) create mode 100644 drivers/md/bitmap.h create mode 100644 drivers/md/linear.h create mode 100644 drivers/md/multipath.h create mode 100644 drivers/md/raid0.h create mode 100644 drivers/md/raid1.h create mode 100644 drivers/md/raid10.h create mode 100644 drivers/md/raid5.h delete mode 100644 include/linux/raid/bitmap.h delete mode 100644 include/linux/raid/linear.h delete mode 100644 include/linux/raid/multipath.h delete mode 100644 include/linux/raid/raid0.h delete mode 100644 include/linux/raid/raid1.h delete mode 100644 include/linux/raid/raid10.h delete mode 100644 include/linux/raid/raid5.h (limited to 'include/linux/raid') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 27f978dfe6a..7666117738c 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "bitmap.h" /* debug macros */ diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 00000000000..e98900671ca --- /dev/null +++ b/drivers/md/bitmap.h @@ -0,0 +1,288 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ + BITMAP_HOSTENDIAN = 0x8000, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * count of dirty bits on the page + */ + unsigned int count:31; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + int need_sync; + + /* bitmap spinlock */ + spinlock_t lock; + + long offset; /* offset from superblock if file is NULL */ + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + int last_page_size; /* bytes in the last page */ + + unsigned long flags; + + int allclean; + + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long daemon_sleep; /* how many seconds between updates? */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_flush(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct bitmap *bitmap); +#endif + +#endif diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 09658b21847..3603ffa9edc 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,7 +16,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "linear.h" /* * find which device holds a particular offset diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 00000000000..f38b9c586af --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,31 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + mdk_rdev_t *rdev; + sector_t num_sectors; + sector_t start_sector; +}; + +typedef struct dev_info dev_info_t; + +struct linear_private_data +{ + struct linear_private_data *prev; /* earlier version */ + dev_info_t **hash_table; + sector_t spacing; + sector_t array_sectors; + int sector_shift; /* shift before dividing + * by spacing + */ + dev_info_t disks[0]; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 3efc0bceada..9a3214c8585 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -34,7 +34,6 @@ #include #include -#include #include #include /* for invalidate_bdev */ #include @@ -45,6 +44,7 @@ #include #include #include +#include "bitmap.h" /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index f6d08f24167..547df09a7af 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,7 +19,7 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 00000000000..6f53fc177a4 --- /dev/null +++ b/drivers/md/multipath.h @@ -0,0 +1,42 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +#include + +struct multipath_info { + mdk_rdev_t *rdev; +}; + +struct multipath_private_data { + mddev_t *mddev; + struct multipath_info *multipaths; + int raid_disks; + int working_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +typedef struct multipath_private_data multipath_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + mddev_t *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c605ba80558..ef09ed04864 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,7 +18,7 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include +#include "raid0.h" static void raid0_unplug(struct request_queue *q) { diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 00000000000..fd42aa87c39 --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,30 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + sector_t zone_start; /* Zone offset in md_dev (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + sector_t sectors; /* Zone size in sectors */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t **dev; /* Devices attached to the zone */ +}; + +struct raid0_private_data +{ + struct strip_zone **hash_table; /* Table of indexes into strip_zone */ + struct strip_zone *strip_zone; + mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ + int nr_strip_zones; + + sector_t spacing; + int sector_shift; /* shift this before divide by spacing */ +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e2466425d9c..bff32285f8b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid1.h" +#include "bitmap.h" #define DEBUG 0 #if DEBUG diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 00000000000..0a9ba7c3302 --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,134 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. + */ + +struct pool_info { + mddev_t *mddev; + int raid_disks; +}; + + +typedef struct r1bio_s r1bio_t; + +struct r1_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + int last_used; + sector_t next_seq_sect; + spinlock_t device_lock; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + /* queue of writes that have been unplugged */ + struct bio_list flushing_bio_list; + + /* for use when syncing mirrors: */ + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + struct pool_info *poolinfo; + + struct page *tmppage; + + mempool_t *r1bio_pool; + mempool_t *r1buf_pool; +}; + +typedef struct r1_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID1 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + struct bitmap_update *bitmap_update; + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated. + */ + struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r1bio.state */ +#define R1BIO_Uptodate 0 +#define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +#define R1BIO_Barrier 4 +#define R1BIO_BarrierRetry 5 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 6 + +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7301631abe0..f03dd70d12a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include "dm-bio-list.h" #include -#include -#include +#include "raid10.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 00000000000..e9091cfeb28 --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,123 @@ +#ifndef _RAID10_H +#define _RAID10_H + +#include + +typedef struct mirror_info mirror_info_t; + +struct mirror_info { + mdk_rdev_t *rdev; + sector_t head_position; +}; + +typedef struct r10bio_s r10bio_t; + +struct r10_private_data_s { + mddev_t *mddev; + mirror_info_t *mirrors; + int raid_disks; + spinlock_t device_lock; + + /* geometry */ + int near_copies; /* number of copies layed out raid0 style */ + int far_copies; /* number of copies layed out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 stripe + * instead of many + */ + int copies; /* near_copies * far_copies. + * must be <= raid_disks + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + wait_queue_head_t wait_barrier; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; + struct page *tmppage; +}; + +typedef struct r10_private_data_s conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((conf_t *) mddev->private) + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio_s { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + mddev_t *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + */ + struct { + struct bio *bio; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio*)1) + +/* bits for r10bio.state */ +#define R10BIO_Uptodate 0 +#define R10BIO_IsSync 1 +#define R10BIO_IsRecover 2 +#define R10BIO_Degraded 3 +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5ba080d303..f75698b1f63 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,10 +44,9 @@ */ #include -#include "raid6.h" - -#include #include +#include "raid6.h" +#include "bitmap.h" /* * Stripe cache diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 00000000000..40f1d0335c7 --- /dev/null +++ b/drivers/md/raid5.h @@ -0,0 +1,402 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +/* + * + * Each stripe contains one buffer per disc. Each buffer can be in + * one of a number of states stored in "flags". Changes between + * these states happen *almost* exclusively under a per-stripe + * spinlock. Some very specific changes can happen in bi_end_io, and + * these are not protected by the spin lock. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). + * To distingush these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. + * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and it's b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. + * + * When a buffer on the write list is committed for write it is copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list is + * protected by the device_lock. The write and written lists are + * protected by the stripe lock. The device_lock, which can be + * claimed while the stipe lock is held, is only for list + * manipulations and will only be held for a very short time. It can + * be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither). The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes on the inactive_list never have their stripe_lock held. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * lockstripe clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state .. + * record io/ops needed unlockstripe schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * Stripe operations are performed outside the stripe lock, + * the stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guaruntees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of sh->lock + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, + reconstruct_state_drain_result, + reconstruct_state_result, +}; + +struct stripe_head { + struct hlist_node hash; + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + sector_t sector; /* sector of this row */ + int pd_idx; /* parity disk index */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ + spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ + enum check_states check_state; + enum reconstruct_states reconstruct_state; + /* stripe_operations + * @target - STRIPE_OP_COMPUTE_BLK target + */ + struct stripe_operations { + int target; + u32 zero_sum_result; + } ops; + struct r5dev { + struct bio req; + struct bio_vec vec; + struct page *page; + struct bio *toread, *read, *towrite, *written; + sector_t sector; /* sector of this page */ + unsigned long flags; + } dev[1]; /* allocated with extra space depending of RAID geometry */ +}; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. It is only valid under spin_lock(sh->lock); + */ +struct stripe_head_state { + int syncing, expanding, expanded; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int failed_num; + unsigned long ops_request; +}; + +/* r6_state - extra state data only relevant to r6 */ +struct r6_state { + int p_failed, q_failed, qd_idx, failed_num[2]; +}; + +/* Flags */ +#define R5_UPTODATE 0 /* page contains current data */ +#define R5_LOCKED 1 /* IO has been submitted on "req" */ +#define R5_OVERWRITE 2 /* towrite covers whole page */ +/* and some that are internal to handle_stripe */ +#define R5_Insync 3 /* rdev && rdev->in_sync at start */ +#define R5_Wantread 4 /* want to schedule a read */ +#define R5_Wantwrite 5 +#define R5_Overlap 7 /* There is a pending overlapping request on this block */ +#define R5_ReadError 8 /* seen a read error here recently */ +#define R5_ReWrite 9 /* have tried to over-write the readerror */ + +#define R5_Expanded 10 /* This block now has post-expand data */ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ +/* + * Write method + */ +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 +/* not a write method, but a compute_parity mode */ +#define CHECK_PARITY 3 + +/* + * Stripe state + */ +#define STRIPE_HANDLE 2 +#define STRIPE_SYNCING 3 +#define STRIPE_INSYNC 4 +#define STRIPE_PREREAD_ACTIVE 5 +#define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 +#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ +#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ +#define STRIPE_BIOFILL_RUN 14 +#define STRIPE_COMPUTE_RUN 15 +/* + * Operation request flags + */ +#define STRIPE_OP_BIOFILL 0 +#define STRIPE_OP_COMPUTE_BLK 1 +#define STRIPE_OP_PREXOR 2 +#define STRIPE_OP_BIODRAIN 3 +#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_CHECK 5 + +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplug call is made. (the unplug_io_fn() is called). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. + * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the 'handle' queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set + * PREREAD_ACTIVE. + * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leave nothing locked. + */ + + +struct disk_info { + mdk_rdev_t *rdev; +}; + +struct raid5_private_data { + struct hlist_head *stripe_hashtbl; + mddev_t *mddev; + struct disk_info *spare; + int chunk_size, level, algorithm; + int max_degraded; + int raid_disks; + int max_nr_stripes; + + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress it out-of-bounds + * as we haven't flushed the metadata yet + */ + int previous_raid_disks; + + struct list_head handle_list; /* stripes needing handling */ + struct list_head hold_list; /* preread ready stripes */ + struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; + atomic_t pending_full_writes; /* full write backlog */ + int bypass_count; /* bypassed prereads */ + int bypass_threshold; /* preread nice */ + struct list_head *last_hold; /* detect hold_list promotions */ + + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + + struct page *spare_page; /* Used when checking P/Q in raid6 */ + + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list; + wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_overlap; + int inactive_blocked; /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index 98dcde88470..f6c13af6500 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -19,7 +19,7 @@ #define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -#include +#include "raid5.h" typedef raid5_conf_t raid6_conf_t; /* Same configuration */ diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h deleted file mode 100644 index e98900671ca..00000000000 --- a/include/linux/raid/bitmap.h +++ /dev/null @@ -1,288 +0,0 @@ -/* - * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - */ -#ifndef BITMAP_H -#define BITMAP_H 1 - -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -#define BITMAP_MINOR 39 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". - * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared aswell, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. - * - * If the counter is 0, the on-disk bit is clear and the stipe is clean - * Anything that dirties the stipe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) - -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SIZE 512 -#define BITMAP_BLOCK_SHIFT 9 - -/* how many blocks per chunk? (this is variable) */ -#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) -#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) - -/* when hijacked, the counters and bits represent even larger "chunks" */ -/* there will be 1024 chunks represented by each counter in the page pointers */ -#define PAGEPTR_BLOCK_RATIO(bitmap) \ - (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) -#define PAGEPTR_BLOCK_SHIFT(bitmap) \ - (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) -#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) - -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. - */ - -/* map chunks (bits) to file pages - offset by the size of the superblock */ -#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 - -/* use these for bitmap->flags and bitmap->sb->state bit-fields */ -enum bitmap_state { - BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ - BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ - BITMAP_HOSTENDIAN = 0x8000, -}; - -/* the superblock at the front of the bitmap file -- little endian */ -typedef struct bitmap_super_s { - __le32 magic; /* 0 BITMAP_MAGIC */ - __le32 version; /* 4 the bitmap major for now, could change... */ - __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ - __le64 events; /* 24 event counter for the bitmap (1)*/ - __le64 events_cleared;/*32 event counter when last bit cleared (2) */ - __le64 sync_size; /* 40 the size of the md device's sync range(3) */ - __le32 state; /* 48 bitmap state information */ - __le32 chunksize; /* 52 the bitmap chunk size in bytes */ - __le32 daemon_sleep; /* 56 seconds between disk flushes */ - __le32 write_behind; /* 60 number of outstanding write-behind writes */ - - __u8 pad[256 - 64]; /* set to zero */ -} bitmap_super_t; - -/* notes: - * (1) This event counter is updated before the eventcounter in the md superblock - * When a bitmap is loaded, it is only accepted if this event counter is equal - * to, or one greater than, the event counter in the superblock. - * (2) This event counter is updated when the other one is *if*and*only*if* the - * array is not degraded. As bits are not cleared when the array is degraded, - * this represents the last time that any bits were cleared. - * If a device is being added that has an event count with this value or - * higher, it is accepted as conforming to the bitmap. - * (3)This is the number of sectors represented by the bitmap, and is the range that - * resync happens across. For raid1 and raid5/6 it is the size of individual - * devices. For raid10 it is the size of the array. - */ - -#ifdef __KERNEL__ - -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * count of dirty bits on the page - */ - unsigned int count:31; -}; - -/* keep track of bitmap file pages that have pending writes on them */ -struct page_list { - struct list_head list; - struct page *page; -}; - -/* the main bitmap structure - one per mddev */ -struct bitmap { - struct bitmap_page *bp; - unsigned long pages; /* total number of pages in the bitmap */ - unsigned long missing_pages; /* number of pages not yet allocated */ - - mddev_t *mddev; /* the md device that the bitmap is for */ - - int counter_bits; /* how many bits per block counter */ - - /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunksize; - unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ - unsigned long chunks; /* total number of data chunks for the array */ - - /* We hold a count on the chunk currently being synced, and drop - * it when the last block is started. If the resync is aborted - * midway, we need to be able to drop that count, so we remember - * the counted chunk.. - */ - unsigned long syncchunk; - - __u64 events_cleared; - int need_sync; - - /* bitmap spinlock */ - spinlock_t lock; - - long offset; /* offset from superblock if file is NULL */ - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap file superblock */ - struct page **filemap; /* list of cache pages for the file */ - unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file */ - int last_page_size; /* bytes in the last page */ - - unsigned long flags; - - int allclean; - - unsigned long max_write_behind; /* write-behind mode */ - atomic_t behind_writes; - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long daemon_sleep; /* how many seconds between updates? */ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - -}; - -/* the bitmap API */ - -/* these are used only by md/bitmap */ -int bitmap_create(mddev_t *mddev); -void bitmap_flush(mddev_t *mddev); -void bitmap_destroy(mddev_t *mddev); - -void bitmap_print_sb(struct bitmap *bitmap); -void bitmap_update_sb(struct bitmap *bitmap); - -int bitmap_setallbits(struct bitmap *bitmap); -void bitmap_write_all(struct bitmap *bitmap); - -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); -void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); - -void bitmap_unplug(struct bitmap *bitmap); -void bitmap_daemon_work(struct bitmap *bitmap); -#endif - -#endif diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h deleted file mode 100644 index f38b9c586af..00000000000 --- a/include/linux/raid/linear.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -#include - -struct dev_info { - mdk_rdev_t *rdev; - sector_t num_sectors; - sector_t start_sector; -}; - -typedef struct dev_info dev_info_t; - -struct linear_private_data -{ - struct linear_private_data *prev; /* earlier version */ - dev_info_t **hash_table; - sector_t spacing; - sector_t array_sectors; - int sector_shift; /* shift before dividing - * by spacing - */ - dev_info_t disks[0]; -}; - - -typedef struct linear_private_data linear_conf_t; - -#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/multipath.h b/include/linux/raid/multipath.h deleted file mode 100644 index 6f53fc177a4..00000000000 --- a/include/linux/raid/multipath.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -#include - -struct multipath_info { - mdk_rdev_t *rdev; -}; - -struct multipath_private_data { - mddev_t *mddev; - struct multipath_info *multipaths; - int raid_disks; - int working_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t *pool; -}; - -typedef struct multipath_private_data multipath_conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. - * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - mddev_t *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif diff --git a/include/linux/raid/raid0.h b/include/linux/raid/raid0.h deleted file mode 100644 index fd42aa87c39..00000000000 --- a/include/linux/raid/raid0.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -#include - -struct strip_zone -{ - sector_t zone_start; /* Zone offset in md_dev (in sectors) */ - sector_t dev_start; /* Zone offset in real dev (in sectors) */ - sector_t sectors; /* Zone size in sectors */ - int nb_dev; /* # of devices attached to the zone */ - mdk_rdev_t **dev; /* Devices attached to the zone */ -}; - -struct raid0_private_data -{ - struct strip_zone **hash_table; /* Table of indexes into strip_zone */ - struct strip_zone *strip_zone; - mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ - int nr_strip_zones; - - sector_t spacing; - int sector_shift; /* shift this before divide by spacing */ -}; - -typedef struct raid0_private_data raid0_conf_t; - -#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) - -#endif diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h deleted file mode 100644 index 0a9ba7c3302..00000000000 --- a/include/linux/raid/raid1.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -/* - * memory pools need a pointer to the mddev, so they can force an unplug - * when memory is tight, and a count of the number of drives that the - * pool was allocated for, so they know how much to allocate and free. - * mddev->raid_disks cannot be used, as it can change while a pool is active - * These two datums are stored in a kmalloced struct. - */ - -struct pool_info { - mddev_t *mddev; - int raid_disks; -}; - - -typedef struct r1bio_s r1bio_t; - -struct r1_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - int last_used; - sector_t next_seq_sect; - spinlock_t device_lock; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - /* queue of writes that have been unplugged */ - struct bio_list flushing_bio_list; - - /* for use when syncing mirrors: */ - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - struct pool_info *poolinfo; - - struct page *tmppage; - - mempool_t *r1bio_pool; - mempool_t *r1buf_pool; -}; - -typedef struct r1_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID1 bio. - * - * it contains information about what kind of IO operations were started - * for this RAID1 operation, and about their status: - */ - -struct r1bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - atomic_t behind_remaining; /* number of write-behind ios remaining - * in this BehindIO request - */ - sector_t sector; - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_disk; - - struct list_head retry_list; - struct bitmap_update *bitmap_update; - /* - * if the IO is in WRITE direction, then multiple bios are used. - * We choose the number when they are allocated. - */ - struct bio *bios[0]; - /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r1bio.state */ -#define R1BIO_Uptodate 0 -#define R1BIO_IsSync 1 -#define R1BIO_Degraded 2 -#define R1BIO_BehindIO 3 -#define R1BIO_Barrier 4 -#define R1BIO_BarrierRetry 5 -/* For write-behind requests, we call bi_end_io when - * the last non-write-behind device completes, providing - * any write was successful. Otherwise we call when - * any write-behind write succeeds, otherwise we call - * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... - */ -#define R1BIO_Returned 6 - -#endif diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h deleted file mode 100644 index e9091cfeb28..00000000000 --- a/include/linux/raid/raid10.h +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef _RAID10_H -#define _RAID10_H - -#include - -typedef struct mirror_info mirror_info_t; - -struct mirror_info { - mdk_rdev_t *rdev; - sector_t head_position; -}; - -typedef struct r10bio_s r10bio_t; - -struct r10_private_data_s { - mddev_t *mddev; - mirror_info_t *mirrors; - int raid_disks; - spinlock_t device_lock; - - /* geometry */ - int near_copies; /* number of copies layed out raid0 style */ - int far_copies; /* number of copies layed out - * at large strides across drives - */ - int far_offset; /* far_copies are offset by 1 stripe - * instead of many - */ - int copies; /* near_copies * far_copies. - * must be <= raid_disks - */ - sector_t stride; /* distance between far copies. - * This is size / far_copies unless - * far_offset, in which case it is - * 1 stripe. - */ - - int chunk_shift; /* shift from chunks to sectors */ - sector_t chunk_mask; - - struct list_head retry_list; - /* queue pending writes and submit them on unplug */ - struct bio_list pending_bio_list; - - - spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; - sector_t next_resync; - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - wait_queue_head_t wait_barrier; - - mempool_t *r10bio_pool; - mempool_t *r10buf_pool; - struct page *tmppage; -}; - -typedef struct r10_private_data_s conf_t; - -/* - * this is the only point in the RAID code where we violate - * C type safety. mddev->private is an 'opaque' pointer. - */ -#define mddev_to_conf(mddev) ((conf_t *) mddev->private) - -/* - * this is our 'private' RAID10 bio. - * - * it contains information about what kind of IO operations were started - * for this RAID10 operation, and about their status: - */ - -struct r10bio_s { - atomic_t remaining; /* 'have we finished' count, - * used from IRQ handlers - */ - sector_t sector; /* virtual sector number */ - int sectors; - unsigned long state; - mddev_t *mddev; - /* - * original bio going to /dev/mdx - */ - struct bio *master_bio; - /* - * if the IO is in READ direction, then this is where we read - */ - int read_slot; - - struct list_head retry_list; - /* - * if the IO is in WRITE direction, then multiple bios are used, - * one for each copy. - * When resyncing we also use one for each copy. - * When reconstructing, we use 2 bios, one for read, one for write. - * We choose the number when they are allocated. - */ - struct { - struct bio *bio; - sector_t addr; - int devnum; - } devs[0]; -}; - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio*)1) - -/* bits for r10bio.state */ -#define R10BIO_Uptodate 0 -#define R10BIO_IsSync 1 -#define R10BIO_IsRecover 2 -#define R10BIO_Degraded 3 -#endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h deleted file mode 100644 index 3b267279245..00000000000 --- a/include/linux/raid/raid5.h +++ /dev/null @@ -1,402 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#include -#include - -/* - * - * Each stripe contains one buffer per disc. Each buffer can be in - * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under a per-stripe - * spinlock. Some very specific changes can happen in bi_end_io, and - * these are not protected by the spin lock. - * - * The flag bits that are used to represent these states are: - * R5_UPTODATE and R5_LOCKED - * - * State Empty == !UPTODATE, !LOCK - * We have no data, and there is no active request - * State Want == !UPTODATE, LOCK - * A read request is being submitted for this block - * State Dirty == UPTODATE, LOCK - * Some new data is in this buffer, and it is being written out - * State Clean == UPTODATE, !LOCK - * We have valid data which is the same as on disc - * - * The possible state transitions are: - * - * Empty -> Want - on read or write to get old data for parity calc - * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) - * Empty -> Clean - on compute_block when computing a block for failed drive - * Want -> Empty - on failed read - * Want -> Clean - on successful completion of read request - * Dirty -> Clean - on successful completion of write request - * Dirty -> Clean - on failed write - * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) - * - * The Want->Empty, Want->Clean, Dirty->Clean, transitions - * all happen in b_end_io at interrupt time. - * Each sets the Uptodate bit before releasing the Lock bit. - * This leaves one multi-stage transition: - * Want->Dirty->Clean - * This is safe because thinking that a Clean buffer is actually dirty - * will at worst delay some action, and the stripe will be scheduled - * for attention after the transition is complete. - * - * There is one possibility that is not covered by these states. That - * is if one drive has failed and there is a spare being rebuilt. We - * can't distinguish between a clean block that has been generated - * from parity calculations, and a clean block that has been - * successfully written to the spare ( or to parity when resyncing). - * To distingush these states we have a stripe bit STRIPE_INSYNC that - * is set whenever a write is scheduled to the spare, or to the parity - * disc if there is no spare. A sync request clears this bit, and - * when we find it set with no buffers locked, we know the sync is - * complete. - * - * Buffers for the md device that arrive via make_request are attached - * to the appropriate stripe in one of two lists linked on b_reqnext. - * One list (bh_read) for read requests, one (bh_write) for write. - * There should never be more than one buffer on the two lists - * together, but we are not guaranteed of that so we allow for more. - * - * If a buffer is on the read list when the associated cache buffer is - * Uptodate, the data is copied into the read buffer and it's b_end_io - * routine is called. This may happen in the end_request routine only - * if the buffer has just successfully been read. end_request should - * remove the buffers from the list and then set the Uptodate bit on - * the buffer. Other threads may do this only if they first check - * that the Uptodate bit is set. Once they have checked that they may - * take buffers off the read queue. - * - * When a buffer on the write list is committed for write it is copied - * into the cache buffer, which is then marked dirty, and moved onto a - * third list, the written list (bh_written). Once both the parity - * block and the cached buffer are successfully written, any buffer on - * a written list can be returned with b_end_io. - * - * The write list and read list both act as fifos. The read list is - * protected by the device_lock. The write and written lists are - * protected by the stripe lock. The device_lock, which can be - * claimed while the stipe lock is held, is only for list - * manipulations and will only be held for a very short time. It can - * be claimed from interrupts. - * - * - * Stripes in the stripe cache can be on one of two lists (or on - * neither). The "inactive_list" contains stripes which are not - * currently being used for any request. They can freely be reused - * for another stripe. The "handle_list" contains stripes that need - * to be handled in some way. Both of these are fifo queues. Each - * stripe is also (potentially) linked to a hash bucket in the hash - * table so that it can be found by sector number. Stripes that are - * not hashed must be on the inactive_list, and will normally be at - * the front. All stripes start life this way. - * - * The inactive_list, handle_list and hash bucket lists are all protected by the - * device_lock. - * - stripes on the inactive_list never have their stripe_lock held. - * - stripes have a reference counter. If count==0, they are on a list. - * - If a stripe might need handling, STRIPE_HANDLE is set. - * - When refcount reaches zero, then if STRIPE_HANDLE it is put on - * handle_list else inactive_list - * - * This, combined with the fact that STRIPE_HANDLE is only ever - * cleared while a stripe has a non-zero count means that if the - * refcount is 0 and STRIPE_HANDLE is set, then it is on the - * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then - * the stripe is on inactive_list. - * - * The possible transitions are: - * activate an unhashed/inactive stripe (get_active_stripe()) - * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev - * activate a hashed, possibly active stripe (get_active_stripe()) - * lockdev check-hash if(!cnt++)unlink-stripe unlockdev - * attach a request to an active stripe (add_stripe_bh()) - * lockdev attach-buffer unlockdev - * handle a stripe (handle_stripe()) - * lockstripe clrSTRIPE_HANDLE ... - * (lockdev check-buffers unlockdev) .. - * change-state .. - * record io/ops needed unlockstripe schedule io/ops - * release an active stripe (release_stripe()) - * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev - * - * The refcount counts each thread that have activated the stripe, - * plus raid5d if it is handling it, plus one for each active request - * on a cached buffer, and plus one if the stripe is undergoing stripe - * operations. - * - * Stripe operations are performed outside the stripe lock, - * the stripe operations are: - * -copying data between the stripe cache and user application buffers - * -computing blocks to save a disk access, or to recover a missing block - * -updating the parity on a write operation (reconstruct write and - * read-modify-write) - * -checking parity correctness - * -running i/o to disk - * These operations are carried out by raid5_run_ops which uses the async_tx - * api to (optionally) offload operations to dedicated hardware engines. - * When requesting an operation handle_stripe sets the pending bit for the - * operation and increments the count. raid5_run_ops is then run whenever - * the count is non-zero. - * There are some critical dependencies between the operations that prevent some - * from being requested while another is in flight. - * 1/ Parity check operations destroy the in cache version of the parity block, - * so we prevent parity dependent operations like writes and compute_blocks - * from starting while a check is in progress. Some dma engines can perform - * the check without damaging the parity block, in these cases the parity - * block is re-marked up to date (assuming the check was successful) and is - * not re-read from disk. - * 2/ When a write operation is requested we immediately lock the affected - * blocks, and mark them as not up to date. This causes new read requests - * to be held off, as well as parity checks and compute block operations. - * 3/ Once a compute block operation has been requested handle_stripe treats - * that block as if it is up to date. raid5_run_ops guaruntees that any - * operation that is dependent on the compute block result is initiated after - * the compute block completes. - */ - -/* - * Operations state - intermediate states that are visible outside of sh->lock - * In general _idle indicates nothing is running, _run indicates a data - * processing operation is active, and _result means the data processing result - * is stable and can be acted upon. For simple operations like biofill and - * compute that only have an _idle and _run state they are indicated with - * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) - */ -/** - * enum check_states - handles syncing / repairing a stripe - * @check_state_idle - check operations are quiesced - * @check_state_run - check operation is running - * @check_state_result - set outside lock when check result is valid - * @check_state_compute_run - check failed and we are repairing - * @check_state_compute_result - set outside lock when compute result is valid - */ -enum check_states { - check_state_idle = 0, - check_state_run, /* parity check */ - check_state_check_result, - check_state_compute_run, /* parity repair */ - check_state_compute_result, -}; - -/** - * enum reconstruct_states - handles writing or expanding a stripe - */ -enum reconstruct_states { - reconstruct_state_idle = 0, - reconstruct_state_prexor_drain_run, /* prexor-write */ - reconstruct_state_drain_run, /* write */ - reconstruct_state_run, /* expand */ - reconstruct_state_prexor_drain_result, - reconstruct_state_drain_result, - reconstruct_state_result, -}; - -struct stripe_head { - struct hlist_node hash; - struct list_head lru; /* inactive_list or handle_list */ - struct raid5_private_data *raid_conf; - sector_t sector; /* sector of this row */ - int pd_idx; /* parity disk index */ - unsigned long state; /* state flags */ - atomic_t count; /* nr of active thread/requests */ - spinlock_t lock; - int bm_seq; /* sequence number for bitmap flushes */ - int disks; /* disks in stripe */ - enum check_states check_state; - enum reconstruct_states reconstruct_state; - /* stripe_operations - * @target - STRIPE_OP_COMPUTE_BLK target - */ - struct stripe_operations { - int target; - u32 zero_sum_result; - } ops; - struct r5dev { - struct bio req; - struct bio_vec vec; - struct page *page; - struct bio *toread, *read, *towrite, *written; - sector_t sector; /* sector of this page */ - unsigned long flags; - } dev[1]; /* allocated with extra space depending of RAID geometry */ -}; - -/* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. It is only valid under spin_lock(sh->lock); - */ -struct stripe_head_state { - int syncing, expanding, expanded; - int locked, uptodate, to_read, to_write, failed, written; - int to_fill, compute, req_compute, non_overwrite; - int failed_num; - unsigned long ops_request; -}; - -/* r6_state - extra state data only relevant to r6 */ -struct r6_state { - int p_failed, q_failed, qd_idx, failed_num[2]; -}; - -/* Flags */ -#define R5_UPTODATE 0 /* page contains current data */ -#define R5_LOCKED 1 /* IO has been submitted on "req" */ -#define R5_OVERWRITE 2 /* towrite covers whole page */ -/* and some that are internal to handle_stripe */ -#define R5_Insync 3 /* rdev && rdev->in_sync at start */ -#define R5_Wantread 4 /* want to schedule a read */ -#define R5_Wantwrite 5 -#define R5_Overlap 7 /* There is a pending overlapping request on this block */ -#define R5_ReadError 8 /* seen a read error here recently */ -#define R5_ReWrite 9 /* have tried to over-write the readerror */ - -#define R5_Expanded 10 /* This block now has post-expand data */ -#define R5_Wantcompute 11 /* compute_block in progress treat as - * uptodate - */ -#define R5_Wantfill 12 /* dev->toread contains a bio that needs - * filling - */ -#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ -/* - * Write method - */ -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 -/* not a write method, but a compute_parity mode */ -#define CHECK_PARITY 3 - -/* - * Stripe state - */ -#define STRIPE_HANDLE 2 -#define STRIPE_SYNCING 3 -#define STRIPE_INSYNC 4 -#define STRIPE_PREREAD_ACTIVE 5 -#define STRIPE_DELAYED 6 -#define STRIPE_DEGRADED 7 -#define STRIPE_BIT_DELAY 8 -#define STRIPE_EXPANDING 9 -#define STRIPE_EXPAND_SOURCE 10 -#define STRIPE_EXPAND_READY 11 -#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ -#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ -#define STRIPE_BIOFILL_RUN 14 -#define STRIPE_COMPUTE_RUN 15 -/* - * Operation request flags - */ -#define STRIPE_OP_BIOFILL 0 -#define STRIPE_OP_COMPUTE_BLK 1 -#define STRIPE_OP_PREXOR 2 -#define STRIPE_OP_BIODRAIN 3 -#define STRIPE_OP_POSTXOR 4 -#define STRIPE_OP_CHECK 5 - -/* - * Plugging: - * - * To improve write throughput, we need to delay the handling of some - * stripes until there has been a chance that several write requests - * for the one stripe have all been collected. - * In particular, any write request that would require pre-reading - * is put on a "delayed" queue until there are no stripes currently - * in a pre-read phase. Further, if the "delayed" queue is empty when - * a stripe is put on it then we "plug" the queue and do not process it - * until an unplug call is made. (the unplug_io_fn() is called). - * - * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add - * it to the count of prereading stripes. - * When write is initiated, or the stripe refcnt == 0 (just in case) we - * clear the PREREAD_ACTIVE flag and decrement the count - * Whenever the 'handle' queue is empty and the device is not plugged, we - * move any strips from delayed to handle and clear the DELAYED flag and set - * PREREAD_ACTIVE. - * In stripe_handle, if we find pre-reading is necessary, we do it if - * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leave nothing locked. - */ - - -struct disk_info { - mdk_rdev_t *rdev; -}; - -struct raid5_private_data { - struct hlist_head *stripe_hashtbl; - mddev_t *mddev; - struct disk_info *spare; - int chunk_size, level, algorithm; - int max_degraded; - int raid_disks; - int max_nr_stripes; - - /* used during an expand */ - sector_t expand_progress; /* MaxSector when no expand happening */ - sector_t expand_lo; /* from here up to expand_progress it out-of-bounds - * as we haven't flushed the metadata yet - */ - int previous_raid_disks; - - struct list_head handle_list; /* stripes needing handling */ - struct list_head hold_list; /* preread ready stripes */ - struct list_head delayed_list; /* stripes that have plugged requests */ - struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ - struct bio *retry_read_aligned; /* currently retrying aligned bios */ - struct bio *retry_read_aligned_list; /* aligned bios retry list */ - atomic_t preread_active_stripes; /* stripes with scheduled io */ - atomic_t active_aligned_reads; - atomic_t pending_full_writes; /* full write backlog */ - int bypass_count; /* bypassed prereads */ - int bypass_threshold; /* preread nice */ - struct list_head *last_hold; /* detect hold_list promotions */ - - atomic_t reshape_stripes; /* stripes with pending writes for reshape */ - /* unfortunately we need two cache names as we temporarily have - * two caches. - */ - int active_name; - char cache_name[2][20]; - struct kmem_cache *slab_cache; /* for allocating stripes */ - - int seq_flush, seq_write; - int quiesce; - - int fullsync; /* set to 1 if a full sync is needed, - * (fresh device added). - * Cleared when a sync completes. - */ - - struct page *spare_page; /* Used when checking P/Q in raid6 */ - - /* - * Free stripes pool - */ - atomic_t active_stripes; - struct list_head inactive_list; - wait_queue_head_t wait_for_stripe; - wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ - int pool_size; /* number of disks in stripeheads in pool */ - spinlock_t device_lock; - struct disk_info *disks; -}; - -typedef struct raid5_private_data raid5_conf_t; - -#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif -- cgit v1.2.3-70-g09d2 From 8b2b5c217c20b5460218ab8731295f2e46c7dd29 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:27:03 +1100 Subject: md: move LEVEL_* definition from md_k.h to md_u.h .. as they are part of the user-space interface. Also move MdpMinorShift into there so we can remove duplication. Lastly move mdp_major in. It is less obviously part of the user-space interface, but do_mounts_md.c uses it, and it is acting a bit like user-space. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 --- include/linux/raid/md.h | 2 -- include/linux/raid/md_k.h | 10 ---------- include/linux/raid/md_u.h | 17 +++++++++++++++++ init/do_mounts_md.c | 2 -- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9a3214c8585..96336b050b5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -46,9 +46,6 @@ #include #include "bitmap.h" -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 - #define DEBUG 0 #define dprintk(x...) ((void)(DEBUG && printk(x))) diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 82bea14cae1..8bfaf6b1d30 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -52,8 +52,6 @@ */ #define MD_PATCHLEVEL_VERSION 3 -extern int mdp_major; - extern int register_md_personality(struct mdk_personality *p); extern int unregister_md_personality(struct mdk_personality *p); extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4aedb9fe2bd..758ec2842d9 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -20,16 +20,6 @@ #ifdef CONFIG_BLOCK -#define LEVEL_MULTIPATH (-4) -#define LEVEL_LINEAR (-1) -#define LEVEL_FAULTY (-5) - -/* we need a value for 'no level specified' and 0 - * means 'raid0', so we need something else. This is - * for internal use only - */ -#define LEVEL_NONE (-1000000) - #define MaxSector (~(sector_t)0) typedef struct mddev_s mddev_t; diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 7192035fc4b..2f824aa889f 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -46,6 +46,12 @@ #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +/* 63 partitions with the alternate major number (mdp) */ +#define MdpMinorShift 6 +#ifdef __KERNEL__ +extern int mdp_major; +#endif + typedef struct mdu_version_s { int major; int minor; @@ -85,6 +91,17 @@ typedef struct mdu_array_info_s { } mdu_array_info_t; +/* non-obvious values for 'level' */ +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* we need a value for 'no level specified' and 0 + * means 'raid0', so we need something else. This is + * for internal use only + */ +#define LEVEL_NONE (-1000000) + typedef struct mdu_disk_info_s { /* * configuration/status of one particular disk diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 9bdddbcb3d6..23a15fb57e1 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -112,8 +112,6 @@ static int __init md_setup(char *str) return 1; } -#define MdpMinorShift 6 - static void __init md_setup_drive(void) { int minor, i, ent, partitioned; -- cgit v1.2.3-70-g09d2 From 92022950c6b1bb3da90b2976b20271cdfd98b8a3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move most content from md.h to md_k.h The extern function definitions are kernel-internal definitions, so they belong in md_k.h The MD_*_VERSION values could reasonably go in a number of places, but md_u.h seems most reasonable. This leaves almost nothing in md.h. It will go soon. Signed-off-by: NeilBrown --- include/linux/raid/md.h | 40 ---------------------------------------- include/linux/raid/md_k.h | 22 ++++++++++++++++++++++ include/linux/raid/md_u.h | 18 ++++++++++++++++++ 3 files changed, 40 insertions(+), 40 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 8bfaf6b1d30..71c4fd19c31 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -34,46 +34,6 @@ #ifdef CONFIG_MD -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 90 -/* - * MD_PATCHLEVEL_VERSION indicates kernel functionality. - * >=1 means different superblock formats are selectable using SET_ARRAY_INFO - * and major_version/minor_version accordingly - * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT - * in the super status byte - * >=3 means that bitmap superblock version 4 is supported, which uses - * little-ending representation rather than host-endian - */ -#define MD_PATCHLEVEL_VERSION 3 - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t *thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); - #endif /* CONFIG_MD */ #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 758ec2842d9..4c5e2d00ff5 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -390,3 +390,25 @@ static inline void safe_put_page(struct page *p) #endif /* CONFIG_BLOCK */ #endif + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/include/linux/raid/md_u.h b/include/linux/raid/md_u.h index 2f824aa889f..fb1abb3367e 100644 --- a/include/linux/raid/md_u.h +++ b/include/linux/raid/md_u.h @@ -15,6 +15,24 @@ #ifndef _MD_U_H #define _MD_U_H +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. + */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +/* + * MD_PATCHLEVEL_VERSION indicates kernel functionality. + * >=1 means different superblock formats are selectable using SET_ARRAY_INFO + * and major_version/minor_version accordingly + * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT + * in the super status byte + * >=3 means that bitmap superblock version 4 is supported, which uses + * little-ending representation rather than host-endian + */ +#define MD_PATCHLEVEL_VERSION 3 + /* ioctls */ /* status */ -- cgit v1.2.3-70-g09d2 From bff61975b3d6c18ee31457cc5b4d73042f44915f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move lots of #include lines out of .h files and into .c This makes the includes more explicit, and is preparation for moving md_k.h to drivers/md/md.h Remove include/raid/md.h as its only remaining use was to #include other files. Signed-off-by: NeilBrown --- crypto/xor.c | 2 +- drivers/md/bitmap.c | 3 ++- drivers/md/faulty.c | 5 ++++- drivers/md/linear.c | 4 ++++ drivers/md/linear.h | 2 -- drivers/md/md.c | 6 +++++- drivers/md/multipath.c | 4 ++++ drivers/md/multipath.h | 2 -- drivers/md/raid0.c | 3 +++ drivers/md/raid0.h | 2 -- drivers/md/raid1.c | 5 ++++- drivers/md/raid1.h | 2 -- drivers/md/raid10.c | 5 ++++- drivers/md/raid10.h | 2 -- drivers/md/raid5.c | 8 ++++++-- drivers/md/raid5.h | 1 - drivers/md/raid6.h | 6 +----- fs/compat_ioctl.c | 2 +- include/linux/raid/md.h | 39 --------------------------------------- include/linux/raid/md_k.h | 3 --- include/linux/raid/xor.h | 2 -- init/do_mounts.h | 1 + init/do_mounts_md.c | 3 ++- 23 files changed, 42 insertions(+), 70 deletions(-) delete mode 100644 include/linux/raid/md.h (limited to 'include/linux/raid') diff --git a/crypto/xor.c b/crypto/xor.c index b2e6db075e4..996b6ee57d9 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -18,8 +18,8 @@ #define BH_TRACE 0 #include -#include #include +#include #include /* The xor routines to use. */ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7666117738c..1df012e9d73 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -16,6 +16,7 @@ * wait if count gets too high, wake when it drops to half. */ +#include #include #include #include @@ -26,7 +27,7 @@ #include #include #include -#include +#include #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 86d9adf90e7..cc5d2cf08df 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -62,7 +62,10 @@ #define ModeShift 5 #define MaxFault 50 -#include +#include +#include +#include +#include static void faulty_fail(struct bio *bio, int error) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 3603ffa9edc..c43c3b60ef0 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,6 +16,10 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include +#include #include "linear.h" /* diff --git a/drivers/md/linear.h b/drivers/md/linear.h index f38b9c586af..bf8179587f9 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -1,8 +1,6 @@ #ifndef _LINEAR_H #define _LINEAR_H -#include - struct dev_info { mdk_rdev_t *rdev; sector_t num_sectors; diff --git a/drivers/md/md.c b/drivers/md/md.c index 96336b050b5..11d6e0e1045 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,8 +33,9 @@ */ #include -#include +#include #include +#include #include /* for invalidate_bdev */ #include #include @@ -44,6 +45,9 @@ #include #include #include +#include +#include +#include #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 547df09a7af..148b3cd058b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,6 +19,10 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include +#include #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6f53fc177a4..6fa70b400cd 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h @@ -1,8 +1,6 @@ #ifndef _MULTIPATH_H #define _MULTIPATH_H -#include - struct multipath_info { mdk_rdev_t *rdev; }; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef09ed04864..64e4c77a156 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,6 +18,9 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include +#include #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index fd42aa87c39..824b12eb1d4 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -1,8 +1,6 @@ #ifndef _RAID0_H #define _RAID0_H -#include - struct strip_zone { sector_t zone_start; /* Zone offset in md_dev (in sectors) */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bff32285f8b..253b09c86ec 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -31,8 +31,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0a9ba7c3302..1620eea3d57 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,8 +1,6 @@ #ifndef _RAID1_H #define _RAID1_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f03dd70d12a..186e1b199d4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -18,8 +18,11 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" #include +#include +#include +#include +#include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index e9091cfeb28..244dbe507a5 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -1,8 +1,6 @@ #ifndef _RAID10_H #define _RAID10_H -#include - typedef struct mirror_info mirror_info_t; struct mirror_info { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f75698b1f63..816157e7d8e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -43,8 +43,12 @@ * miss any bits. */ +#include +#include #include #include +#include +#include "raid5.h" #include "raid6.h" #include "bitmap.h" @@ -1467,7 +1471,7 @@ static void copy_data(int frombio, struct bio *bio, static void compute_parity6(struct stripe_head *sh, int method) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ @@ -2795,7 +2799,7 @@ static bool handle_stripe5(struct stripe_head *sh) static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { - raid6_conf_t *conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks; struct bio *return_bi = NULL; int i, pd_idx = sh->pd_idx; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 40f1d0335c7..0ed22dff56e 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -1,7 +1,6 @@ #ifndef _RAID5_H #define _RAID5_H -#include #include /* diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h index f6c13af6500..66e6b0c6734 100644 --- a/drivers/md/raid6.h +++ b/drivers/md/raid6.h @@ -17,11 +17,7 @@ /* Set to 1 to use kernel-wide empty_zero_page */ #define RAID6_USE_EMPTY_ZERO_PAGE 0 - -#include -#include "raid5.h" - -typedef raid5_conf_t raid6_conf_t; /* Same configuration */ +#include /* Additional compute_parity mode -- updates the parity w/o LOCKING */ #define UPDATE_PARITY 4 diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 45e59d3c7f1..141c0382915 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h deleted file mode 100644 index 71c4fd19c31..00000000000 --- a/include/linux/raid/md.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include - -/* - * 'md_p.h' holds the 'physical' layout of RAID devices - * 'md_u.h' holds the user <=> kernel API - * - * 'md_k.h' holds kernel internal definitions - */ - -#include -#include -#include - -#ifdef CONFIG_MD - -#endif /* CONFIG_MD */ -#endif - diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4c5e2d00ff5..e78b3c1d55f 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -15,9 +15,6 @@ #ifndef _MD_K_H #define _MD_K_H -/* and dm-bio-list.h is not under include/linux because.... ??? */ -#include "../../../drivers/md/dm-bio-list.h" - #ifdef CONFIG_BLOCK #define MaxSector (~(sector_t)0) diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 3e120587ead..5a210959e3f 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -1,8 +1,6 @@ #ifndef _XOR_H #define _XOR_H -#include - #define MAX_XOR_BLOCKS 4 extern void xor_blocks(unsigned int count, unsigned int bytes, diff --git a/init/do_mounts.h b/init/do_mounts.h index 9aa968d5432..f5b978a9bb9 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 23a15fb57e1..69aebbf8fd2 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -1,5 +1,6 @@ #include -#include +#include +#include #include "do_mounts.h" -- cgit v1.2.3-70-g09d2 From 43b2e5d86d8bdd77386226db0bc961529492c043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 31 Mar 2009 14:33:13 +1100 Subject: md: move md_k.h from include/linux/raid/ to drivers/md/ It really is nicer to keep related code together.. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 2 +- drivers/md/faulty.c | 2 +- drivers/md/linear.c | 2 +- drivers/md/md.c | 2 +- drivers/md/md.h | 411 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/multipath.c | 2 +- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- include/linux/raid/md_k.h | 411 ---------------------------------------------- 11 files changed, 420 insertions(+), 420 deletions(-) create mode 100644 drivers/md/md.h delete mode 100644 include/linux/raid/md_k.h (limited to 'include/linux/raid') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 1df012e9d73..623292a5473 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include "md.h" #include "bitmap.h" /* debug macros */ diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index cc5d2cf08df..7b66b9fca29 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -64,7 +64,7 @@ #define MaxFault 50 #include #include -#include +#include "md.h" #include diff --git a/drivers/md/linear.c b/drivers/md/linear.c index c43c3b60ef0..f2488343ed4 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -18,8 +18,8 @@ #include #include -#include #include +#include "md.h" #include "linear.h" /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 11d6e0e1045..aad0ac54bf9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -45,9 +45,9 @@ #include #include #include -#include #include #include +#include "md.h" #include "bitmap.h" #define DEBUG 0 diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 00000000000..e78b3c1d55f --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,411 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#ifdef CONFIG_BLOCK + +#define MaxSector (~(sector_t)0) + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +/* + * options passed in raidrun: + */ + +/* Currently this must fit in an 'int' */ +#define MAX_CHUNK_SIZE (1<<30) + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct list_head same_set; /* RAID devices within the same set */ + + sector_t size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + struct block_device *bdev; /* block device handle */ + + struct page *sb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; +#define Faulty 1 /* device is known to have a fault */ +#define In_sync 2 /* device is in_sync with rest of array */ +#define WriteMostly 4 /* Avoid reading if at all possible */ +#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ +#define AllReserved 6 /* If whole device is reserved for + * one array */ +#define AutoDetected 7 /* added by auto-detect */ +#define Blocked 8 /* An error occured on an externally + * managed array, don't allow writes + * until it is cleared */ +#define StateChanged 9 /* Faulty or Blocked has changed during + * interrupt, so it needs to be + * notified by the thread */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int raid_disk; /* role of device in array */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. + */ + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct sysfs_dirent *sysfs_state; /* handle for 'state' + * sysfs entry */ +}; + +struct mddev_s +{ + void *private; + struct mdk_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; +#define MD_CHANGE_DEVS 0 /* Some device status has changed */ +#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ +#define MD_CHANGE_PENDING 2 /* superblock update in progress */ + + int ro; + + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_size; + time_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t size; /* used size of component devices */ + sector_t array_sectors; /* exported array size */ + __u64 events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + + struct mdk_thread_s *thread; /* management thread */ + struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ + sector_t curr_resync; /* last block scheduled */ + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + sector_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + /* recovery/resync flags + * NEEDED: we might need to start a resync/recover + * RUNNING: a thread is running, or about to be started + * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. + * INTR: resync needs to be aborted for some reason + * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. + */ +#define MD_RECOVERY_RUNNING 0 +#define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 +#define MD_RECOVERY_INTR 3 +#define MD_RECOVERY_DONE 4 +#define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 + + unsigned long recovery; + int recovery_disabled; /* if we detect that recovery + * will always fail, set this + * so we don't loop trying */ + + int in_sync; /* know to not need resync */ + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* true if we might need to reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + int barriers_work; /* initialised to true, cleared as soon + * as a barrier request to slave + * fails. Only supported + */ + struct bio *biolist; /* bios that need to be retried + * because BIO_RW_BARRIER is not supported + */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct sysfs_dirent *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + spinlock_t write_lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + atomic_t writes_pending; + struct request_queue *queue; /* for plugging ... */ + + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct file *bitmap_file; /* the bitmap file */ + long bitmap_offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ + + struct list_head all_mddevs; +}; + + +static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +} + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + +struct mdk_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + int (*make_request)(struct request_queue *q, struct bio *bio); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + void (*status)(struct seq_file *seq, mddev_t *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); + int (*hot_remove_disk) (mddev_t *mddev, int number); + int (*spare_active) (mddev_t *mddev); + sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); + int (*resize) (mddev_t *mddev, sector_t sectors); + int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); + int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (mddev_t *mddev, int state); +}; + + +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + + +static inline char * mdname (mddev_t * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +typedef struct mdk_thread_s { + void (*run) (mddev_t *mddev); + mddev_t *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +#define __wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock, cmd) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock, cmd); \ +} while (0) + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +#endif /* CONFIG_BLOCK */ +#endif + + +extern int register_md_personality(struct mdk_personality *p); +extern int unregister_md_personality(struct mdk_personality *p); +extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), + mddev_t *mddev, const char *name); +extern void md_unregister_thread(mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_check_recovery(mddev_t *mddev); +extern void md_write_start(mddev_t *mddev, struct bio *bi); +extern void md_write_end(mddev_t *mddev); +extern void md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); + +extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(mddev_t *mddev); +extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, + struct page *page, int rw); +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); +extern int md_allow_write(mddev_t *mddev); +extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 148b3cd058b..0ed1005afb5 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -21,8 +21,8 @@ #include #include -#include #include +#include "md.h" #include "multipath.h" #define MAX_WORK_PER_DISK 128 diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 64e4c77a156..3d06df86da8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -19,8 +19,8 @@ */ #include -#include #include +#include "md.h" #include "raid0.h" static void raid0_unplug(struct request_queue *q) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 253b09c86ec..051ecfa6151 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -33,8 +33,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid1.h" #include "bitmap.h" diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 186e1b199d4..fea61e3dcd9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -20,8 +20,8 @@ #include #include -#include #include +#include "md.h" #include "dm-bio-list.h" #include "raid10.h" #include "bitmap.h" diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 816157e7d8e..849478e9afd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,10 +44,10 @@ */ #include -#include #include #include #include +#include "md.h" #include "raid5.h" #include "raid6.h" #include "bitmap.h" diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h deleted file mode 100644 index e78b3c1d55f..00000000000 --- a/include/linux/raid/md_k.h +++ /dev/null @@ -1,411 +0,0 @@ -/* - md_k.h : kernel internal structure of the Linux MD driver - Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_K_H -#define _MD_K_H - -#ifdef CONFIG_BLOCK - -#define MaxSector (~(sector_t)0) - -typedef struct mddev_s mddev_t; -typedef struct mdk_rdev_s mdk_rdev_t; - -/* - * options passed in raidrun: - */ - -/* Currently this must fit in an 'int' */ -#define MAX_CHUNK_SIZE (1<<30) - -/* - * MD's 'extended' device - */ -struct mdk_rdev_s -{ - struct list_head same_set; /* RAID devices within the same set */ - - sector_t size; /* Device size (in blocks) */ - mddev_t *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ - - struct block_device *bdev; /* block device handle */ - - struct page *sb_page; - int sb_loaded; - __u64 sb_events; - sector_t data_offset; /* start of data in array */ - sector_t sb_start; /* offset of the super block (in 512byte sectors) */ - int sb_size; /* bytes in the superblock */ - int preferred_minor; /* autorun support */ - - struct kobject kobj; - - /* A device can be in one of three states based on two flags: - * Not working: faulty==1 in_sync==0 - * Fully working: faulty==0 in_sync==1 - * Working, but not - * in sync with array - * faulty==0 in_sync==0 - * - * It can never have faulty==1, in_sync==1 - * This reduces the burden of testing multiple flags in many cases - */ - - unsigned long flags; -#define Faulty 1 /* device is known to have a fault */ -#define In_sync 2 /* device is in_sync with rest of array */ -#define WriteMostly 4 /* Avoid reading if at all possible */ -#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */ -#define AllReserved 6 /* If whole device is reserved for - * one array */ -#define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occured on an externally - * managed array, don't allow writes - * until it is cleared */ -#define StateChanged 9 /* Faulty or Blocked has changed during - * interrupt, so it needs to be - * notified by the thread */ - wait_queue_head_t blocked_wait; - - int desc_nr; /* descriptor index in the superblock */ - int raid_disk; /* role of device in array */ - int saved_raid_disk; /* role that device used to have in the - * array and could again if we did a partial - * resync from the bitmap - */ - sector_t recovery_offset;/* If this device has been partially - * recovered, this is where we were - * up to. - */ - - atomic_t nr_pending; /* number of pending requests. - * only maintained for arrays that - * support hot removal - */ - atomic_t read_errors; /* number of consecutive read errors that - * we have tried to ignore. - */ - atomic_t corrected_errors; /* number of corrected read errors, - * for reporting to userspace and storing - * in superblock. - */ - struct work_struct del_work; /* used for delayed sysfs removal */ - - struct sysfs_dirent *sysfs_state; /* handle for 'state' - * sysfs entry */ -}; - -struct mddev_s -{ - void *private; - struct mdk_personality *pers; - dev_t unit; - int md_minor; - struct list_head disks; - unsigned long flags; -#define MD_CHANGE_DEVS 0 /* Some device status has changed */ -#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* superblock update in progress */ - - int ro; - - struct gendisk *gendisk; - - struct kobject kobj; - int hold_active; -#define UNTIL_IOCTL 1 -#define UNTIL_STOP 2 - - /* Superblock information */ - int major_version, - minor_version, - patch_version; - int persistent; - int external; /* metadata is - * managed externally */ - char metadata_type[17]; /* externally set*/ - int chunk_size; - time_t ctime, utime; - int level, layout; - char clevel[16]; - int raid_disks; - int max_disks; - sector_t size; /* used size of component devices */ - sector_t array_sectors; /* exported array size */ - __u64 events; - - char uuid[16]; - - /* If the array is being reshaped, we need to record the - * new shape and an indication of where we are up to. - * This is written to the superblock. - * If reshape_position is MaxSector, then no reshape is happening (yet). - */ - sector_t reshape_position; - int delta_disks, new_level, new_layout, new_chunk; - - struct mdk_thread_s *thread; /* management thread */ - struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ - sector_t curr_resync; /* last block scheduled */ - unsigned long resync_mark; /* a recent timestamp */ - sector_t resync_mark_cnt;/* blocks written at resync_mark */ - sector_t curr_mark_cnt; /* blocks scheduled now */ - - sector_t resync_max_sectors; /* may be set by personality */ - - sector_t resync_mismatches; /* count of sectors where - * parity/replica mismatch found - */ - - /* allow user-space to request suspension of IO to regions of the array */ - sector_t suspend_lo; - sector_t suspend_hi; - /* if zero, use the system-wide default */ - int sync_speed_min; - int sync_speed_max; - - /* resync even though the same disks are shared among md-devices */ - int parallel_resync; - - int ok_start_degraded; - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started - * SYNC: actually doing a resync, not a recovery - * RECOVER: doing recovery, or need to try it. - * INTR: resync needs to be aborted for some reason - * DONE: thread is done and is waiting to be reaped - * REQUEST: user-space has requested a sync (used with SYNC) - * CHECK: user-space request for for check-only, no repair - * RESHAPE: A reshape is happening - * - * If neither SYNC or RESHAPE are set, then it is a recovery. - */ -#define MD_RECOVERY_RUNNING 0 -#define MD_RECOVERY_SYNC 1 -#define MD_RECOVERY_RECOVER 2 -#define MD_RECOVERY_INTR 3 -#define MD_RECOVERY_DONE 4 -#define MD_RECOVERY_NEEDED 5 -#define MD_RECOVERY_REQUESTED 6 -#define MD_RECOVERY_CHECK 7 -#define MD_RECOVERY_RESHAPE 8 -#define MD_RECOVERY_FROZEN 9 - - unsigned long recovery; - int recovery_disabled; /* if we detect that recovery - * will always fail, set this - * so we don't loop trying */ - - int in_sync; /* know to not need resync */ - struct mutex reconfig_mutex; - atomic_t active; /* general refcount */ - atomic_t openers; /* number of active opens */ - - int changed; /* true if we might need to reread partition info */ - int degraded; /* whether md should consider - * adding a spare - */ - int barriers_work; /* initialised to true, cleared as soon - * as a barrier request to slave - * fails. Only supported - */ - struct bio *biolist; /* bios that need to be retried - * because BIO_RW_BARRIER is not supported - */ - - atomic_t recovery_active; /* blocks scheduled, but not written */ - wait_queue_head_t recovery_wait; - sector_t recovery_cp; - sector_t resync_min; /* user requested sync - * starts here */ - sector_t resync_max; /* resync should pause - * when it gets here */ - - struct sysfs_dirent *sysfs_state; /* handle for 'array_state' - * file in sysfs. - */ - struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ - - struct work_struct del_work; /* used for delayed sysfs removal */ - - spinlock_t write_lock; - wait_queue_head_t sb_wait; /* for waiting on superblock updates */ - atomic_t pending_writes; /* number of active superblock writes */ - - unsigned int safemode; /* if set, update "clean" superblock - * when no writes pending. - */ - unsigned int safemode_delay; - struct timer_list safemode_timer; - atomic_t writes_pending; - struct request_queue *queue; /* for plugging ... */ - - atomic_t write_behind; /* outstanding async IO */ - unsigned int max_write_behind; /* 0 = sync */ - - struct bitmap *bitmap; /* the bitmap for the device */ - struct file *bitmap_file; /* the bitmap file */ - long bitmap_offset; /* offset from superblock of - * start of bitmap. May be - * negative, but not '0' - */ - long default_bitmap_offset; /* this is the offset to use when - * hot-adding a bitmap. It should - * eventually be settable by sysfs. - */ - - struct list_head all_mddevs; -}; - - -static inline void rdev_dec_pending(mdk_rdev_t *rdev, mddev_t *mddev) -{ - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); -} - -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); -} - -struct mdk_personality -{ - char *name; - int level; - struct list_head list; - struct module *owner; - int (*make_request)(struct request_queue *q, struct bio *bio); - int (*run)(mddev_t *mddev); - int (*stop)(mddev_t *mddev); - void (*status)(struct seq_file *seq, mddev_t *mddev); - /* error_handler must set ->faulty and clear ->in_sync - * if appropriate, and should abort recovery if needed - */ - void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); - int (*hot_remove_disk) (mddev_t *mddev, int number); - int (*spare_active) (mddev_t *mddev); - sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); - int (*resize) (mddev_t *mddev, sector_t sectors); - int (*check_reshape) (mddev_t *mddev); - int (*start_reshape) (mddev_t *mddev); - int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved - */ - void (*quiesce) (mddev_t *mddev, int state); -}; - - -struct md_sysfs_entry { - struct attribute attr; - ssize_t (*show)(mddev_t *, char *); - ssize_t (*store)(mddev_t *, const char *, size_t); -}; - - -static inline char * mdname (mddev_t * mddev) -{ - return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; -} - -/* - * iterates through some rdev ringlist. It's safe to remove the - * current 'rdev'. Dont touch 'tmp' though. - */ -#define rdev_for_each_list(rdev, tmp, head) \ - list_for_each_entry_safe(rdev, tmp, head, same_set) - -/* - * iterates through the 'same array disks' ringlist - */ -#define rdev_for_each(rdev, tmp, mddev) \ - list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) - -#define rdev_for_each_rcu(rdev, mddev) \ - list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) - -typedef struct mdk_thread_s { - void (*run) (mddev_t *mddev); - mddev_t *mddev; - wait_queue_head_t wqueue; - unsigned long flags; - struct task_struct *tsk; - unsigned long timeout; -} mdk_thread_t; - -#define THREAD_WAKEUP 0 - -#define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - if (condition) \ - break; \ - __wait_event_lock_irq(wq, condition, lock, cmd); \ -} while (0) - -static inline void safe_put_page(struct page *p) -{ - if (p) put_page(p); -} - -#endif /* CONFIG_BLOCK */ -#endif - - -extern int register_md_personality(struct mdk_personality *p); -extern int unregister_md_personality(struct mdk_personality *p); -extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev), - mddev_t *mddev, const char *name); -extern void md_unregister_thread(mdk_thread_t *thread); -extern void md_wakeup_thread(mdk_thread_t *thread); -extern void md_check_recovery(mddev_t *mddev); -extern void md_write_start(mddev_t *mddev, struct bio *bi); -extern void md_write_end(mddev_t *mddev); -extern void md_done_sync(mddev_t *mddev, int blocks, int ok); -extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); - -extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, - sector_t sector, int size, struct page *page); -extern void md_super_wait(mddev_t *mddev); -extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw); -extern void md_do_sync(mddev_t *mddev); -extern void md_new_event(mddev_t *mddev); -extern int md_allow_write(mddev_t *mddev); -extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); -- cgit v1.2.3-70-g09d2 From f701d589aa34d7531183c9ac6f7713ba14212b02 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 31 Mar 2009 15:09:39 +1100 Subject: md/raid6: move raid6 data processing to raid6_pq.ko Move the raid6 data processing routines into a standalone module (raid6_pq) to prepare them to be called from async_tx wrappers and other non-md drivers/modules. This precludes a circular dependency of raid456 needing the async modules for data processing while those modules in turn depend on raid456 for the base level synchronous raid6 routines. To support this move: 1/ The exportable definitions in raid6.h move to include/linux/raid/pq.h 2/ The raid6_call, recovery calls, and table symbols are exported 3/ Extra #ifdef __KERNEL__ statements to enable the userspace raid6test to compile Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/Kconfig | 4 ++ drivers/md/Makefile | 4 +- drivers/md/mktables.c | 14 ++++- drivers/md/raid5.c | 12 +--- drivers/md/raid5.h | 2 + drivers/md/raid6.h | 126 ---------------------------------------- drivers/md/raid6algos.c | 19 +++++- drivers/md/raid6altivec.uc | 2 +- drivers/md/raid6int.uc | 2 +- drivers/md/raid6mmx.c | 2 +- drivers/md/raid6recov.c | 11 ++-- drivers/md/raid6sse1.c | 2 +- drivers/md/raid6sse2.c | 2 +- drivers/md/raid6test/Makefile | 2 +- drivers/md/raid6test/test.c | 2 +- include/linux/raid/pq.h | 132 ++++++++++++++++++++++++++++++++++++++++++ 16 files changed, 185 insertions(+), 153 deletions(-) delete mode 100644 drivers/md/raid6.h create mode 100644 include/linux/raid/pq.h (limited to 'include/linux/raid') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2281b5098e9..449d0b9cac1 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -121,6 +121,7 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select MD_RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR ---help--- @@ -180,6 +181,9 @@ config MD_RAID5_RESHAPE If unsure, say Y. +config MD_RAID6_PQ + tristate + config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 3b118da575e..45cc5951d92 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -9,7 +9,8 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o raid6algos.o raid6recov.o raid6tables.o \ +raid456-y += raid5.o +raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ raid6altivec1.o raid6altivec2.o raid6altivec4.o \ @@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o +obj-$(CONFIG_MD_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c index b61d5767aae..3b1500843bb 100644 --- a/drivers/md/mktables.c +++ b/drivers/md/mktables.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) uint8_t v; uint8_t exptbl[256], invtbl[256]; - printf("#include \"raid6.h\"\n"); + printf("#include \n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -76,6 +76,9 @@ int main(int argc, char *argv[]) printf("\t},\n"); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); /* Compute power-of-2 table (exponent) */ v = 1; @@ -92,6 +95,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); /* Compute inverse table x^-1 == x^254 */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -104,6 +110,9 @@ int main(int argc, char *argv[]) } } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ printf("\nconst u8 __attribute__((aligned(256)))\n" @@ -115,6 +124,9 @@ int main(int argc, char *argv[]) (j == 7) ? '\n' : ' '); } printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); return 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e1ee181b79b..1f1b054ff0b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -45,11 +45,11 @@ #include #include +#include #include #include #include "md.h" #include "raid5.h" -#include "raid6.h" #include "bitmap.h" /* @@ -94,11 +94,6 @@ #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif - /* * We maintain a biased count of active stripes in the bottom 16 bits of * bi_phys_segments, and a count of processed stripes in the upper 16 bits @@ -5153,11 +5148,6 @@ static struct mdk_personality raid4_personality = static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index c172371481c..2934ee0a39c 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -269,6 +269,8 @@ struct r6_state { #define READ_MODIFY_WRITE 2 /* not a write method, but a compute_parity mode */ #define CHECK_PARITY 3 +/* Additional compute_parity mode -- updates the parity w/o LOCKING */ +#define UPDATE_PARITY 4 /* * Stripe state diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h deleted file mode 100644 index 8a9c823bab9..00000000000 --- a/drivers/md/raid6.h +++ /dev/null @@ -1,126 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2003 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Boston MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -#ifndef LINUX_RAID_RAID6_H -#define LINUX_RAID_RAID6_H - -#ifdef __KERNEL__ - -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 -#include - -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 - -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else -extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif - -#else /* ! __KERNEL__ */ -/* Used for testing in user space */ - -#include -#include -#include -#include -#include -#include - -/* Not standard, but glibc defines it */ -#define BITS_PER_LONG __WORDSIZE - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -#ifndef PAGE_SIZE -# define PAGE_SIZE 4096 -#endif -extern const char raid6_empty_zero_page[PAGE_SIZE]; - -#define __init -#define __exit -#define __attribute_const__ __attribute__((const)) -#define noinline __attribute__((noinline)) - -#define preempt_enable() -#define preempt_disable() -#define cpu_has_feature(x) 1 -#define enable_kernel_altivec() -#define disable_kernel_altivec() - -#endif /* __KERNEL__ */ - -/* Routine choices */ -struct raid6_calls { - void (*gen_syndrome)(int, size_t, void **); - int (*valid)(void); /* Returns 1 if this routine set is usable */ - const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ -}; - -/* Selected algorithm */ -extern struct raid6_calls raid6_call; - -/* Algorithm list */ -extern const struct raid6_calls * const raid6_algos[]; -int raid6_select_algo(void); - -/* Return values from chk_syndrome */ -#define RAID6_OK 0 -#define RAID6_P_BAD 1 -#define RAID6_Q_BAD 2 -#define RAID6_PQ_BAD 3 - -/* Galois field tables */ -extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); -extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); -extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); -extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); - -/* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); - -/* Some definitions to allow code to be compiled for testing in userspace */ -#ifndef __KERNEL__ - -# define jiffies raid6_jiffies() -# define printk printf -# define GFP_KERNEL 0 -# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) -# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) - -static inline void cpu_relax(void) -{ - /* Nothing */ -} - -#undef HZ -#define HZ 1000 -static inline uint32_t raid6_jiffies(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec*1000 + tv.tv_usec/1000; -} - -#endif /* ! __KERNEL__ */ - -#endif /* LINUX_RAID_RAID6_H */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 1f6a3c82ee0..866215ac7f2 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -16,13 +16,20 @@ * Algorithm list and algorithm selection for RAID-6 */ -#include "raid6.h" +#include #ifndef __KERNEL__ #include #include +#else +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif #endif struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); /* Various routine sets */ extern const struct raid6_calls raid6_intx1; @@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = { #else /* Need more time to be stable in userspace */ #define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) #endif /* Try to pick the best algorithm */ @@ -152,3 +160,12 @@ int __init raid6_select_algo(void) return best ? 0 : -EINVAL; } + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc index 217580667e0..699dfeee494 100644 --- a/drivers/md/raid6altivec.uc +++ b/drivers/md/raid6altivec.uc @@ -22,7 +22,7 @@ * bracked this with preempt_disable/enable or in a lock) */ -#include "raid6.h" +#include #ifdef CONFIG_ALTIVEC diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc index 32a0bac3eb3..f9bf9cba357 100644 --- a/drivers/md/raid6int.uc +++ b/drivers/md/raid6int.uc @@ -18,7 +18,7 @@ * This file is postprocessed using unroll.pl */ -#include "raid6.h" +#include /* * This is the C data type to use diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c index 804cb50ecc1..e7f6c13132b 100644 --- a/drivers/md/raid6mmx.c +++ b/drivers/md/raid6mmx.c @@ -18,7 +18,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Shared with raid6sse1.c */ diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c index 7a98b865258..2609f00e0d6 100644 --- a/drivers/md/raid6recov.c +++ b/drivers/md/raid6recov.c @@ -18,7 +18,7 @@ * the syndrome.) */ -#include "raid6.h" +#include /* Recover two failed data blocks. */ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, @@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, p++; q++; } } - - - +EXPORT_SYMBOL_GPL(raid6_2data_recov); /* Recover failure of one data block plus the P block */ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) @@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) q++; dq++; } } +EXPORT_SYMBOL_GPL(raid6_datap_recov); - -#ifndef __KERNEL__ /* Testing only */ +#ifndef __KERNEL__ +/* Testing only */ /* Recover two failed blocks. */ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c index 15c58890522..b274dd5eab8 100644 --- a/drivers/md/raid6sse1.c +++ b/drivers/md/raid6sse1.c @@ -23,7 +23,7 @@ #if defined(__i386__) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" /* Defined in raid6mmx.c */ diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c index 2e92e96275b..6ed6c6c0389 100644 --- a/drivers/md/raid6sse2.c +++ b/drivers/md/raid6sse2.c @@ -19,7 +19,7 @@ #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) -#include "raid6.h" +#include #include "raid6x86.h" static const struct raid6_sse_constants { diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile index 78e0396adf2..58ffdf4f516 100644 --- a/drivers/md/raid6test/Makefile +++ b/drivers/md/raid6test/Makefile @@ -5,7 +5,7 @@ CC = gcc OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. -g $(OPTFLAGS) +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) LD = ld PERL = perl AR = ar diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c index 559cc41b258..7a930318b17 100644 --- a/drivers/md/raid6test/test.c +++ b/drivers/md/raid6test/test.c @@ -17,7 +17,7 @@ #include #include #include -#include "raid6.h" +#include #define NDISKS 16 /* Including P and Q */ diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h new file mode 100644 index 00000000000..d92480f8285 --- /dev/null +++ b/include/linux/raid/pq.h @@ -0,0 +1,132 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2003 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +#ifndef LINUX_RAID_RAID6_H +#define LINUX_RAID_RAID6_H + +#ifdef __KERNEL__ + +/* Set to 1 to use kernel-wide empty_zero_page */ +#define RAID6_USE_EMPTY_ZERO_PAGE 0 +#include + +/* We need a pre-zeroed page... if we don't want to use the kernel-provided + one define it here */ +#if RAID6_USE_EMPTY_ZERO_PAGE +# define raid6_empty_zero_page empty_zero_page +#else +extern const char raid6_empty_zero_page[PAGE_SIZE]; +#endif + +#else /* ! __KERNEL__ */ +/* Used for testing in user space */ + +#include +#include +#include +#include +#include +#include + +/* Not standard, but glibc defines it */ +#define BITS_PER_LONG __WORDSIZE + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif +extern const char raid6_empty_zero_page[PAGE_SIZE]; + +#define __init +#define __exit +#define __attribute_const__ __attribute__((const)) +#define noinline __attribute__((noinline)) + +#define preempt_enable() +#define preempt_disable() +#define cpu_has_feature(x) 1 +#define enable_kernel_altivec() +#define disable_kernel_altivec() + +#define EXPORT_SYMBOL(sym) +#define MODULE_LICENSE(licence) +#define subsys_initcall(x) +#define module_exit(x) +#endif /* __KERNEL__ */ + +/* Routine choices */ +struct raid6_calls { + void (*gen_syndrome)(int, size_t, void **); + int (*valid)(void); /* Returns 1 if this routine set is usable */ + const char *name; /* Name of this routine set */ + int prefer; /* Has special performance attribute */ +}; + +/* Selected algorithm */ +extern struct raid6_calls raid6_call; + +/* Algorithm list */ +extern const struct raid6_calls * const raid6_algos[]; +int raid6_select_algo(void); + +/* Return values from chk_syndrome */ +#define RAID6_OK 0 +#define RAID6_P_BAD 1 +#define RAID6_Q_BAD 2 +#define RAID6_PQ_BAD 3 + +/* Galois field tables */ +extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); +extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); +extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); +extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); + +/* Recovery routines */ +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); +void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, + void **ptrs); + +/* Some definitions to allow code to be compiled for testing in userspace */ +#ifndef __KERNEL__ + +# define jiffies raid6_jiffies() +# define printk printf +# define GFP_KERNEL 0 +# define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \ + PROT_READ|PROT_WRITE, \ + MAP_PRIVATE|MAP_ANONYMOUS,\ + 0, 0)) +# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE) + +static inline void cpu_relax(void) +{ + /* Nothing */ +} + +#undef HZ +#define HZ 1000 +static inline uint32_t raid6_jiffies(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec*1000 + tv.tv_usec/1000; +} + +#endif /* ! __KERNEL__ */ + +#endif /* LINUX_RAID_RAID6_H */ -- cgit v1.2.3-70-g09d2