diff options
Diffstat (limited to 'drivers/block')
38 files changed, 6141 insertions, 2504 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 9a13e889837..eb3950113e4 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -6473,7 +6473,7 @@ static int dac960_initial_status_proc_show(struct seq_file *m, void *v) static int dac960_initial_status_proc_open(struct inode *inode, struct file *file) { - return single_open(file, dac960_initial_status_proc_show, PDE(inode)->data); + return single_open(file, dac960_initial_status_proc_show, PDE_DATA(inode)); } static const struct file_operations dac960_initial_status_proc_fops = { @@ -6519,7 +6519,7 @@ static int dac960_current_status_proc_show(struct seq_file *m, void *v) static int dac960_current_status_proc_open(struct inode *inode, struct file *file) { - return single_open(file, dac960_current_status_proc_show, PDE(inode)->data); + return single_open(file, dac960_current_status_proc_show, PDE_DATA(inode)); } static const struct file_operations dac960_current_status_proc_fops = { @@ -6540,14 +6540,14 @@ static int dac960_user_command_proc_show(struct seq_file *m, void *v) static int dac960_user_command_proc_open(struct inode *inode, struct file *file) { - return single_open(file, dac960_user_command_proc_show, PDE(inode)->data); + return single_open(file, dac960_user_command_proc_show, PDE_DATA(inode)); } static ssize_t dac960_user_command_proc_write(struct file *file, const char __user *Buffer, size_t Count, loff_t *pos) { - DAC960_Controller_T *Controller = (DAC960_Controller_T *) PDE(file->f_path.dentry->d_inode)->data; + DAC960_Controller_T *Controller = PDE_DATA(file_inode(file)); unsigned char CommandBuffer[80]; int Length; if (Count > sizeof(CommandBuffer)-1) return -EINVAL; @@ -7054,6 +7054,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request, else ErrorCode = 0; } + break; default: ErrorCode = -ENOTTY; } diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 824e09c4d0d..b81ddfea1da 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -63,19 +63,6 @@ config AMIGA_Z2RAM To compile this driver as a module, choose M here: the module will be called z2ram. -config BLK_DEV_XD - tristate "XT hard disk support" - depends on ISA && ISA_DMA_API - select CHECK_SIGNATURE - help - Very old 8 bit hard disk controllers used in the IBM XT computer - will be supported if you say Y here. - - To compile this driver as a module, choose M here: the - module will be called xd. - - It's pretty unlikely that you have one of these: say N. - config GDROM tristate "SEGA Dreamcast GD-ROM drive" depends on SH_DREAMCAST @@ -544,4 +531,14 @@ config BLK_DEV_RBD If unsure, say N. +config BLK_DEV_RSXX + tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver" + depends on PCI + help + Device driver for IBM's high speed PCIe SSD + storage devices: FlashSystem-70 and FlashSystem-80. + + To compile this driver as a module, choose M here: the + module will be called rsxx. + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 17e82df3df7..a3b40232c6a 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o -obj-$(CONFIG_BLK_DEV_XD) += xd.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o @@ -41,4 +40,6 @@ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ +obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ + swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index 42e67ad6bd2..ab41be625a5 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -139,13 +139,12 @@ bail: spin_unlock_irqrestore(&emsgs_lock, flags); return; } - mp = kmalloc(n, GFP_ATOMIC); + mp = kmemdup(msg, n, GFP_ATOMIC); if (mp == NULL) { printk(KERN_ERR "aoe: allocation failure, len=%ld\n", n); goto bail; } - memcpy(mp, msg, n); em->msg = mp; em->flags |= EMFL_VALID; em->len = n; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 25ef5c014fc..92b6d7c51e3 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -51,8 +51,9 @@ new_skb(ulong len) { struct sk_buff *skb; - skb = alloc_skb(len, GFP_ATOMIC); + skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC); if (skb) { + skb_reserve(skb, MAX_HEADER); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = __constant_htons(ETH_P_AOE); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ade58bc8f3c..e18c99140c0 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -493,7 +493,7 @@ static int cciss_seq_open(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; if (!ret) - seq->private = PDE(inode)->data; + seq->private = PDE_DATA(inode); return ret; } @@ -4206,7 +4206,7 @@ static int cciss_find_cfgtables(ctlr_info_t *h) if (rc) return rc; h->cfgtable = remap_pci_mem(pci_resource_start(h->pdev, - cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable)); + cfg_base_addr_index) + cfg_offset, sizeof(*h->cfgtable)); if (!h->cfgtable) return -ENOMEM; rc = write_driver_ver_to_cfgtable(h->cfgtable); diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index da3311129a0..ecd845cd28d 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -54,13 +54,11 @@ static CommandList_struct *cmd_special_alloc(ctlr_info_t *h); static void cmd_free(ctlr_info_t *h, CommandList_struct *c); static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c); -static int cciss_scsi_proc_info( - struct Scsi_Host *sh, +static int cciss_scsi_write_info(struct Scsi_Host *sh, char *buffer, /* data buffer */ - char **start, /* where data in buffer starts */ - off_t offset, /* offset from start of imaginary file */ - int length, /* length of data in buffer */ - int func); /* 0 == read, 1 == write */ + int length); /* length of data in buffer */ +static int cciss_scsi_show_info(struct seq_file *m, + struct Scsi_Host *sh); static int cciss_scsi_queue_command (struct Scsi_Host *h, struct scsi_cmnd *cmd); @@ -82,7 +80,8 @@ static struct scsi_host_template cciss_driver_template = { .module = THIS_MODULE, .name = "cciss", .proc_name = "cciss", - .proc_info = cciss_scsi_proc_info, + .write_info = cciss_scsi_write_info, + .show_info = cciss_scsi_show_info, .queuecommand = cciss_scsi_queue_command, .this_id = 7, .cmd_per_lun = 1, @@ -1302,59 +1301,54 @@ cciss_scsi_user_command(ctlr_info_t *h, int hostno, char *buffer, int length) return length; } - static int -cciss_scsi_proc_info(struct Scsi_Host *sh, +cciss_scsi_write_info(struct Scsi_Host *sh, char *buffer, /* data buffer */ - char **start, /* where data in buffer starts */ - off_t offset, /* offset from start of imaginary file */ - int length, /* length of data in buffer */ - int func) /* 0 == read, 1 == write */ + int length) /* length of data in buffer */ { + ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0]; + if (h == NULL) /* This really shouldn't ever happen. */ + return -EINVAL; - int buflen, datalen; - ctlr_info_t *h; + return cciss_scsi_user_command(h, sh->host_no, + buffer, length); +} + +static int +cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh) +{ + + ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0]; int i; - h = (ctlr_info_t *) sh->hostdata[0]; if (h == NULL) /* This really shouldn't ever happen. */ return -EINVAL; - if (func == 0) { /* User is reading from /proc/scsi/ciss*?/?* */ - buflen = sprintf(buffer, "cciss%d: SCSI host: %d\n", - h->ctlr, sh->host_no); - - /* this information is needed by apps to know which cciss - device corresponds to which scsi host number without - having to open a scsi target device node. The device - information is not a duplicate of /proc/scsi/scsi because - the two may be out of sync due to scsi hotplug, rather - this info is for an app to be able to use to know how to - get them back in sync. */ - - for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) { - struct cciss_scsi_dev_t *sd = - &ccissscsi[h->ctlr].dev[i]; - buflen += sprintf(&buffer[buflen], "c%db%dt%dl%d %02d " - "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", - sh->host_no, sd->bus, sd->target, sd->lun, - sd->devtype, - sd->scsi3addr[0], sd->scsi3addr[1], - sd->scsi3addr[2], sd->scsi3addr[3], - sd->scsi3addr[4], sd->scsi3addr[5], - sd->scsi3addr[6], sd->scsi3addr[7]); - } - datalen = buflen - offset; - if (datalen < 0) { /* they're reading past EOF. */ - datalen = 0; - *start = buffer+buflen; - } else - *start = buffer + offset; - return(datalen); - } else /* User is writing to /proc/scsi/cciss*?/?* ... */ - return cciss_scsi_user_command(h, sh->host_no, - buffer, length); -} + seq_printf(m, "cciss%d: SCSI host: %d\n", + h->ctlr, sh->host_no); + + /* this information is needed by apps to know which cciss + device corresponds to which scsi host number without + having to open a scsi target device node. The device + information is not a duplicate of /proc/scsi/scsi because + the two may be out of sync due to scsi hotplug, rather + this info is for an app to be able to use to know how to + get them back in sync. */ + + for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) { + struct cciss_scsi_dev_t *sd = + &ccissscsi[h->ctlr].dev[i]; + seq_printf(m, "c%db%dt%dl%d %02d " + "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + sh->host_no, sd->bus, sd->target, sd->lun, + sd->devtype, + sd->scsi3addr[0], sd->scsi3addr[1], + sd->scsi3addr[2], sd->scsi3addr[3], + sd->scsi3addr[4], sd->scsi3addr[5], + sd->scsi3addr[6], sd->scsi3addr[7]); + } + return 0; +} /* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci dma mapping and fills in the scatter gather entries of the diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 3f087133a25..3b9e8ebcb96 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -296,7 +296,7 @@ static int ida_proc_show(struct seq_file *m, void *v) static int ida_proc_open(struct inode *inode, struct file *file) { - return single_open(file, ida_proc_show, PDE(inode)->data); + return single_open(file, ida_proc_show, PDE_DATA(inode)); } static const struct file_operations ida_proc_fops = { diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8c13eeb83c5..e98da675f0c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2660,25 +2660,24 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, mdev->read_requests = RB_ROOT; mdev->write_requests = RB_ROOT; - if (!idr_pre_get(&minors, GFP_KERNEL)) - goto out_no_minor_idr; - if (idr_get_new_above(&minors, mdev, minor, &minor_got)) + minor_got = idr_alloc(&minors, mdev, minor, minor + 1, GFP_KERNEL); + if (minor_got < 0) { + if (minor_got == -ENOSPC) { + err = ERR_MINOR_EXISTS; + drbd_msg_put_info("requested minor exists already"); + } goto out_no_minor_idr; - if (minor_got != minor) { - err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); - goto out_idr_remove_minor; } - if (!idr_pre_get(&tconn->volumes, GFP_KERNEL)) - goto out_idr_remove_minor; - if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got)) + vnr_got = idr_alloc(&tconn->volumes, mdev, vnr, vnr + 1, GFP_KERNEL); + if (vnr_got < 0) { + if (vnr_got == -ENOSPC) { + err = ERR_INVALID_REQUEST; + drbd_msg_put_info("requested volume exists already"); + } goto out_idr_remove_minor; - if (vnr_got != vnr) { - err = ERR_INVALID_REQUEST; - drbd_msg_put_info("requested volume exists already"); - goto out_idr_remove_vol; } + add_disk(disk); kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ @@ -2689,8 +2688,6 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, return NO_ERROR; -out_idr_remove_vol: - idr_remove(&tconn->volumes, vnr_got); out_idr_remove_minor: idr_remove(&minors, minor_got); synchronize_rcu(); diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 56672a61eb9..928adb815b0 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -314,7 +314,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) static int drbd_proc_open(struct inode *inode, struct file *file) { if (try_module_get(THIS_MODULE)) - return single_open(file, drbd_seq_show, PDE(inode)->data); + return single_open(file, drbd_seq_show, PDE_DATA(inode)); return -ENODEV; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a9eccfc6079..83c5ae0ed56 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -757,7 +757,8 @@ static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct acc rcu_read_unlock(); timeo = connect_int * HZ; - timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ + /* 28.5% random jitter */ + timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); if (err <= 0) @@ -953,7 +954,7 @@ retry: conn_warn(tconn, "Error receiving initial packet\n"); sock_release(s); randomize: - if (random32() & 1) + if (prandom_u32() & 1) goto retry; } } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ae125127062..b2955b3f2cb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -162,12 +162,13 @@ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) { - loff_t size, loopsize; + loff_t loopsize; /* Compute loopsize in bytes */ - size = i_size_read(file->f_mapping->host); - loopsize = size - offset; - /* offset is beyond i_size, wierd but possible */ + loopsize = i_size_read(file->f_mapping->host); + if (offset > 0) + loopsize -= offset; + /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; @@ -190,6 +191,7 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; + struct block_device *bdev = lo->lo_device; if (unlikely((loff_t)x != size)) return -EFBIG; @@ -198,6 +200,9 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; set_capacity(lo->lo_disk, x); + bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); + /* let user-space know about the new size */ + kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); return 0; } @@ -225,9 +230,11 @@ static int __do_lo_send_write(struct file *file, ssize_t bw; mm_segment_t old_fs = get_fs(); + file_start_write(file); set_fs(get_ds()); bw = file->f_op->write(file, buf, len, &pos); set_fs(old_fs); + file_end_write(file); if (likely(bw == len)) return 0; printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", @@ -917,6 +924,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); + + /* Grab the block_device to prevent its destruction after we + * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). + */ + bdgrab(bdev); return 0; out_clr: @@ -1026,8 +1038,10 @@ static int loop_clr_fd(struct loop_device *lo) memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); - if (bdev) + if (bdev) { + bdput(bdev); invalidate_bdev(bdev); + } set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { @@ -1091,10 +1105,10 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) return err; if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit) { + lo->lo_sizelimit != info->lo_sizelimit) if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) return -EFBIG; - } + loop_config_discard(lo); memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); @@ -1139,7 +1153,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info) if (lo->lo_state != Lo_bound) return -ENXIO; - error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat); + error = vfs_getattr(&file->f_path, &stat); if (error) return error; memset(info, 0, sizeof(*info)); @@ -1271,28 +1285,10 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) { - int err; - sector_t sec; - loff_t sz; - - err = -ENXIO; if (unlikely(lo->lo_state != Lo_bound)) - goto out; - err = figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); - if (unlikely(err)) - goto out; - sec = get_capacity(lo->lo_disk); - /* the width of sector_t may be narrow for bit-shift */ - sz = sec; - sz <<= 9; - mutex_lock(&bdev->bd_mutex); - bd_set_size(bdev, sz); - /* let user-space know about the new size */ - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mutex_unlock(&bdev->bd_mutex); + return -ENXIO; - out: - return err; + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); } static int lo_ioctl(struct block_device *bdev, fmode_t mode, @@ -1624,31 +1620,19 @@ static int loop_add(struct loop_device **l, int i) if (!lo) goto out; - if (!idr_pre_get(&loop_index_idr, GFP_KERNEL)) - goto out_free_dev; - + /* allocate id, if @id >= 0, we're requesting that specific id */ if (i >= 0) { - int m; - - /* create specific i in the index */ - err = idr_get_new_above(&loop_index_idr, lo, i, &m); - if (err >= 0 && i != m) { - idr_remove(&loop_index_idr, m); + err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); + if (err == -ENOSPC) err = -EEXIST; - } - } else if (i == -1) { - int m; - - /* get next free nr */ - err = idr_get_new(&loop_index_idr, lo, &m); - if (err >= 0) - i = m; } else { - err = -EINVAL; + err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); } if (err < 0) goto out_free_dev; + i = err; + err = -ENOMEM; lo->lo_queue = blk_alloc_queue(GFP_KERNEL); if (!lo->lo_queue) goto out_free_dev; @@ -1858,11 +1842,15 @@ static int __init loop_init(void) max_part = (1UL << part_shift) - 1; } - if ((1UL << part_shift) > DISK_MAX_PARTS) - return -EINVAL; + if ((1UL << part_shift) > DISK_MAX_PARTS) { + err = -EINVAL; + goto misc_out; + } - if (max_loop > 1UL << (MINORBITS - part_shift)) - return -EINVAL; + if (max_loop > 1UL << (MINORBITS - part_shift)) { + err = -EINVAL; + goto misc_out; + } /* * If max_loop is specified, create that many devices upfront. @@ -1880,8 +1868,10 @@ static int __init loop_init(void) range = 1UL << MINORBITS; } - if (register_blkdev(LOOP_MAJOR, "loop")) - return -EIO; + if (register_blkdev(LOOP_MAJOR, "loop")) { + err = -EIO; + goto misc_out; + } blk_register_region(MKDEV(LOOP_MAJOR, 0), range, THIS_MODULE, loop_probe, NULL, NULL); @@ -1894,6 +1884,10 @@ static int __init loop_init(void) printk(KERN_INFO "loop: module loaded\n"); return 0; + +misc_out: + misc_deregister(&loop_misc); + return err; } static int loop_exit_cb(int id, void *ptr, void *data) @@ -1911,7 +1905,6 @@ static void __exit loop_exit(void) range = max_loop ? max_loop << part_shift : 1UL << MINORBITS; idr_for_each(&loop_index_idr, &loop_exit_cb, NULL); - idr_remove_all(&loop_index_idr); idr_destroy(&loop_index_idr); blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 1788f491e0f..076ae7f1b78 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -890,8 +890,10 @@ static int mg_probe(struct platform_device *plat_dev) gpio_direction_output(host->rst, 1); /* reset out pin */ - if (!(prv_data->dev_attr & MG_DEV_MASK)) + if (!(prv_data->dev_attr & MG_DEV_MASK)) { + err = -EINVAL; goto probe_err_3a; + } if (prv_data->dev_attr != MG_BOOT_DEV) { rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig index 0ba837fc62a..1fca1f996b4 100644 --- a/drivers/block/mtip32xx/Kconfig +++ b/drivers/block/mtip32xx/Kconfig @@ -4,6 +4,6 @@ config BLK_DEV_PCIESSD_MTIP32XX tristate "Block Device Driver for Micron PCIe SSDs" - depends on PCI + depends on PCI && GENERIC_HARDIRQS help This enables the block driver for Micron PCIe SSDs. diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 3fd10099045..32c678028e5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -81,12 +81,19 @@ /* Device instance number, incremented each time a device is probed. */ static int instance; +struct list_head online_list; +struct list_head removing_list; +spinlock_t dev_lock; + /* * Global variable used to hold the major block device number * allocated in mtip_init(). */ static int mtip_major; static struct dentry *dfs_parent; +static struct dentry *dfs_device_status; + +static u32 cpu_use[NR_CPUS]; static DEFINE_SPINLOCK(rssd_index_lock); static DEFINE_IDA(rssd_index_ida); @@ -241,40 +248,31 @@ static inline void release_slot(struct mtip_port *port, int tag) /* * Reset the HBA (without sleeping) * - * Just like hba_reset, except does not call sleep, so can be - * run from interrupt/tasklet context. - * * @dd Pointer to the driver data structure. * * return value * 0 The reset was successful. * -1 The HBA Reset bit did not clear. */ -static int hba_reset_nosleep(struct driver_data *dd) +static int mtip_hba_reset(struct driver_data *dd) { unsigned long timeout; - /* Chip quirk: quiesce any chip function */ - mdelay(10); - /* Set the reset bit */ writel(HOST_RESET, dd->mmio + HOST_CTL); /* Flush */ readl(dd->mmio + HOST_CTL); - /* - * Wait 10ms then spin for up to 1 second - * waiting for reset acknowledgement - */ - timeout = jiffies + msecs_to_jiffies(1000); - mdelay(10); - while ((readl(dd->mmio + HOST_CTL) & HOST_RESET) - && time_before(jiffies, timeout)) - mdelay(1); + /* Spin for up to 2 seconds, waiting for reset acknowledgement */ + timeout = jiffies + msecs_to_jiffies(2000); + do { + mdelay(10); + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) + return -1; - if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) - return -1; + } while ((readl(dd->mmio + HOST_CTL) & HOST_RESET) + && time_before(jiffies, timeout)); if (readl(dd->mmio + HOST_CTL) & HOST_RESET) return -1; @@ -296,16 +294,17 @@ static int hba_reset_nosleep(struct driver_data *dd) */ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) { - atomic_set(&port->commands[tag].active, 1); + int group = tag >> 5; - spin_lock(&port->cmd_issue_lock); + atomic_set(&port->commands[tag].active, 1); + /* guard SACT and CI registers */ + spin_lock(&port->cmd_issue_lock[group]); writel((1 << MTIP_TAG_BIT(tag)), port->s_active[MTIP_TAG_INDEX(tag)]); writel((1 << MTIP_TAG_BIT(tag)), port->cmd_issue[MTIP_TAG_INDEX(tag)]); - - spin_unlock(&port->cmd_issue_lock); + spin_unlock(&port->cmd_issue_lock[group]); /* Set the command's timeout value.*/ port->commands[tag].comp_time = jiffies + msecs_to_jiffies( @@ -478,7 +477,7 @@ static void mtip_restart_port(struct mtip_port *port) dev_warn(&port->dd->pdev->dev, "PxCMD.CR not clear, escalating reset\n"); - if (hba_reset_nosleep(port->dd)) + if (mtip_hba_reset(port->dd)) dev_err(&port->dd->pdev->dev, "HBA reset escalation failed.\n"); @@ -524,6 +523,26 @@ static void mtip_restart_port(struct mtip_port *port) } +static int mtip_device_reset(struct driver_data *dd) +{ + int rv = 0; + + if (mtip_check_surprise_removal(dd->pdev)) + return 0; + + if (mtip_hba_reset(dd) < 0) + rv = -EFAULT; + + mdelay(1); + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Enable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + return rv; +} + /* * Helper function for tag logging */ @@ -629,7 +648,7 @@ static void mtip_timeout_function(unsigned long int data) if (cmdto_cnt) { print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { - mtip_restart_port(port); + mtip_device_reset(port->dd); wake_up_interruptible(&port->svc_wait); } clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); @@ -964,56 +983,56 @@ handle_tfe_exit: /* * Handle a set device bits interrupt */ -static inline void mtip_process_sdbf(struct driver_data *dd) +static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, + u32 completed) { - struct mtip_port *port = dd->port; - int group, tag, bit; - u32 completed; + struct driver_data *dd = port->dd; + int tag, bit; struct mtip_cmd *command; - /* walk all bits in all slot groups */ - for (group = 0; group < dd->slot_groups; group++) { - completed = readl(port->completed[group]); - if (!completed) - continue; + if (!completed) { + WARN_ON_ONCE(!completed); + return; + } + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); - /* clear completed status register in the hardware.*/ - writel(completed, port->completed[group]); + /* Process completed commands. */ + for (bit = 0; (bit < 32) && completed; bit++) { + if (completed & 0x01) { + tag = (group << 5) | bit; - /* Process completed commands. */ - for (bit = 0; - (bit < 32) && completed; - bit++, completed >>= 1) { - if (completed & 0x01) { - tag = (group << 5) | bit; + /* skip internal command slot. */ + if (unlikely(tag == MTIP_TAG_INTERNAL)) + continue; - /* skip internal command slot. */ - if (unlikely(tag == MTIP_TAG_INTERNAL)) - continue; + command = &port->commands[tag]; + /* make internal callback */ + if (likely(command->comp_func)) { + command->comp_func( + port, + tag, + command->comp_data, + 0); + } else { + dev_warn(&dd->pdev->dev, + "Null completion " + "for tag %d", + tag); - command = &port->commands[tag]; - /* make internal callback */ - if (likely(command->comp_func)) { - command->comp_func( - port, - tag, - command->comp_data, - 0); - } else { - dev_warn(&dd->pdev->dev, - "Null completion " - "for tag %d", - tag); - - if (mtip_check_surprise_removal( - dd->pdev)) { - mtip_command_cleanup(dd); - return; - } + if (mtip_check_surprise_removal( + dd->pdev)) { + mtip_command_cleanup(dd); + return; } } } + completed >>= 1; } + + /* If last, re-enable interrupts */ + if (atomic_dec_return(&dd->irq_workers_active) == 0) + writel(0xffffffff, dd->mmio + HOST_IRQ_STAT); } /* @@ -1072,6 +1091,8 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) struct mtip_port *port = dd->port; u32 hba_stat, port_stat; int rv = IRQ_NONE; + int do_irq_enable = 1, i, workers; + struct mtip_work *twork; hba_stat = readl(dd->mmio + HOST_IRQ_STAT); if (hba_stat) { @@ -1082,8 +1103,42 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) writel(port_stat, port->mmio + PORT_IRQ_STAT); /* Demux port status */ - if (likely(port_stat & PORT_IRQ_SDB_FIS)) - mtip_process_sdbf(dd); + if (likely(port_stat & PORT_IRQ_SDB_FIS)) { + do_irq_enable = 0; + WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0); + + /* Start at 1: group zero is always local? */ + for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS; + i++) { + twork = &dd->work[i]; + twork->completed = readl(port->completed[i]); + if (twork->completed) + workers++; + } + + atomic_set(&dd->irq_workers_active, workers); + if (workers) { + for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) { + twork = &dd->work[i]; + if (twork->completed) + queue_work_on( + twork->cpu_binding, + dd->isr_workq, + &twork->work); + } + + if (likely(dd->work[0].completed)) + mtip_workq_sdbfx(port, 0, + dd->work[0].completed); + + } else { + /* + * Chip quirk: SDB interrupt but nothing + * to complete + */ + do_irq_enable = 1; + } + } if (unlikely(port_stat & PORT_IRQ_ERR)) { if (unlikely(mtip_check_surprise_removal(dd->pdev))) { @@ -1103,21 +1158,13 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) } /* acknowledge interrupt */ - writel(hba_stat, dd->mmio + HOST_IRQ_STAT); + if (unlikely(do_irq_enable)) + writel(hba_stat, dd->mmio + HOST_IRQ_STAT); return rv; } /* - * Wrapper for mtip_handle_irq - * (ignores return code) - */ -static void mtip_tasklet(unsigned long data) -{ - mtip_handle_irq((struct driver_data *) data); -} - -/* * HBA interrupt subroutine. * * @irq IRQ number. @@ -1130,8 +1177,8 @@ static void mtip_tasklet(unsigned long data) static irqreturn_t mtip_irq_handler(int irq, void *instance) { struct driver_data *dd = instance; - tasklet_schedule(&dd->tasklet); - return IRQ_HANDLED; + + return mtip_handle_irq(dd); } static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) @@ -1252,11 +1299,11 @@ static int mtip_exec_internal_command(struct mtip_port *port, int rv = 0, ready2go = 1; struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL]; unsigned long to; + struct driver_data *dd = port->dd; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { - dev_err(&port->dd->pdev->dev, - "SG buffer is not 8 byte aligned\n"); + dev_err(&dd->pdev->dev, "SG buffer is not 8 byte aligned\n"); return -EFAULT; } @@ -1269,23 +1316,21 @@ static int mtip_exec_internal_command(struct mtip_port *port, mdelay(100); } while (time_before(jiffies, to)); if (!ready2go) { - dev_warn(&port->dd->pdev->dev, + dev_warn(&dd->pdev->dev, "Internal cmd active. new cmd [%02X]\n", fis->command); return -EBUSY; } set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); port->ic_pause_timer = 0; - if (fis->command == ATA_CMD_SEC_ERASE_UNIT) - clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); - else if (fis->command == ATA_CMD_DOWNLOAD_MICRO) - clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); if (atomic == GFP_KERNEL) { if (fis->command != ATA_CMD_STANDBYNOW1) { /* wait for io to complete if non atomic */ if (mtip_quiesce_io(port, 5000) < 0) { - dev_warn(&port->dd->pdev->dev, + dev_warn(&dd->pdev->dev, "Failed to quiesce IO\n"); release_slot(port, MTIP_TAG_INTERNAL); clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); @@ -1330,58 +1375,84 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* Issue the command to the hardware */ mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL); - /* Poll if atomic, wait_for_completion otherwise */ if (atomic == GFP_KERNEL) { /* Wait for the command to complete or timeout. */ - if (wait_for_completion_timeout( + if (wait_for_completion_interruptible_timeout( &wait, - msecs_to_jiffies(timeout)) == 0) { - dev_err(&port->dd->pdev->dev, - "Internal command did not complete [%d] " - "within timeout of %lu ms\n", - atomic, timeout); - if (mtip_check_surprise_removal(port->dd->pdev) || + msecs_to_jiffies(timeout)) <= 0) { + if (rv == -ERESTARTSYS) { /* interrupted */ + dev_err(&dd->pdev->dev, + "Internal command [%02X] was interrupted after %lu ms\n", + fis->command, timeout); + rv = -EINTR; + goto exec_ic_exit; + } else if (rv == 0) /* timeout */ + dev_err(&dd->pdev->dev, + "Internal command did not complete [%02X] within timeout of %lu ms\n", + fis->command, timeout); + else + dev_err(&dd->pdev->dev, + "Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n", + fis->command, rv, timeout); + + if (mtip_check_surprise_removal(dd->pdev) || test_bit(MTIP_DDF_REMOVE_PENDING_BIT, - &port->dd->dd_flag)) { + &dd->dd_flag)) { + dev_err(&dd->pdev->dev, + "Internal command [%02X] wait returned due to SR\n", + fis->command); rv = -ENXIO; goto exec_ic_exit; } + mtip_device_reset(dd); /* recover from timeout issue */ rv = -EAGAIN; + goto exec_ic_exit; } } else { + u32 hba_stat, port_stat; + /* Spin for <timeout> checking if command still outstanding */ timeout = jiffies + msecs_to_jiffies(timeout); while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL)) && time_before(jiffies, timeout)) { - if (mtip_check_surprise_removal(port->dd->pdev)) { + if (mtip_check_surprise_removal(dd->pdev)) { rv = -ENXIO; goto exec_ic_exit; } if ((fis->command != ATA_CMD_STANDBYNOW1) && test_bit(MTIP_DDF_REMOVE_PENDING_BIT, - &port->dd->dd_flag)) { + &dd->dd_flag)) { rv = -ENXIO; goto exec_ic_exit; } - if (readl(port->mmio + PORT_IRQ_STAT) & PORT_IRQ_ERR) { - atomic_inc(&int_cmd->active); /* error */ - break; + port_stat = readl(port->mmio + PORT_IRQ_STAT); + if (!port_stat) + continue; + + if (port_stat & PORT_IRQ_ERR) { + dev_err(&dd->pdev->dev, + "Internal command [%02X] failed\n", + fis->command); + mtip_device_reset(dd); + rv = -EIO; + goto exec_ic_exit; + } else { + writel(port_stat, port->mmio + PORT_IRQ_STAT); + hba_stat = readl(dd->mmio + HOST_IRQ_STAT); + if (hba_stat) + writel(hba_stat, + dd->mmio + HOST_IRQ_STAT); } + break; } } - if (atomic_read(&int_cmd->active) > 1) { - dev_err(&port->dd->pdev->dev, - "Internal command [%02X] failed\n", fis->command); - rv = -EIO; - } if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL)) { rv = -ENXIO; - if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, - &port->dd->dd_flag)) { - mtip_restart_port(port); + if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { + mtip_device_reset(dd); rv = -EAGAIN; } } @@ -1489,6 +1560,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) } #endif + /* Demux ID.DRAT & ID.RZAT to determine trim support */ + if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) + port->dd->trim_supp = true; + else + port->dd->trim_supp = false; + /* Set the identify buffer as valid. */ port->identify_valid = 1; @@ -1676,6 +1753,82 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, } /* + * Trim unused sectors + * + * @dd pointer to driver_data structure + * @lba starting lba + * @len # of 512b sectors to trim + * + * return value + * -ENOMEM Out of dma memory + * -EINVAL Invalid parameters passed in, trim not supported + * -EIO Error submitting trim request to hw + */ +static int mtip_send_trim(struct driver_data *dd, unsigned int lba, + unsigned int len) +{ + int i, rv = 0; + u64 tlba, tlen, sect_left; + struct mtip_trim_entry *buf; + dma_addr_t dma_addr; + struct host_to_dev_fis fis; + + if (!len || dd->trim_supp == false) + return -EINVAL; + + /* Trim request too big */ + WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES)); + + /* Trim request not aligned on 4k boundary */ + WARN_ON(len % 8 != 0); + + /* Warn if vu_trim structure is too big */ + WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE); + + /* Allocate a DMA buffer for the trim structure */ + buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, + GFP_KERNEL); + if (!buf) + return -ENOMEM; + memset(buf, 0, ATA_SECT_SIZE); + + for (i = 0, sect_left = len, tlba = lba; + i < MTIP_MAX_TRIM_ENTRIES && sect_left; + i++) { + tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ? + MTIP_MAX_TRIM_ENTRY_LEN : + sect_left); + buf[i].lba = __force_bit2int cpu_to_le32(tlba); + buf[i].range = __force_bit2int cpu_to_le16(tlen); + tlba += tlen; + sect_left -= tlen; + } + WARN_ON(sect_left != 0); + + /* Build the fis */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = 0xfb; + fis.features = 0x60; + fis.sect_count = 1; + fis.device = ATA_DEVICE_OBS; + + if (mtip_exec_internal_command(dd->port, + &fis, + 5, + dma_addr, + ATA_SECT_SIZE, + 0, + GFP_KERNEL, + MTIP_TRIM_TIMEOUT_MS) < 0) + rv = -EIO; + + dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); + return rv; +} + +/* * Get the drive capacity. * * @dd Pointer to the device data structure. @@ -1699,45 +1852,6 @@ static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors) } /* - * Reset the HBA. - * - * Resets the HBA by setting the HBA Reset bit in the Global - * HBA Control register. After setting the HBA Reset bit the - * function waits for 1 second before reading the HBA Reset - * bit to make sure it has cleared. If HBA Reset is not clear - * an error is returned. Cannot be used in non-blockable - * context. - * - * @dd Pointer to the driver data structure. - * - * return value - * 0 The reset was successful. - * -1 The HBA Reset bit did not clear. - */ -static int mtip_hba_reset(struct driver_data *dd) -{ - mtip_deinit_port(dd->port); - - /* Set the reset bit */ - writel(HOST_RESET, dd->mmio + HOST_CTL); - - /* Flush */ - readl(dd->mmio + HOST_CTL); - - /* Wait for reset to clear */ - ssleep(1); - - /* Check the bit has cleared */ - if (readl(dd->mmio + HOST_CTL) & HOST_RESET) { - dev_err(&dd->pdev->dev, - "Reset bit did not clear.\n"); - return -1; - } - - return 0; -} - -/* * Display the identify command data. * * @port Pointer to the port data structure. @@ -2598,6 +2712,100 @@ static ssize_t mtip_hw_show_status(struct device *dev, static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); +/* debugsfs entries */ + +static ssize_t show_device_status(struct device_driver *drv, char *buf) +{ + int size = 0; + struct driver_data *dd, *tmp; + unsigned long flags; + char id_buf[42]; + u16 status = 0; + + spin_lock_irqsave(&dev_lock, flags); + size += sprintf(&buf[size], "Devices Present:\n"); + list_for_each_entry_safe(dd, tmp, &online_list, online_list) { + if (dd->pdev) { + if (dd->port && + dd->port->identify && + dd->port->identify_valid) { + strlcpy(id_buf, + (char *) (dd->port->identify + 10), 21); + status = *(dd->port->identify + 141); + } else { + memset(id_buf, 0, 42); + status = 0; + } + + if (dd->port && + test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { + size += sprintf(&buf[size], + " device %s %s (ftl rebuild %d %%)\n", + dev_name(&dd->pdev->dev), + id_buf, + status); + } else { + size += sprintf(&buf[size], + " device %s %s\n", + dev_name(&dd->pdev->dev), + id_buf); + } + } + } + + size += sprintf(&buf[size], "Devices Being Removed:\n"); + list_for_each_entry_safe(dd, tmp, &removing_list, remove_list) { + if (dd->pdev) { + if (dd->port && + dd->port->identify && + dd->port->identify_valid) { + strlcpy(id_buf, + (char *) (dd->port->identify+10), 21); + status = *(dd->port->identify + 141); + } else { + memset(id_buf, 0, 42); + status = 0; + } + + if (dd->port && + test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { + size += sprintf(&buf[size], + " device %s %s (ftl rebuild %d %%)\n", + dev_name(&dd->pdev->dev), + id_buf, + status); + } else { + size += sprintf(&buf[size], + " device %s %s\n", + dev_name(&dd->pdev->dev), + id_buf); + } + } + } + spin_unlock_irqrestore(&dev_lock, flags); + + return size; +} + +static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + int size = *offset; + char buf[MTIP_DFS_MAX_BUF_SIZE]; + + if (!len || *offset) + return 0; + + size += show_device_status(NULL, buf); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + return -EFAULT; + + return *offset; +} + static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, size_t len, loff_t *offset) { @@ -2692,6 +2900,13 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, return *offset; } +static const struct file_operations mtip_device_status_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_device_status, + .llseek = no_llseek, +}; + static const struct file_operations mtip_regs_fops = { .owner = THIS_MODULE, .open = simple_open, @@ -3005,20 +3220,24 @@ static int mtip_hw_init(struct driver_data *dd) hba_setup(dd); - tasklet_init(&dd->tasklet, mtip_tasklet, (unsigned long)dd); - - dd->port = kzalloc(sizeof(struct mtip_port), GFP_KERNEL); + dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL, + dd->numa_node); if (!dd->port) { dev_err(&dd->pdev->dev, "Memory allocation: port structure\n"); return -ENOMEM; } + /* Continue workqueue setup */ + for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) + dd->work[i].port = dd->port; + /* Counting semaphore to track command slot usage */ sema_init(&dd->port->cmd_slot, num_command_slots - 1); /* Spinlock to prevent concurrent issue */ - spin_lock_init(&dd->port->cmd_issue_lock); + for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) + spin_lock_init(&dd->port->cmd_issue_lock[i]); /* Set the port mmio base address. */ dd->port->mmio = dd->mmio + PORT_OFFSET; @@ -3165,6 +3384,7 @@ static int mtip_hw_init(struct driver_data *dd) "Unable to allocate IRQ %d\n", dd->pdev->irq); goto out2; } + irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding)); /* Enable interrupts on the HBA. */ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, @@ -3241,7 +3461,8 @@ out3: writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, dd->mmio + HOST_CTL); - /*Release the IRQ. */ + /* Release the IRQ. */ + irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); out2: @@ -3291,11 +3512,9 @@ static int mtip_hw_exit(struct driver_data *dd) del_timer_sync(&dd->port->cmd_timer); /* Release the IRQ. */ + irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); - /* Stop the bottom half tasklet. */ - tasklet_kill(&dd->tasklet); - /* Free the command/command header memory. */ dmam_free_coherent(&dd->pdev->dev, HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), @@ -3641,6 +3860,12 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) } } + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + bio_endio(bio, mtip_send_trim(dd, bio->bi_sector, + bio_sectors(bio))); + return; + } + if (unlikely(!bio_has_data(bio))) { blk_queue_flush(queue, 0); bio_endio(bio, 0); @@ -3711,7 +3936,7 @@ static int mtip_block_initialize(struct driver_data *dd) goto protocol_init_error; } - dd->disk = alloc_disk(MTIP_MAX_MINORS); + dd->disk = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node); if (dd->disk == NULL) { dev_err(&dd->pdev->dev, "Unable to allocate gendisk structure\n"); @@ -3755,7 +3980,7 @@ static int mtip_block_initialize(struct driver_data *dd) skip_create_disk: /* Allocate the request queue. */ - dd->queue = blk_alloc_queue(GFP_KERNEL); + dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node); if (dd->queue == NULL) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); @@ -3783,6 +4008,15 @@ skip_create_disk: */ blk_queue_flush(dd->queue, 0); + /* Signal trim support */ + if (dd->trim_supp == true) { + set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags); + dd->queue->limits.discard_granularity = 4096; + blk_queue_max_discard_sectors(dd->queue, + MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES); + dd->queue->limits.discard_zeroes_data = 0; + } + /* Set the capacity of the device in 512 byte sectors. */ if (!(mtip_hw_get_capacity(dd, &capacity))) { dev_warn(&dd->pdev->dev, @@ -3813,9 +4047,8 @@ skip_create_disk: start_service_thread: sprintf(thd_name, "mtip_svc_thd_%02d", index); - - dd->mtip_svc_handler = kthread_run(mtip_service_thread, - dd, thd_name); + dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread, + dd, dd->numa_node, thd_name); if (IS_ERR(dd->mtip_svc_handler)) { dev_err(&dd->pdev->dev, "service thread failed to start\n"); @@ -3823,7 +4056,7 @@ start_service_thread: rv = -EFAULT; goto kthread_run_error; } - + wake_up_process(dd->mtip_svc_handler); if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) rv = wait_for_rebuild; @@ -3963,6 +4196,56 @@ static int mtip_block_resume(struct driver_data *dd) return 0; } +static void drop_cpu(int cpu) +{ + cpu_use[cpu]--; +} + +static int get_least_used_cpu_on_node(int node) +{ + int cpu, least_used_cpu, least_cnt; + const struct cpumask *node_mask; + + node_mask = cpumask_of_node(node); + least_used_cpu = cpumask_first(node_mask); + least_cnt = cpu_use[least_used_cpu]; + cpu = least_used_cpu; + + for_each_cpu(cpu, node_mask) { + if (cpu_use[cpu] < least_cnt) { + least_used_cpu = cpu; + least_cnt = cpu_use[cpu]; + } + } + cpu_use[least_used_cpu]++; + return least_used_cpu; +} + +/* Helper for selecting a node in round robin mode */ +static inline int mtip_get_next_rr_node(void) +{ + static int next_node = -1; + + if (next_node == -1) { + next_node = first_online_node; + return next_node; + } + + next_node = next_online_node(next_node); + if (next_node == MAX_NUMNODES) + next_node = first_online_node; + return next_node; +} + +static DEFINE_HANDLER(0); +static DEFINE_HANDLER(1); +static DEFINE_HANDLER(2); +static DEFINE_HANDLER(3); +static DEFINE_HANDLER(4); +static DEFINE_HANDLER(5); +static DEFINE_HANDLER(6); +static DEFINE_HANDLER(7); + /* * Called for each supported PCI device detected. * @@ -3977,9 +4260,26 @@ static int mtip_pci_probe(struct pci_dev *pdev, { int rv = 0; struct driver_data *dd = NULL; + char cpu_list[256]; + const struct cpumask *node_mask; + int cpu, i = 0, j = 0; + int my_node = NUMA_NO_NODE; + unsigned long flags; /* Allocate memory for this devices private data. */ - dd = kzalloc(sizeof(struct driver_data), GFP_KERNEL); + my_node = pcibus_to_node(pdev->bus); + if (my_node != NUMA_NO_NODE) { + if (!node_online(my_node)) + my_node = mtip_get_next_rr_node(); + } else { + dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n"); + my_node = mtip_get_next_rr_node(); + } + dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n", + my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev), + cpu_to_node(smp_processor_id()), smp_processor_id()); + + dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node); if (dd == NULL) { dev_err(&pdev->dev, "Unable to allocate memory for driver data\n"); @@ -4016,19 +4316,87 @@ static int mtip_pci_probe(struct pci_dev *pdev, } } - pci_set_master(pdev); + /* Copy the info we may need later into the private data structure. */ + dd->major = mtip_major; + dd->instance = instance; + dd->pdev = pdev; + dd->numa_node = my_node; + + INIT_LIST_HEAD(&dd->online_list); + INIT_LIST_HEAD(&dd->remove_list); - if (pci_enable_msi(pdev)) { + memset(dd->workq_name, 0, 32); + snprintf(dd->workq_name, 31, "mtipq%d", dd->instance); + + dd->isr_workq = create_workqueue(dd->workq_name); + if (!dd->isr_workq) { + dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance); + rv = -ENOMEM; + goto block_initialize_err; + } + + memset(cpu_list, 0, sizeof(cpu_list)); + + node_mask = cpumask_of_node(dd->numa_node); + if (!cpumask_empty(node_mask)) { + for_each_cpu(cpu, node_mask) + { + snprintf(&cpu_list[j], 256 - j, "%d ", cpu); + j = strlen(cpu_list); + } + + dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n", + dd->numa_node, + topology_physical_package_id(cpumask_first(node_mask)), + nr_cpus_node(dd->numa_node), + cpu_list); + } else + dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n"); + + dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node); + dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n", + cpu_to_node(dd->isr_binding), dd->isr_binding); + + /* first worker context always runs in ISR */ + dd->work[0].cpu_binding = dd->isr_binding; + dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node); + dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node); + dd->work[3].cpu_binding = dd->work[0].cpu_binding; + dd->work[4].cpu_binding = dd->work[1].cpu_binding; + dd->work[5].cpu_binding = dd->work[2].cpu_binding; + dd->work[6].cpu_binding = dd->work[2].cpu_binding; + dd->work[7].cpu_binding = dd->work[1].cpu_binding; + + /* Log the bindings */ + for_each_present_cpu(cpu) { + memset(cpu_list, 0, sizeof(cpu_list)); + for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) { + if (dd->work[i].cpu_binding == cpu) { + snprintf(&cpu_list[j], 256 - j, "%d ", i); + j = strlen(cpu_list); + } + } + if (j) + dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list); + } + + INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0); + INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1); + INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2); + INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3); + INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4); + INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5); + INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6); + INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7); + + pci_set_master(pdev); + rv = pci_enable_msi(pdev); + if (rv) { dev_warn(&pdev->dev, "Unable to enable MSI interrupt.\n"); goto block_initialize_err; } - /* Copy the info we may need later into the private data structure. */ - dd->major = mtip_major; - dd->instance = instance; - dd->pdev = pdev; - /* Initialize the block layer. */ rv = mtip_block_initialize(dd); if (rv < 0) { @@ -4044,11 +4412,25 @@ static int mtip_pci_probe(struct pci_dev *pdev, instance++; if (rv != MTIP_FTL_REBUILD_MAGIC) set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); + else + rv = 0; /* device in rebuild state, return 0 from probe */ + + /* Add to online list even if in ftl rebuild */ + spin_lock_irqsave(&dev_lock, flags); + list_add(&dd->online_list, &online_list); + spin_unlock_irqrestore(&dev_lock, flags); + goto done; block_initialize_err: pci_disable_msi(pdev); - + if (dd->isr_workq) { + flush_workqueue(dd->isr_workq); + destroy_workqueue(dd->isr_workq); + drop_cpu(dd->work[0].cpu_binding); + drop_cpu(dd->work[1].cpu_binding); + drop_cpu(dd->work[2].cpu_binding); + } setmask_err: pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); @@ -4071,9 +4453,15 @@ static void mtip_pci_remove(struct pci_dev *pdev) { struct driver_data *dd = pci_get_drvdata(pdev); int counter = 0; + unsigned long flags; set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); + spin_lock_irqsave(&dev_lock, flags); + list_del_init(&dd->online_list); + list_add(&dd->remove_list, &removing_list); + spin_unlock_irqrestore(&dev_lock, flags); + if (mtip_check_surprise_removal(pdev)) { while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { counter++; @@ -4089,8 +4477,20 @@ static void mtip_pci_remove(struct pci_dev *pdev) /* Clean up the block layer. */ mtip_block_remove(dd); + if (dd->isr_workq) { + flush_workqueue(dd->isr_workq); + destroy_workqueue(dd->isr_workq); + drop_cpu(dd->work[0].cpu_binding); + drop_cpu(dd->work[1].cpu_binding); + drop_cpu(dd->work[2].cpu_binding); + } + pci_disable_msi(pdev); + spin_lock_irqsave(&dev_lock, flags); + list_del_init(&dd->remove_list); + spin_unlock_irqrestore(&dev_lock, flags); + kfree(dd); pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); } @@ -4238,6 +4638,11 @@ static int __init mtip_init(void) pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); + spin_lock_init(&dev_lock); + + INIT_LIST_HEAD(&online_list); + INIT_LIST_HEAD(&removing_list); + /* Allocate a major block device number to use with this driver. */ error = register_blkdev(0, MTIP_DRV_NAME); if (error <= 0) { @@ -4247,11 +4652,18 @@ static int __init mtip_init(void) } mtip_major = error; - if (!dfs_parent) { - dfs_parent = debugfs_create_dir("rssd", NULL); - if (IS_ERR_OR_NULL(dfs_parent)) { - pr_warn("Error creating debugfs parent\n"); - dfs_parent = NULL; + dfs_parent = debugfs_create_dir("rssd", NULL); + if (IS_ERR_OR_NULL(dfs_parent)) { + pr_warn("Error creating debugfs parent\n"); + dfs_parent = NULL; + } + if (dfs_parent) { + dfs_device_status = debugfs_create_file("device_status", + S_IRUGO, dfs_parent, NULL, + &mtip_device_status_fops); + if (IS_ERR_OR_NULL(dfs_device_status)) { + pr_err("Error creating device_status node\n"); + dfs_device_status = NULL; } } diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index b1742640556..8e8334c9dd0 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -129,9 +129,9 @@ enum { MTIP_PF_EH_ACTIVE_BIT = 1, /* error handling */ MTIP_PF_SE_ACTIVE_BIT = 2, /* secure erase */ MTIP_PF_DM_ACTIVE_BIT = 3, /* download microcde */ - MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) | \ - (1 << MTIP_PF_EH_ACTIVE_BIT) | \ - (1 << MTIP_PF_SE_ACTIVE_BIT) | \ + MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) | + (1 << MTIP_PF_EH_ACTIVE_BIT) | + (1 << MTIP_PF_SE_ACTIVE_BIT) | (1 << MTIP_PF_DM_ACTIVE_BIT)), MTIP_PF_SVC_THD_ACTIVE_BIT = 4, @@ -144,9 +144,9 @@ enum { MTIP_DDF_REMOVE_PENDING_BIT = 1, MTIP_DDF_OVER_TEMP_BIT = 2, MTIP_DDF_WRITE_PROTECT_BIT = 3, - MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \ - (1 << MTIP_DDF_SEC_LOCK_BIT) | \ - (1 << MTIP_DDF_OVER_TEMP_BIT) | \ + MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | + (1 << MTIP_DDF_SEC_LOCK_BIT) | + (1 << MTIP_DDF_OVER_TEMP_BIT) | (1 << MTIP_DDF_WRITE_PROTECT_BIT)), MTIP_DDF_CLEANUP_BIT = 5, @@ -164,6 +164,35 @@ struct smart_attr { u8 res[3]; } __packed; +struct mtip_work { + struct work_struct work; + void *port; + int cpu_binding; + u32 completed; +} ____cacheline_aligned_in_smp; + +#define DEFINE_HANDLER(group) \ + void mtip_workq_sdbf##group(struct work_struct *work) \ + { \ + struct mtip_work *w = (struct mtip_work *) work; \ + mtip_workq_sdbfx(w->port, group, w->completed); \ + } + +#define MTIP_TRIM_TIMEOUT_MS 240000 +#define MTIP_MAX_TRIM_ENTRIES 8 +#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8 + +struct mtip_trim_entry { + u32 lba; /* starting lba of region */ + u16 rsvd; /* unused */ + u16 range; /* # of 512b blocks to trim */ +} __packed; + +struct mtip_trim { + /* Array of regions to trim */ + struct mtip_trim_entry entry[MTIP_MAX_TRIM_ENTRIES]; +} __packed; + /* Register Frame Information Structure (FIS), host to device. */ struct host_to_dev_fis { /* @@ -424,7 +453,7 @@ struct mtip_port { */ struct semaphore cmd_slot; /* Spinlock for working around command-issue bug. */ - spinlock_t cmd_issue_lock; + spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; }; /* @@ -447,9 +476,6 @@ struct driver_data { struct mtip_port *port; /* Pointer to the port data structure. */ - /* Tasklet used to process the bottom half of the ISR. */ - struct tasklet_struct tasklet; - unsigned product_type; /* magic value declaring the product type */ unsigned slot_groups; /* number of slot groups the product supports */ @@ -461,6 +487,24 @@ struct driver_data { struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ struct dentry *dfs_node; + + bool trim_supp; /* flag indicating trim support */ + + int numa_node; /* NUMA support */ + + char workq_name[32]; + + struct workqueue_struct *isr_workq; + + struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; + + atomic_t irq_workers_active; + + int isr_binding; + + struct list_head online_list; /* linkage for online list */ + + struct list_head remove_list; /* linkage for removing list */ }; #endif diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 043ddcca4ab..037288e7874 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -98,6 +98,7 @@ static const char *nbdcmd_to_ascii(int cmd) case NBD_CMD_READ: return "read"; case NBD_CMD_WRITE: return "write"; case NBD_CMD_DISC: return "disconnect"; + case NBD_CMD_FLUSH: return "flush"; case NBD_CMD_TRIM: return "trim/discard"; } return "invalid"; @@ -244,8 +245,15 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) request.magic = htonl(NBD_REQUEST_MAGIC); request.type = htonl(nbd_cmd(req)); - request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); - request.len = htonl(size); + + if (nbd_cmd(req) == NBD_CMD_FLUSH) { + /* Other values are reserved for FLUSH requests. */ + request.from = 0; + request.len = 0; + } else { + request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); + request.len = htonl(size); + } memcpy(request.handle, &req, sizeof(req)); dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n", @@ -482,6 +490,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) } } + if (req->cmd_flags & REQ_FLUSH) { + BUG_ON(unlikely(blk_rq_sectors(req))); + nbd_cmd(req) = NBD_CMD_FLUSH; + } + req->errors = 0; mutex_lock(&nbd->tx_lock); @@ -551,6 +564,7 @@ static int nbd_thread(void *data) */ static void do_nbd_request(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) { struct request *req; @@ -595,12 +609,20 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, struct request sreq; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); + if (!nbd->sock) + return -EINVAL; + mutex_unlock(&nbd->tx_lock); + fsync_bdev(bdev); + mutex_lock(&nbd->tx_lock); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; nbd_cmd(&sreq) = NBD_CMD_DISC; + + /* Check again after getting mutex back. */ if (!nbd->sock) return -EINVAL; + nbd_send_req(nbd, &sreq); return 0; } @@ -614,6 +636,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_clear_que(nbd); BUG_ON(!list_empty(&nbd->queue_head)); BUG_ON(!list_empty(&nbd->waiting_queue)); + kill_bdev(bdev); if (file) fput(file); return 0; @@ -625,7 +648,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return -EBUSY; file = fget(arg); if (file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); if (S_ISSOCK(inode->i_mode)) { nbd->file = file; nbd->sock = SOCKET_I(inode); @@ -681,9 +704,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, mutex_unlock(&nbd->tx_lock); + if (nbd->flags & NBD_FLAG_READ_ONLY) + set_device_ro(bdev, true); if (nbd->flags & NBD_FLAG_SEND_TRIM) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); + if (nbd->flags & NBD_FLAG_SEND_FLUSH) + blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + else + blk_queue_flush(nbd->disk->queue, 0); thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); if (IS_ERR(thread)) { @@ -702,9 +731,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd->file = NULL; nbd_clear_que(nbd); dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); + kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); + set_device_ro(bdev, false); if (file) fput(file); + nbd->flags = 0; nbd->bytesize = 0; bdev->bd_inode->i_size = 0; set_capacity(nbd->disk, 0); @@ -824,6 +856,8 @@ static int __init nbd_init(void) disk->queue->limits.discard_granularity = 512; disk->queue->limits.max_discard_sectors = UINT_MAX; disk->queue->limits.discard_zeroes_data = 0; + blk_queue_max_hw_sectors(disk->queue, 65536); + disk->queue->limits.max_sectors = 256; } if (register_blkdev(NBD_MAJOR, "nbd")) { diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 07fb2dfaae1..9dcefe40380 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -135,6 +135,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, @@ -237,7 +238,8 @@ static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, *fn = special_completion; return CMD_CTX_INVALID; } - *fn = info[cmdid].fn; + if (fn) + *fn = info[cmdid].fn; ctx = info[cmdid].ctx; info[cmdid].fn = special_completion; info[cmdid].ctx = CMD_CTX_COMPLETED; @@ -335,6 +337,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->offset = offsetof(struct nvme_iod, sg[nseg]); iod->npages = -1; iod->length = nbytes; + iod->nents = 0; } return iod; @@ -375,7 +378,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + if (iod->nents) + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(dev, iod); if (status) { @@ -589,7 +593,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); if (result < 0) - goto free_iod; + goto free_cmdid; length = result; cmnd->rw.command_id = cmdid; @@ -609,6 +613,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; + free_cmdid: + free_cmdid(nvmeq, cmdid, NULL); free_iod: nvme_free_iod(nvmeq->dev, iod); nomem: @@ -835,8 +841,8 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, return nvme_submit_admin_cmd(dev, &c, NULL); } -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, - unsigned nsid, dma_addr_t dma_addr) +static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -846,7 +852,7 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_admin_cmd(dev, &c, result); } static int nvme_set_features(struct nvme_dev *dev, unsigned fid, @@ -906,6 +912,10 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) spin_lock_irq(&nvmeq->q_lock); nvme_cancel_ios(nvmeq, false); + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + bio_endio(bio, -EIO); + } spin_unlock_irq(&nvmeq->q_lock); irq_set_affinity_hint(vector, NULL); @@ -1230,12 +1240,17 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, if (length != cmd.data_len) status = -ENOMEM; else - status = nvme_submit_admin_cmd(dev, &c, NULL); + status = nvme_submit_admin_cmd(dev, &c, &cmd.result); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); nvme_free_iod(dev, iod); } + + if (!status && copy_to_user(&ucmd->result, &cmd.result, + sizeof(cmd.result))) + status = -EFAULT; + return status; } @@ -1523,9 +1538,9 @@ static int nvme_dev_add(struct nvme_dev *dev) continue; res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096); + dma_addr + 4096, NULL); if (res) - continue; + memset(mem + 4096, 0, 4096); ns = nvme_alloc_ns(dev, i, mem, mem + 4096); if (ns) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 2e7de7a59bf..e0588c6dd86 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2648,7 +2648,7 @@ static int pkt_seq_show(struct seq_file *m, void *p) static int pkt_seq_open(struct inode *inode, struct file *file) { - return single_open(file, pkt_seq_show, PDE(inode)->data); + return single_open(file, pkt_seq_show, PDE_DATA(inode)); } static const struct file_operations pkt_proc_fops = { diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 75e112d6600..06a2e53e5f3 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -525,7 +525,7 @@ static int ps3vram_proc_show(struct seq_file *m, void *v) static int ps3vram_proc_open(struct inode *inode, struct file *file) { - return single_open(file, ps3vram_proc_show, PDE(inode)->data); + return single_open(file, ps3vram_proc_show, PDE_DATA(inode)); } static const struct file_operations ps3vram_proc_fops = { diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 89576a0b3f2..b7b7a88d9f6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -52,9 +52,12 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) -/* It might be useful to have this defined elsewhere too */ +/* It might be useful to have these defined elsewhere */ -#define U64_MAX ((u64) (~0ULL)) +#define U8_MAX ((u8) (~0U)) +#define U16_MAX ((u16) (~0U)) +#define U32_MAX ((u32) (~0U)) +#define U64_MAX ((u64) (~0ULL)) #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -66,7 +69,6 @@ (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ -#define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" @@ -93,8 +95,6 @@ #define DEV_NAME_LEN 32 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) -#define RBD_READ_ONLY_DEFAULT false - /* * block device image metadata (in-memory version) */ @@ -119,16 +119,33 @@ struct rbd_image_header { * An rbd image specification. * * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely - * identify an image. + * identify an image. Each rbd_dev structure includes a pointer to + * an rbd_spec structure that encapsulates this identity. + * + * Each of the id's in an rbd_spec has an associated name. For a + * user-mapped image, the names are supplied and the id's associated + * with them are looked up. For a layered image, a parent image is + * defined by the tuple, and the names are looked up. + * + * An rbd_dev structure contains a parent_spec pointer which is + * non-null if the image it represents is a child in a layered + * image. This pointer will refer to the rbd_spec structure used + * by the parent rbd_dev for its own identity (i.e., the structure + * is shared between the parent and child). + * + * Since these structures are populated once, during the discovery + * phase of image construction, they are effectively immutable so + * we make no effort to synchronize access to them. + * + * Note that code herein does not assume the image name is known (it + * could be a null pointer). */ struct rbd_spec { u64 pool_id; char *pool_name; char *image_id; - size_t image_id_len; char *image_name; - size_t image_name_len; u64 snap_id; char *snap_name; @@ -136,10 +153,6 @@ struct rbd_spec { struct kref kref; }; -struct rbd_options { - bool read_only; -}; - /* * an instance of the client. multiple devices may share an rbd client. */ @@ -149,37 +162,76 @@ struct rbd_client { struct list_head node; }; -/* - * a request completion status - */ -struct rbd_req_status { - int done; - int rc; - u64 bytes; +struct rbd_img_request; +typedef void (*rbd_img_callback_t)(struct rbd_img_request *); + +#define BAD_WHICH U32_MAX /* Good which or bad which, which? */ + +struct rbd_obj_request; +typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); + +enum obj_request_type { + OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; -/* - * a collection of requests - */ -struct rbd_req_coll { - int total; - int num_done; +struct rbd_obj_request { + const char *object_name; + u64 offset; /* object start byte */ + u64 length; /* bytes from offset */ + + struct rbd_img_request *img_request; + struct list_head links; /* img_request->obj_requests */ + u32 which; /* posn image request list */ + + enum obj_request_type type; + union { + struct bio *bio_list; + struct { + struct page **pages; + u32 page_count; + }; + }; + + struct ceph_osd_request *osd_req; + + u64 xferred; /* bytes transferred */ + u64 version; + int result; + atomic_t done; + + rbd_obj_callback_t callback; + struct completion completion; + struct kref kref; - struct rbd_req_status status[0]; }; -/* - * a single io request - */ -struct rbd_request { - struct request *rq; /* blk layer request */ - struct bio *bio; /* cloned bio */ - struct page **pages; /* list of used pages */ - u64 len; - int coll_index; - struct rbd_req_coll *coll; +struct rbd_img_request { + struct request *rq; + struct rbd_device *rbd_dev; + u64 offset; /* starting image byte offset */ + u64 length; /* byte count from offset */ + bool write_request; /* false for read */ + union { + struct ceph_snap_context *snapc; /* for writes */ + u64 snap_id; /* for reads */ + }; + spinlock_t completion_lock;/* protects next_completion */ + u32 next_completion; + rbd_img_callback_t callback; + + u32 obj_request_count; + struct list_head obj_requests; /* rbd_obj_request structs */ + + struct kref kref; }; +#define for_each_obj_request(ireq, oreq) \ + list_for_each_entry(oreq, &(ireq)->obj_requests, links) +#define for_each_obj_request_from(ireq, oreq) \ + list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) +#define for_each_obj_request_safe(ireq, oreq, n) \ + list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) + struct rbd_snap { struct device dev; const char *name; @@ -209,16 +261,18 @@ struct rbd_device { char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ - spinlock_t lock; /* queue lock */ + spinlock_t lock; /* queue, flags, open_count */ struct rbd_image_header header; - bool exists; + unsigned long flags; /* possibly lock protected */ struct rbd_spec *spec; char *header_name; + struct ceph_file_layout layout; + struct ceph_osd_event *watch_event; - struct ceph_osd_request *watch_request; + struct rbd_obj_request *watch_request; struct rbd_spec *parent_spec; u64 parent_overlap; @@ -235,7 +289,19 @@ struct rbd_device { /* sysfs related */ struct device dev; - unsigned long open_count; + unsigned long open_count; /* protected by lock */ +}; + +/* + * Flag bits for rbd_dev->flags. If atomicity is required, + * rbd_dev->lock is used to protect access. + * + * Currently, only the "removing" flag (which is coupled with the + * "open_count" field) requires atomic access. + */ +enum rbd_dev_flags { + RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ + RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ }; static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ @@ -277,6 +343,33 @@ static struct device rbd_root_dev = { .release = rbd_root_dev_release, }; +static __printf(2, 3) +void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (!rbd_dev) + printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); + else if (rbd_dev->disk) + printk(KERN_WARNING "%s: %s: %pV\n", + RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_name) + printk(KERN_WARNING "%s: image %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_id) + printk(KERN_WARNING "%s: id %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); + else /* punt */ + printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", + RBD_DRV_NAME, rbd_dev, &vaf); + va_end(args); +} + #ifdef RBD_DEBUG #define rbd_assert(expr) \ if (unlikely(!(expr))) { \ @@ -296,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + bool removing = false; if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) return -EROFS; + spin_lock_irq(&rbd_dev->lock); + if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) + removing = true; + else + rbd_dev->open_count++; + spin_unlock_irq(&rbd_dev->lock); + if (removing) + return -ENOENT; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); (void) get_device(&rbd_dev->dev); set_device_ro(bdev, rbd_dev->mapping.read_only); - rbd_dev->open_count++; mutex_unlock(&ctl_mutex); return 0; @@ -312,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) static int rbd_release(struct gendisk *disk, fmode_t mode) { struct rbd_device *rbd_dev = disk->private_data; + unsigned long open_count_before; + + spin_lock_irq(&rbd_dev->lock); + open_count_before = rbd_dev->open_count--; + spin_unlock_irq(&rbd_dev->lock); + rbd_assert(open_count_before > 0); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbd_assert(rbd_dev->open_count > 0); - rbd_dev->open_count--; put_device(&rbd_dev->dev); mutex_unlock(&ctl_mutex); @@ -337,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) struct rbd_client *rbdc; int ret = -ENOMEM; - dout("rbd_client_create\n"); + dout("%s:\n", __func__); rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); if (!rbdc) goto out_opt; @@ -361,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) spin_unlock(&rbd_client_list_lock); mutex_unlock(&ctl_mutex); + dout("%s: rbdc %p\n", __func__, rbdc); - dout("rbd_client_create created %p\n", rbdc); return rbdc; out_err: @@ -373,6 +479,8 @@ out_mutex: out_opt: if (ceph_opts) ceph_destroy_options(ceph_opts); + dout("%s: error %d\n", __func__, ret); + return ERR_PTR(ret); } @@ -426,6 +534,12 @@ static match_table_t rbd_opts_tokens = { {-1, NULL} }; +struct rbd_options { + bool read_only; +}; + +#define RBD_READ_ONLY_DEFAULT false + static int parse_rbd_opts_token(char *c, void *private) { struct rbd_options *rbd_opts = private; @@ -493,7 +607,7 @@ static void rbd_client_release(struct kref *kref) { struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); - dout("rbd_release_client %p\n", rbdc); + dout("%s: rbdc %p\n", __func__, rbdc); spin_lock(&rbd_client_list_lock); list_del(&rbdc->node); spin_unlock(&rbd_client_list_lock); @@ -512,18 +626,6 @@ static void rbd_put_client(struct rbd_client *rbdc) kref_put(&rbdc->kref, rbd_client_release); } -/* - * Destroy requests collection - */ -static void rbd_coll_release(struct kref *kref) -{ - struct rbd_req_coll *coll = - container_of(kref, struct rbd_req_coll, kref); - - dout("rbd_coll_release %p\n", coll); - kfree(coll); -} - static bool rbd_image_format_valid(u32 image_format) { return image_format == 1 || image_format == 2; @@ -707,7 +809,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) goto done; rbd_dev->mapping.read_only = true; } - rbd_dev->exists = true; + set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + done: return ret; } @@ -724,7 +827,7 @@ static void rbd_header_free(struct rbd_image_header *header) header->snapc = NULL; } -static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) +static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) { char *name; u64 segment; @@ -767,23 +870,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, return length; } -static int rbd_get_num_segments(struct rbd_image_header *header, - u64 ofs, u64 len) -{ - u64 start_seg; - u64 end_seg; - - if (!len) - return 0; - if (len - 1 > U64_MAX - ofs) - return -ERANGE; - - start_seg = ofs >> header->obj_order; - end_seg = (ofs + len - 1) >> header->obj_order; - - return end_seg - start_seg + 1; -} - /* * returns the size of an object in the image */ @@ -949,8 +1035,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, unsigned int bi_size; struct bio *bio; - if (!bi) + if (!bi) { + rbd_warn(NULL, "bio_chain exhausted with %u left", len); goto out_err; /* EINVAL; ran out of bio's */ + } bi_size = min_t(unsigned int, bi->bi_size - off, len); bio = bio_clone_range(bi, off, bi_size, gfpmask); if (!bio) @@ -976,399 +1064,735 @@ out_err: return NULL; } -/* - * helpers for osd request op vectors. - */ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, - int opcode, u32 payload_len) +static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { - struct ceph_osd_req_op *ops; + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); + kref_get(&obj_request->kref); +} - ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); - if (!ops) +static void rbd_obj_request_destroy(struct kref *kref); +static void rbd_obj_request_put(struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request != NULL); + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); + kref_put(&obj_request->kref, rbd_obj_request_destroy); +} + +static void rbd_img_request_get(struct rbd_img_request *img_request) +{ + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_get(&img_request->kref); +} + +static void rbd_img_request_destroy(struct kref *kref); +static void rbd_img_request_put(struct rbd_img_request *img_request) +{ + rbd_assert(img_request != NULL); + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_put(&img_request->kref, rbd_img_request_destroy); +} + +static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request->img_request == NULL); + + rbd_obj_request_get(obj_request); + obj_request->img_request = img_request; + obj_request->which = img_request->obj_request_count; + rbd_assert(obj_request->which != BAD_WHICH); + img_request->obj_request_count++; + list_add_tail(&obj_request->links, &img_request->obj_requests); + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); +} + +static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request->which != BAD_WHICH); + + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); + list_del(&obj_request->links); + rbd_assert(img_request->obj_request_count > 0); + img_request->obj_request_count--; + rbd_assert(obj_request->which == img_request->obj_request_count); + obj_request->which = BAD_WHICH; + rbd_assert(obj_request->img_request == img_request); + obj_request->img_request = NULL; + obj_request->callback = NULL; + rbd_obj_request_put(obj_request); +} + +static bool obj_request_type_valid(enum obj_request_type type) +{ + switch (type) { + case OBJ_REQUEST_NODATA: + case OBJ_REQUEST_BIO: + case OBJ_REQUEST_PAGES: + return true; + default: + return false; + } +} + +static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) +{ + struct ceph_osd_req_op *op; + va_list args; + size_t size; + + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) return NULL; + op->op = opcode; + va_start(args, opcode); + switch (opcode) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + /* rbd_osd_req_op_create(READ, offset, length) */ + /* rbd_osd_req_op_create(WRITE, offset, length) */ + op->extent.offset = va_arg(args, u64); + op->extent.length = va_arg(args, u64); + if (opcode == CEPH_OSD_OP_WRITE) + op->payload_len = op->extent.length; + break; + case CEPH_OSD_OP_STAT: + break; + case CEPH_OSD_OP_CALL: + /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ + op->cls.class_name = va_arg(args, char *); + size = strlen(op->cls.class_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.class_len = size; + op->payload_len = size; + + op->cls.method_name = va_arg(args, char *); + size = strlen(op->cls.method_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.method_len = size; + op->payload_len += size; + + op->cls.argc = 0; + op->cls.indata = va_arg(args, void *); + size = va_arg(args, size_t); + rbd_assert(size <= (size_t) U32_MAX); + op->cls.indata_len = (u32) size; + op->payload_len += size; + break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ + /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ + op->watch.cookie = va_arg(args, u64); + op->watch.ver = va_arg(args, u64); + op->watch.ver = cpu_to_le64(op->watch.ver); + if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) + op->watch.flag = (u8) 1; + break; + default: + rbd_warn(NULL, "unsupported opcode %hu\n", opcode); + kfree(op); + op = NULL; + break; + } + va_end(args); - ops[0].op = opcode; + return op; +} - /* - * op extent offset and length will be set later on - * in calc_raw_layout() - */ - ops[0].payload_len = payload_len; +static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) +{ + kfree(op); +} + +static int rbd_obj_request_submit(struct ceph_osd_client *osdc, + struct rbd_obj_request *obj_request) +{ + dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); + + return ceph_osdc_start_request(osdc, obj_request->osd_req, false); +} - return ops; +static void rbd_img_request_complete(struct rbd_img_request *img_request) +{ + dout("%s: img %p\n", __func__, img_request); + if (img_request->callback) + img_request->callback(img_request); + else + rbd_img_request_put(img_request); } -static void rbd_destroy_ops(struct ceph_osd_req_op *ops) +/* Caller is responsible for rbd_obj_request_destroy(obj_request) */ + +static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) { - kfree(ops); + dout("%s: obj %p\n", __func__, obj_request); + + return wait_for_completion_interruptible(&obj_request->completion); } -static void rbd_coll_end_req_index(struct request *rq, - struct rbd_req_coll *coll, - int index, - int ret, u64 len) +static void obj_request_done_init(struct rbd_obj_request *obj_request) { - struct request_queue *q; - int min, max, i; + atomic_set(&obj_request->done, 0); + smp_wmb(); +} - dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", - coll, index, ret, (unsigned long long) len); +static void obj_request_done_set(struct rbd_obj_request *obj_request) +{ + int done; - if (!rq) - return; + done = atomic_inc_return(&obj_request->done); + if (done > 1) { + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev; - if (!coll) { - blk_end_request(rq, ret, len); - return; + rbd_dev = img_request ? img_request->rbd_dev : NULL; + rbd_warn(rbd_dev, "obj_request %p was already done\n", + obj_request); } +} - q = rq->q; - - spin_lock_irq(q->queue_lock); - coll->status[index].done = 1; - coll->status[index].rc = ret; - coll->status[index].bytes = len; - max = min = coll->num_done; - while (max < coll->total && coll->status[max].done) - max++; - - for (i = min; i<max; i++) { - __blk_end_request(rq, coll->status[i].rc, - coll->status[i].bytes); - coll->num_done++; - kref_put(&coll->kref, rbd_coll_release); +static bool obj_request_done_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return atomic_read(&obj_request->done) != 0; +} + +static void +rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, + obj_request, obj_request->img_request, obj_request->result, + obj_request->xferred, obj_request->length); + /* + * ENOENT means a hole in the image. We zero-fill the + * entire length of the request. A short read also implies + * zero-fill to the end of the request. Either way we + * update the xferred count to indicate the whole request + * was satisfied. + */ + BUG_ON(obj_request->type != OBJ_REQUEST_BIO); + if (obj_request->result == -ENOENT) { + zero_bio_chain(obj_request->bio_list, 0); + obj_request->result = 0; + obj_request->xferred = obj_request->length; + } else if (obj_request->xferred < obj_request->length && + !obj_request->result) { + zero_bio_chain(obj_request->bio_list, obj_request->xferred); + obj_request->xferred = obj_request->length; } - spin_unlock_irq(q->queue_lock); + obj_request_done_set(obj_request); +} + +static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p cb %p\n", __func__, obj_request, + obj_request->callback); + if (obj_request->callback) + obj_request->callback(obj_request); + else + complete_all(&obj_request->completion); +} + +static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p\n", __func__, obj_request); + obj_request_done_set(obj_request); +} + +static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, + obj_request->result, obj_request->xferred, obj_request->length); + if (obj_request->img_request) + rbd_img_obj_request_read_callback(obj_request); + else + obj_request_done_set(obj_request); } -static void rbd_coll_end_req(struct rbd_request *req, - int ret, u64 len) +static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) { - rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, obj_request->length); + /* + * There is no such thing as a successful short write. + * Our xferred value is the number of bytes transferred + * back. Set it to our originally-requested length. + */ + obj_request->xferred = obj_request->length; + obj_request_done_set(obj_request); } /* - * Send ceph osd request + * For a simple stat call there's nothing to do. We'll do more if + * this is part of a write sequence for a layered image. */ -static int rbd_do_request(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 snapid, - const char *object_name, u64 ofs, u64 len, - struct bio *bio, - struct page **pages, - int num_pages, - int flags, - struct ceph_osd_req_op *ops, - struct rbd_req_coll *coll, - int coll_index, - void (*rbd_cb)(struct ceph_osd_request *req, - struct ceph_msg *msg), - struct ceph_osd_request **linger_req, - u64 *ver) -{ - struct ceph_osd_request *req; - struct ceph_file_layout *layout; - int ret; - u64 bno; - struct timespec mtime = CURRENT_TIME; - struct rbd_request *req_data; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_client *osdc; +static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p\n", __func__, obj_request); + obj_request_done_set(obj_request); +} - req_data = kzalloc(sizeof(*req_data), GFP_NOIO); - if (!req_data) { - if (coll) - rbd_coll_end_req_index(rq, coll, coll_index, - -ENOMEM, len); - return -ENOMEM; +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, + struct ceph_msg *msg) +{ + struct rbd_obj_request *obj_request = osd_req->r_priv; + u16 opcode; + + dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); + rbd_assert(osd_req == obj_request->osd_req); + rbd_assert(!!obj_request->img_request ^ + (obj_request->which == BAD_WHICH)); + + if (osd_req->r_result < 0) + obj_request->result = osd_req->r_result; + obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); + + WARN_ON(osd_req->r_num_ops != 1); /* For now */ + + /* + * We support a 64-bit length, but ultimately it has to be + * passed to blk_end_request(), which takes an unsigned int. + */ + obj_request->xferred = osd_req->r_reply_op_len[0]; + rbd_assert(obj_request->xferred < (u64) UINT_MAX); + opcode = osd_req->r_request_ops[0].op; + switch (opcode) { + case CEPH_OSD_OP_READ: + rbd_osd_read_callback(obj_request); + break; + case CEPH_OSD_OP_WRITE: + rbd_osd_write_callback(obj_request); + break; + case CEPH_OSD_OP_STAT: + rbd_osd_stat_callback(obj_request); + break; + case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + rbd_osd_trivial_callback(obj_request); + break; + default: + rbd_warn(NULL, "%s: unsupported op %hu\n", + obj_request->object_name, (unsigned short) opcode); + break; } - if (coll) { - req_data->coll = coll; - req_data->coll_index = coll_index; + if (obj_request_done_test(obj_request)) + rbd_obj_request_complete(obj_request); +} + +static struct ceph_osd_request *rbd_osd_req_create( + struct rbd_device *rbd_dev, + bool write_request, + struct rbd_obj_request *obj_request, + struct ceph_osd_req_op *op) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_snap_context *snapc = NULL; + struct ceph_osd_client *osdc; + struct ceph_osd_request *osd_req; + struct timespec now; + struct timespec *mtime; + u64 snap_id = CEPH_NOSNAP; + u64 offset = obj_request->offset; + u64 length = obj_request->length; + + if (img_request) { + rbd_assert(img_request->write_request == write_request); + if (img_request->write_request) + snapc = img_request->snapc; + else + snap_id = img_request->snap_id; } - dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", - object_name, (unsigned long long) ofs, - (unsigned long long) len, coll, coll_index); + /* Allocate and initialize the request, for the single op */ osdc = &rbd_dev->rbd_client->client->osdc; - req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, - false, GFP_NOIO, pages, bio); - if (!req) { - ret = -ENOMEM; - goto done_pages; + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); + if (!osd_req) + return NULL; /* ENOMEM */ + + rbd_assert(obj_request_type_valid(obj_request->type)); + switch (obj_request->type) { + case OBJ_REQUEST_NODATA: + break; /* Nothing to do */ + case OBJ_REQUEST_BIO: + rbd_assert(obj_request->bio_list != NULL); + osd_req->r_bio = obj_request->bio_list; + break; + case OBJ_REQUEST_PAGES: + osd_req->r_pages = obj_request->pages; + osd_req->r_num_pages = obj_request->page_count; + osd_req->r_page_alignment = offset & ~PAGE_MASK; + break; + } + + if (write_request) { + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + now = CURRENT_TIME; + mtime = &now; + } else { + osd_req->r_flags = CEPH_OSD_FLAG_READ; + mtime = NULL; /* not needed for reads */ + offset = 0; /* These are not used... */ + length = 0; /* ...for osd read requests */ } - req->r_callback = rbd_cb; + osd_req->r_callback = rbd_osd_req_callback; + osd_req->r_priv = obj_request; - req_data->rq = rq; - req_data->bio = bio; - req_data->pages = pages; - req_data->len = len; + osd_req->r_oid_len = strlen(obj_request->object_name); + rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); + memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - req->r_priv = req_data; + osd_req->r_file_layout = rbd_dev->layout; /* struct */ - reqhead = req->r_request->front.iov_base; - reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); + /* osd_req will get its own reference to snapc (if non-null) */ - strncpy(req->r_oid, object_name, sizeof(req->r_oid)); - req->r_oid_len = strlen(req->r_oid); + ceph_osdc_build_request(osd_req, offset, length, 1, op, + snapc, snap_id, mtime); - layout = &req->r_file_layout; - memset(layout, 0, sizeof(*layout)); - layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_stripe_count = cpu_to_le32(1); - layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, - req, ops); - rbd_assert(ret == 0); + return osd_req; +} - ceph_osdc_build_request(req, ofs, &len, - ops, - snapc, - &mtime, - req->r_oid, req->r_oid_len); +static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) +{ + ceph_osdc_put_request(osd_req); +} - if (linger_req) { - ceph_osdc_set_request_linger(osdc, req); - *linger_req = req; - } +/* object_name is assumed to be a non-null pointer and NUL-terminated */ - ret = ceph_osdc_start_request(osdc, req, false); - if (ret < 0) - goto done_err; - - if (!rbd_cb) { - ret = ceph_osdc_wait_request(osdc, req); - if (ver) - *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%llu\n", - (unsigned long long) - le64_to_cpu(req->r_reassert_version.version)); - ceph_osdc_put_request(req); +static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, + u64 offset, u64 length, + enum obj_request_type type) +{ + struct rbd_obj_request *obj_request; + size_t size; + char *name; + + rbd_assert(obj_request_type_valid(type)); + + size = strlen(object_name) + 1; + obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); + if (!obj_request) + return NULL; + + name = (char *)(obj_request + 1); + obj_request->object_name = memcpy(name, object_name, size); + obj_request->offset = offset; + obj_request->length = length; + obj_request->which = BAD_WHICH; + obj_request->type = type; + INIT_LIST_HEAD(&obj_request->links); + obj_request_done_init(obj_request); + init_completion(&obj_request->completion); + kref_init(&obj_request->kref); + + dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, + offset, length, (int)type, obj_request); + + return obj_request; +} + +static void rbd_obj_request_destroy(struct kref *kref) +{ + struct rbd_obj_request *obj_request; + + obj_request = container_of(kref, struct rbd_obj_request, kref); + + dout("%s: obj %p\n", __func__, obj_request); + + rbd_assert(obj_request->img_request == NULL); + rbd_assert(obj_request->which == BAD_WHICH); + + if (obj_request->osd_req) + rbd_osd_req_destroy(obj_request->osd_req); + + rbd_assert(obj_request_type_valid(obj_request->type)); + switch (obj_request->type) { + case OBJ_REQUEST_NODATA: + break; /* Nothing to do */ + case OBJ_REQUEST_BIO: + if (obj_request->bio_list) + bio_chain_put(obj_request->bio_list); + break; + case OBJ_REQUEST_PAGES: + if (obj_request->pages) + ceph_release_page_vector(obj_request->pages, + obj_request->page_count); + break; } - return ret; -done_err: - bio_chain_put(req_data->bio); - ceph_osdc_put_request(req); -done_pages: - rbd_coll_end_req(req_data, ret, len); - kfree(req_data); - return ret; + kfree(obj_request); } /* - * Ceph osd op callback + * Caller is responsible for filling in the list of object requests + * that comprises the image request, and the Linux request pointer + * (if there is one). */ -static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) -{ - struct rbd_request *req_data = req->r_priv; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; - __s32 rc; - u64 bytes; - int read_op; - - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); - - dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", - (unsigned long long) bytes, read_op, (int) rc); - - if (rc == -ENOENT && read_op) { - zero_bio_chain(req_data->bio, 0); - rc = 0; - } else if (rc == 0 && read_op && bytes < req_data->len) { - zero_bio_chain(req_data->bio, bytes); - bytes = req_data->len; - } +static struct rbd_img_request *rbd_img_request_create( + struct rbd_device *rbd_dev, + u64 offset, u64 length, + bool write_request) +{ + struct rbd_img_request *img_request; + struct ceph_snap_context *snapc = NULL; - rbd_coll_end_req(req_data, rc, bytes); + img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); + if (!img_request) + return NULL; - if (req_data->bio) - bio_chain_put(req_data->bio); + if (write_request) { + down_read(&rbd_dev->header_rwsem); + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + up_read(&rbd_dev->header_rwsem); + if (WARN_ON(!snapc)) { + kfree(img_request); + return NULL; /* Shouldn't happen */ + } + } - ceph_osdc_put_request(req); - kfree(req_data); + img_request->rq = NULL; + img_request->rbd_dev = rbd_dev; + img_request->offset = offset; + img_request->length = length; + img_request->write_request = write_request; + if (write_request) + img_request->snapc = snapc; + else + img_request->snap_id = rbd_dev->spec->snap_id; + spin_lock_init(&img_request->completion_lock); + img_request->next_completion = 0; + img_request->callback = NULL; + img_request->obj_request_count = 0; + INIT_LIST_HEAD(&img_request->obj_requests); + kref_init(&img_request->kref); + + rbd_img_request_get(img_request); /* Avoid a warning */ + rbd_img_request_put(img_request); /* TEMPORARY */ + + dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, + write_request ? "write" : "read", offset, length, + img_request); + + return img_request; } -static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +static void rbd_img_request_destroy(struct kref *kref) { - ceph_osdc_put_request(req); + struct rbd_img_request *img_request; + struct rbd_obj_request *obj_request; + struct rbd_obj_request *next_obj_request; + + img_request = container_of(kref, struct rbd_img_request, kref); + + dout("%s: img %p\n", __func__, img_request); + + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_img_obj_request_del(img_request, obj_request); + rbd_assert(img_request->obj_request_count == 0); + + if (img_request->write_request) + ceph_put_snap_context(img_request->snapc); + + kfree(img_request); } -/* - * Do a synchronous ceph osd operation - */ -static int rbd_req_sync_op(struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 snapid, - int flags, - struct ceph_osd_req_op *ops, - const char *object_name, - u64 ofs, u64 inbound_size, - char *inbound, - struct ceph_osd_request **linger_req, - u64 *ver) +static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, + struct bio *bio_list) { - int ret; - struct page **pages; - int num_pages; - - rbd_assert(ops != NULL); + struct rbd_device *rbd_dev = img_request->rbd_dev; + struct rbd_obj_request *obj_request = NULL; + struct rbd_obj_request *next_obj_request; + unsigned int bio_offset; + u64 image_offset; + u64 resid; + u16 opcode; + + dout("%s: img %p bio %p\n", __func__, img_request, bio_list); + + opcode = img_request->write_request ? CEPH_OSD_OP_WRITE + : CEPH_OSD_OP_READ; + bio_offset = 0; + image_offset = img_request->offset; + rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); + resid = img_request->length; + rbd_assert(resid > 0); + while (resid) { + const char *object_name; + unsigned int clone_size; + struct ceph_osd_req_op *op; + u64 offset; + u64 length; + + object_name = rbd_segment_name(rbd_dev, image_offset); + if (!object_name) + goto out_unwind; + offset = rbd_segment_offset(rbd_dev, image_offset); + length = rbd_segment_length(rbd_dev, image_offset, resid); + obj_request = rbd_obj_request_create(object_name, + offset, length, + OBJ_REQUEST_BIO); + kfree(object_name); /* object request has its own copy */ + if (!obj_request) + goto out_unwind; + + rbd_assert(length <= (u64) UINT_MAX); + clone_size = (unsigned int) length; + obj_request->bio_list = bio_chain_clone_range(&bio_list, + &bio_offset, clone_size, + GFP_ATOMIC); + if (!obj_request->bio_list) + goto out_partial; - num_pages = calc_pages_for(ofs, inbound_size); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); + /* + * Build up the op to use in building the osd + * request. Note that the contents of the op are + * copied by rbd_osd_req_create(). + */ + op = rbd_osd_req_op_create(opcode, offset, length); + if (!op) + goto out_partial; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, + img_request->write_request, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out_partial; + /* status and version are initially zero-filled */ + + rbd_img_obj_request_add(img_request, obj_request); + + image_offset += length; + resid -= length; + } - ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, - object_name, ofs, inbound_size, NULL, - pages, num_pages, - flags, - ops, - NULL, 0, - NULL, - linger_req, ver); - if (ret < 0) - goto done; + return 0; - if ((flags & CEPH_OSD_FLAG_READ) && inbound) - ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); +out_partial: + rbd_obj_request_put(obj_request); +out_unwind: + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_obj_request_put(obj_request); -done: - ceph_release_page_vector(pages, num_pages); - return ret; + return -ENOMEM; } -/* - * Do an asynchronous ceph osd operation - */ -static int rbd_do_op(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 ofs, u64 len, - struct bio *bio, - struct rbd_req_coll *coll, - int coll_index) -{ - char *seg_name; - u64 seg_ofs; - u64 seg_len; - int ret; - struct ceph_osd_req_op *ops; - u32 payload_len; - int opcode; - int flags; - u64 snapid; - - seg_name = rbd_segment_name(rbd_dev, ofs); - if (!seg_name) - return -ENOMEM; - seg_len = rbd_segment_length(rbd_dev, ofs, len); - seg_ofs = rbd_segment_offset(rbd_dev, ofs); - - if (rq_data_dir(rq) == WRITE) { - opcode = CEPH_OSD_OP_WRITE; - flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; - snapid = CEPH_NOSNAP; - payload_len = seg_len; - } else { - opcode = CEPH_OSD_OP_READ; - flags = CEPH_OSD_FLAG_READ; - snapc = NULL; - snapid = rbd_dev->spec->snap_id; - payload_len = 0; +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + u32 which = obj_request->which; + bool more = true; + + img_request = obj_request->img_request; + + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); + rbd_assert(img_request != NULL); + rbd_assert(img_request->rq != NULL); + rbd_assert(img_request->obj_request_count > 0); + rbd_assert(which != BAD_WHICH); + rbd_assert(which < img_request->obj_request_count); + rbd_assert(which >= img_request->next_completion); + + spin_lock_irq(&img_request->completion_lock); + if (which != img_request->next_completion) + goto out; + + for_each_obj_request_from(img_request, obj_request) { + unsigned int xferred; + int result; + + rbd_assert(more); + rbd_assert(which < img_request->obj_request_count); + + if (!obj_request_done_test(obj_request)) + break; + + rbd_assert(obj_request->xferred <= (u64) UINT_MAX); + xferred = (unsigned int) obj_request->xferred; + result = (int) obj_request->result; + if (result) + rbd_warn(NULL, "obj_request %s result %d xferred %u\n", + img_request->write_request ? "write" : "read", + result, xferred); + + more = blk_end_request(img_request->rq, result, xferred); + which++; } - ret = -ENOMEM; - ops = rbd_create_rw_ops(1, opcode, payload_len); - if (!ops) - goto done; + rbd_assert(more ^ (which == img_request->obj_request_count)); + img_request->next_completion = which; +out: + spin_unlock_irq(&img_request->completion_lock); - /* we've taken care of segment sizes earlier when we - cloned the bios. We should never have a segment - truncated at this point */ - rbd_assert(seg_len == len); - - ret = rbd_do_request(rq, rbd_dev, snapc, snapid, - seg_name, seg_ofs, seg_len, - bio, - NULL, 0, - flags, - ops, - coll, coll_index, - rbd_req_cb, 0, NULL); - - rbd_destroy_ops(ops); -done: - kfree(seg_name); - return ret; + if (!more) + rbd_img_request_complete(img_request); } -/* - * Request sync osd read - */ -static int rbd_req_sync_read(struct rbd_device *rbd_dev, - u64 snapid, - const char *object_name, - u64 ofs, u64 len, - char *buf, - u64 *ver) -{ - struct ceph_osd_req_op *ops; - int ret; +static int rbd_img_request_submit(struct rbd_img_request *img_request) +{ + struct rbd_device *rbd_dev = img_request->rbd_dev; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + struct rbd_obj_request *next_obj_request; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); - if (!ops) - return -ENOMEM; + dout("%s: img %p\n", __func__, img_request); + for_each_obj_request_safe(img_request, obj_request, next_obj_request) { + int ret; - ret = rbd_req_sync_op(rbd_dev, NULL, - snapid, - CEPH_OSD_FLAG_READ, - ops, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_ops(ops); + obj_request->callback = rbd_img_obj_callback; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + return ret; + /* + * The image request has its own reference to each + * of its object requests, so we can safely drop the + * initial one here. + */ + rbd_obj_request_put(obj_request); + } - return ret; + return 0; } -/* - * Request sync osd watch - */ -static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, - u64 ver, - u64 notify_id) +static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, + u64 ver, u64 notify_id) { - struct ceph_osd_req_op *ops; + struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; + struct ceph_osd_client *osdc; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (!ops) + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = notify_id; - ops[0].watch.flag = 0; + ret = -ENOMEM; + op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, - rbd_dev->header_name, 0, 0, NULL, - NULL, 0, - CEPH_OSD_FLAG_READ, - ops, - NULL, 0, - rbd_simple_req_cb, 0, NULL); + osdc = &rbd_dev->rbd_client->client->osdc; + obj_request->callback = rbd_obj_request_put; + ret = rbd_obj_request_submit(osdc, obj_request); +out: + if (ret) + rbd_obj_request_put(obj_request); - rbd_destroy_ops(ops); return ret; } @@ -1381,95 +1805,103 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) if (!rbd_dev) return; - dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", + dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, rbd_dev->header_name, (unsigned long long) notify_id, (unsigned int) opcode); rc = rbd_dev_refresh(rbd_dev, &hver); if (rc) - pr_warning(RBD_DRV_NAME "%d got notification but failed to " - " update snaps: %d\n", rbd_dev->major, rc); + rbd_warn(rbd_dev, "got notification but failed to " + " update snaps: %d\n", rc); - rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); + rbd_obj_notify_ack(rbd_dev, hver, notify_id); } /* - * Request sync osd watch + * Request sync osd watch/unwatch. The value of "start" determines + * whether a watch request is being initiated or torn down. */ -static int rbd_req_sync_watch(struct rbd_device *rbd_dev) +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_req_op *ops; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) - return -ENOMEM; + rbd_assert(start ^ !!rbd_dev->watch_event); + rbd_assert(start ^ !!rbd_dev->watch_request); - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, - (void *)rbd_dev, &rbd_dev->watch_event); - if (ret < 0) - goto fail; + if (start) { + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + return ret; + rbd_assert(rbd_dev->watch_event != NULL); + } - ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 1; + ret = -ENOMEM; + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) + goto out_cancel; + + op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, + rbd_dev->header.obj_version, start); + if (!op) + goto out_cancel; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out_cancel; + + if (start) + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); + else + ceph_osdc_unregister_linger_request(osdc, + rbd_dev->watch_request->osd_req); + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out_cancel; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out_cancel; + ret = obj_request->result; + if (ret) + goto out_cancel; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, - rbd_dev->header_name, - 0, 0, NULL, - &rbd_dev->watch_request, NULL); + /* + * A watch request is set to linger, so the underlying osd + * request won't go away until we unregister it. We retain + * a pointer to the object request during that time (in + * rbd_dev->watch_request), so we'll keep a reference to + * it. We'll drop that reference (below) after we've + * unregistered it. + */ + if (start) { + rbd_dev->watch_request = obj_request; - if (ret < 0) - goto fail_event; + return 0; + } - rbd_destroy_ops(ops); - return 0; + /* We have successfully torn down the watch request */ -fail_event: + rbd_obj_request_put(rbd_dev->watch_request); + rbd_dev->watch_request = NULL; +out_cancel: + /* Cancel the event if we're tearing down, or on error */ ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; -fail: - rbd_destroy_ops(ops); - return ret; -} + if (obj_request) + rbd_obj_request_put(obj_request); -/* - * Request sync osd unwatch - */ -static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) -{ - struct ceph_osd_req_op *ops; - int ret; - - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) - return -ENOMEM; - - ops[0].watch.ver = 0; - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 0; - - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, - rbd_dev->header_name, - 0, 0, NULL, NULL, NULL); - - - rbd_destroy_ops(ops); - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; return ret; } /* * Synchronous osd object method call */ -static int rbd_req_sync_exec(struct rbd_device *rbd_dev, +static int rbd_obj_method_sync(struct rbd_device *rbd_dev, const char *object_name, const char *class_name, const char *method_name, @@ -1477,169 +1909,154 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, size_t outbound_size, char *inbound, size_t inbound_size, - int flags, - u64 *ver) + u64 *version) { - struct ceph_osd_req_op *ops; - int class_name_len = strlen(class_name); - int method_name_len = strlen(method_name); - int payload_size; + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc; + struct ceph_osd_req_op *op; + struct page **pages; + u32 page_count; int ret; /* - * Any input parameters required by the method we're calling - * will be sent along with the class and method names as - * part of the message payload. That data and its size are - * supplied via the indata and indata_len fields (named from - * the perspective of the server side) in the OSD request - * operation. + * Method calls are ultimately read operations but they + * don't involve object data (so no offset or length). + * The result should placed into the inbound buffer + * provided. They also supply outbound data--parameters for + * the object method. Currently if this is present it will + * be a snapshot id. */ - payload_size = class_name_len + method_name_len + outbound_size; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); - if (!ops) - return -ENOMEM; + page_count = (u32) calc_pages_for(0, inbound_size); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); - ops[0].cls.class_name = class_name; - ops[0].cls.class_len = (__u8) class_name_len; - ops[0].cls.method_name = method_name; - ops[0].cls.method_len = (__u8) method_name_len; - ops[0].cls.argc = 0; - ops[0].cls.indata = outbound; - ops[0].cls.indata_len = outbound_size; + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, 0, 0, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - flags, ops, - object_name, 0, inbound_size, inbound, - NULL, ver); + obj_request->pages = pages; + obj_request->page_count = page_count; - rbd_destroy_ops(ops); + op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, + method_name, outbound, outbound_size); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; - dout("cls_exec returned %d\n", ret); - return ret; -} + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; -static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) -{ - struct rbd_req_coll *coll = - kzalloc(sizeof(struct rbd_req_coll) + - sizeof(struct rbd_req_status) * num_reqs, - GFP_ATOMIC); + ret = obj_request->result; + if (ret < 0) + goto out; + ret = 0; + ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); + if (version) + *version = obj_request->version; +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); - if (!coll) - return NULL; - coll->total = num_reqs; - kref_init(&coll->kref); - return coll; + return ret; } -/* - * block device queue callback - */ -static void rbd_rq_fn(struct request_queue *q) +static void rbd_request_fn(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) { struct rbd_device *rbd_dev = q->queuedata; + bool read_only = rbd_dev->mapping.read_only; struct request *rq; + int result; while ((rq = blk_fetch_request(q))) { - struct bio *bio; - bool do_write; - unsigned int size; - u64 ofs; - int num_segs, cur_seg = 0; - struct rbd_req_coll *coll; - struct ceph_snap_context *snapc; - unsigned int bio_offset; - - dout("fetched request\n"); - - /* filter out block requests we don't understand */ - if ((rq->cmd_type != REQ_TYPE_FS)) { - __blk_end_request_all(rq, 0); - continue; - } + bool write_request = rq_data_dir(rq) == WRITE; + struct rbd_img_request *img_request; + u64 offset; + u64 length; - /* deduce our operation (read, write) */ - do_write = (rq_data_dir(rq) == WRITE); - if (do_write && rbd_dev->mapping.read_only) { - __blk_end_request_all(rq, -EROFS); + /* Ignore any non-FS requests that filter through. */ + + if (rq->cmd_type != REQ_TYPE_FS) { + dout("%s: non-fs request type %d\n", __func__, + (int) rq->cmd_type); + __blk_end_request_all(rq, 0); continue; } - spin_unlock_irq(q->queue_lock); + /* Ignore/skip any zero-length requests */ - down_read(&rbd_dev->header_rwsem); + offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; + length = (u64) blk_rq_bytes(rq); - if (!rbd_dev->exists) { - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); - up_read(&rbd_dev->header_rwsem); - dout("request for non-existent snapshot"); - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENXIO); + if (!length) { + dout("%s: zero-length request\n", __func__); + __blk_end_request_all(rq, 0); continue; } - snapc = ceph_get_snap_context(rbd_dev->header.snapc); - - up_read(&rbd_dev->header_rwsem); - - size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * SECTOR_SIZE; - bio = rq->bio; + spin_unlock_irq(q->queue_lock); - dout("%s 0x%x bytes at 0x%llx\n", - do_write ? "write" : "read", - size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); + /* Disallow writes to a read-only device */ - num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); - if (num_segs <= 0) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, num_segs); - ceph_put_snap_context(snapc); - continue; - } - coll = rbd_alloc_coll(num_segs); - if (!coll) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENOMEM); - ceph_put_snap_context(snapc); - continue; + if (write_request) { + result = -EROFS; + if (read_only) + goto end_request; + rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); } - bio_offset = 0; - do { - u64 limit = rbd_segment_length(rbd_dev, ofs, size); - unsigned int chain_size; - struct bio *bio_chain; - - BUG_ON(limit > (u64) UINT_MAX); - chain_size = (unsigned int) limit; - dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); - - kref_get(&coll->kref); + /* + * Quit early if the mapped snapshot no longer + * exists. It's still possible the snapshot will + * have disappeared by the time our request arrives + * at the osd, but there's no sense in sending it if + * we already know. + */ + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { + dout("request for non-existent snapshot"); + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); + result = -ENXIO; + goto end_request; + } - /* Pass a cloned bio chain via an osd request */ + result = -EINVAL; + if (WARN_ON(offset && length > U64_MAX - offset + 1)) + goto end_request; /* Shouldn't happen */ - bio_chain = bio_chain_clone_range(&bio, - &bio_offset, chain_size, - GFP_ATOMIC); - if (bio_chain) - (void) rbd_do_op(rq, rbd_dev, snapc, - ofs, chain_size, - bio_chain, coll, cur_seg); - else - rbd_coll_end_req_index(rq, coll, cur_seg, - -ENOMEM, chain_size); - size -= chain_size; - ofs += chain_size; + result = -ENOMEM; + img_request = rbd_img_request_create(rbd_dev, offset, length, + write_request); + if (!img_request) + goto end_request; - cur_seg++; - } while (size > 0); - kref_put(&coll->kref, rbd_coll_release); + img_request->rq = rq; + result = rbd_img_request_fill_bio(img_request, rq->bio); + if (!result) + result = rbd_img_request_submit(img_request); + if (result) + rbd_img_request_put(img_request); +end_request: spin_lock_irq(q->queue_lock); - - ceph_put_snap_context(snapc); + if (result < 0) { + rbd_warn(rbd_dev, "obj_request %s result %d\n", + write_request ? "write" : "read", result); + __blk_end_request_all(rq, result); + } } } @@ -1703,6 +2120,71 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) put_disk(disk); } +static int rbd_obj_read_sync(struct rbd_device *rbd_dev, + const char *object_name, + u64 offset, u64 length, + char *buf, u64 *version) + +{ + struct ceph_osd_req_op *op; + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc; + struct page **pages = NULL; + u32 page_count; + size_t size; + int ret; + + page_count = (u32) calc_pages_for(offset, length); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + ret = PTR_ERR(pages); + + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, offset, length, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; + + obj_request->pages = pages; + obj_request->page_count = page_count; + + op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; + + ret = obj_request->result; + if (ret < 0) + goto out; + + rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); + size = (size_t) obj_request->xferred; + ceph_copy_from_page_vector(pages, buf, 0, size); + rbd_assert(size <= (size_t) INT_MAX); + ret = (int) size; + if (version) + *version = obj_request->version; +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); + + return ret; +} + /* * Read the complete header for the given rbd device. * @@ -1741,24 +2223,20 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) if (!ondisk) return ERR_PTR(-ENOMEM); - ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, - rbd_dev->header_name, + ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 0, size, (char *) ondisk, version); - if (ret < 0) goto out_err; if (WARN_ON((size_t) ret < size)) { ret = -ENXIO; - pr_warning("short header read for image %s" - " (want %zd got %d)\n", - rbd_dev->spec->image_name, size, ret); + rbd_warn(rbd_dev, "short header read (want %zd got %d)", + size, ret); goto out_err; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; - pr_warning("invalid header for image %s\n", - rbd_dev->spec->image_name); + rbd_warn(rbd_dev, "invalid header"); goto out_err; } @@ -1895,8 +2373,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - /* init rq */ - q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); + q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); if (!q) goto out_disk; @@ -2233,7 +2710,7 @@ static void rbd_spec_free(struct kref *kref) kfree(spec); } -struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, +static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, struct rbd_spec *spec) { struct rbd_device *rbd_dev; @@ -2243,6 +2720,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return NULL; spin_lock_init(&rbd_dev->lock); + rbd_dev->flags = 0; INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); @@ -2250,6 +2728,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, rbd_dev->spec = spec; rbd_dev->rbd_client = rbdc; + /* Initialize the layout used for all rbd requests */ + + rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); + rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + return rbd_dev; } @@ -2360,12 +2845,11 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_size", (char *) &snapid, sizeof (snapid), - (char *) &size_buf, sizeof (size_buf), - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + (char *) &size_buf, sizeof (size_buf), NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2396,15 +2880,13 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_object_prefix", NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_req_sync_exec() can return positive */ p = reply_buf; rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, @@ -2435,12 +2917,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 incompat; int ret; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_features", (char *) &snapid, sizeof (snapid), (char *) &features_buf, sizeof (features_buf), - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2474,7 +2956,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) void *end; char *image_id; u64 overlap; - size_t len = 0; int ret; parent_spec = rbd_spec_alloc(); @@ -2492,12 +2973,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(CEPH_NOSNAP); - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", (char *) &snapid, sizeof (snapid), - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + (char *) reply_buf, size, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -2508,13 +2988,18 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ - image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); + /* The ceph file layout needs to fit pool id in 32 bits */ + + ret = -EIO; + if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) + goto out; + + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); goto out_err; } parent_spec->image_id = image_id; - parent_spec->image_id_len = len; ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); @@ -2544,26 +3029,25 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) rbd_assert(!rbd_dev->spec->image_name); - image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; + len = strlen(rbd_dev->spec->image_id); + image_id_size = sizeof (__le32) + len; image_id = kmalloc(image_id_size, GFP_KERNEL); if (!image_id) return NULL; p = image_id; end = (char *) image_id + image_id_size; - ceph_encode_string(&p, end, rbd_dev->spec->image_id, - (u32) rbd_dev->spec->image_id_len); + ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; reply_buf = kmalloc(size, GFP_KERNEL); if (!reply_buf) goto out; - ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, + ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + (char *) reply_buf, size, NULL); if (ret < 0) goto out; p = reply_buf; @@ -2602,8 +3086,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) osdc = &rbd_dev->rbd_client->client->osdc; name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); - if (!name) - return -EIO; /* pool id too large (>= 2^31) */ + if (!name) { + rbd_warn(rbd_dev, "there is no pool with id %llu", + rbd_dev->spec->pool_id); /* Really a BUG() */ + return -EIO; + } rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); if (!rbd_dev->spec->pool_name) @@ -2612,19 +3099,17 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) /* Fetch the image name; tolerate failure here */ name = rbd_dev_image_name(rbd_dev); - if (name) { - rbd_dev->spec->image_name_len = strlen(name); + if (name) rbd_dev->spec->image_name = (char *) name; - } else { - pr_warning(RBD_DRV_NAME "%d " - "unable to get image name for image id %s\n", - rbd_dev->major, rbd_dev->spec->image_id); - } + else + rbd_warn(rbd_dev, "unable to get image name"); /* Look up the snapshot name. */ name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); if (!name) { + rbd_warn(rbd_dev, "no snapshot with id %llu", + rbd_dev->spec->snap_id); /* Really a BUG() */ ret = -EIO; goto out_err; } @@ -2665,12 +3150,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) if (!reply_buf) return -ENOMEM; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapcontext", NULL, 0, - reply_buf, size, - CEPH_OSD_FLAG_READ, ver); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + reply_buf, size, ver); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2735,12 +3219,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) return ERR_PTR(-ENOMEM); snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", (char *) &snap_id, sizeof (snap_id), - reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + reply_buf, size, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2766,7 +3249,7 @@ out: static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { - __le64 snap_id; + u64 snap_id; u8 order; int ret; @@ -2865,10 +3348,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { struct list_head *next = links->next; - /* Existing snapshot not in the new snap context */ - + /* + * A previously-existing snapshot is not in + * the new snap context. + * + * If the now missing snapshot is the one the + * image is mapped to, clear its exists flag + * so we can avoid sending any more requests + * to it. + */ if (rbd_dev->spec->snap_id == snap->id) - rbd_dev->exists = false; + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", rbd_dev->spec->snap_id == snap->id ? @@ -2942,7 +3432,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) struct rbd_snap *snap; int ret = 0; - dout("%s called\n", __func__); + dout("%s:\n", __func__); if (WARN_ON(!device_is_registered(&rbd_dev->dev))) return -EIO; @@ -2983,22 +3473,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) device_unregister(&rbd_dev->dev); } -static int rbd_init_watch_dev(struct rbd_device *rbd_dev) -{ - int ret, rc; - - do { - ret = rbd_req_sync_watch(rbd_dev); - if (ret == -ERANGE) { - rc = rbd_dev_refresh(rbd_dev, NULL); - if (rc < 0) - return rc; - } - } while (ret == -ERANGE); - - return ret; -} - static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); /* @@ -3138,11 +3612,9 @@ static inline char *dup_token(const char **buf, size_t *lenp) size_t len; len = next_token(buf); - dup = kmalloc(len + 1, GFP_KERNEL); + dup = kmemdup(*buf, len + 1, GFP_KERNEL); if (!dup) return NULL; - - memcpy(dup, *buf, len); *(dup + len) = '\0'; *buf += len; @@ -3210,8 +3682,10 @@ static int rbd_add_parse_args(const char *buf, /* The first four tokens are required */ len = next_token(&buf); - if (!len) - return -EINVAL; /* Missing monitor address(es) */ + if (!len) { + rbd_warn(NULL, "no monitor address(es) provided"); + return -EINVAL; + } mon_addrs = buf; mon_addrs_size = len + 1; buf += len; @@ -3220,8 +3694,10 @@ static int rbd_add_parse_args(const char *buf, options = dup_token(&buf, NULL); if (!options) return -ENOMEM; - if (!*options) - goto out_err; /* Missing options */ + if (!*options) { + rbd_warn(NULL, "no options provided"); + goto out_err; + } spec = rbd_spec_alloc(); if (!spec) @@ -3230,14 +3706,18 @@ static int rbd_add_parse_args(const char *buf, spec->pool_name = dup_token(&buf, NULL); if (!spec->pool_name) goto out_mem; - if (!*spec->pool_name) - goto out_err; /* Missing pool name */ + if (!*spec->pool_name) { + rbd_warn(NULL, "no pool name provided"); + goto out_err; + } - spec->image_name = dup_token(&buf, &spec->image_name_len); + spec->image_name = dup_token(&buf, NULL); if (!spec->image_name) goto out_mem; - if (!*spec->image_name) - goto out_err; /* Missing image name */ + if (!*spec->image_name) { + rbd_warn(NULL, "no image name provided"); + goto out_err; + } /* * Snapshot name is optional; default is to use "-" @@ -3251,10 +3731,9 @@ static int rbd_add_parse_args(const char *buf, ret = -ENAMETOOLONG; goto out_err; } - spec->snap_name = kmalloc(len + 1, GFP_KERNEL); + spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); if (!spec->snap_name) goto out_mem; - memcpy(spec->snap_name, buf, len); *(spec->snap_name + len) = '\0'; /* Initialize all rbd options to the defaults */ @@ -3323,7 +3802,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. */ - size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; + size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); object_name = kmalloc(size, GFP_NOIO); if (!object_name) return -ENOMEM; @@ -3339,21 +3818,18 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) goto out; } - ret = rbd_req_sync_exec(rbd_dev, object_name, + ret = rbd_obj_method_sync(rbd_dev, object_name, "rbd", "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + response, RBD_IMAGE_ID_LEN_MAX, NULL); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_req_sync_exec() can return positive */ p = response; rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, p + RBD_IMAGE_ID_LEN_MAX, - &rbd_dev->spec->image_id_len, - GFP_NOIO); + NULL, GFP_NOIO); if (IS_ERR(rbd_dev->spec->image_id)) { ret = PTR_ERR(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -3377,11 +3853,10 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); if (!rbd_dev->spec->image_id) return -ENOMEM; - rbd_dev->spec->image_id_len = 0; /* Record the header object name for this rbd image. */ - size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); + size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) { ret = -ENOMEM; @@ -3427,7 +3902,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) * Image id was filled in by the caller. Record the header * object name for this rbd image. */ - size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; + size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) return -ENOMEM; @@ -3542,7 +4017,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_bus; - ret = rbd_init_watch_dev(rbd_dev); + ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) goto err_out_bus; @@ -3638,6 +4113,13 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; spec->pool_id = (u64) rc; + /* The ceph file layout needs to fit pool id in 32 bits */ + + if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { + rc = -EIO; + goto err_out_client; + } + rbd_dev = rbd_dev_create(rbdc, spec); if (!rbd_dev) goto err_out_client; @@ -3691,15 +4173,8 @@ static void rbd_dev_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - if (rbd_dev->watch_request) { - struct ceph_client *client = rbd_dev->rbd_client->client; - - ceph_osdc_unregister_linger_request(&client->osdc, - rbd_dev->watch_request); - } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev); - + rbd_dev_header_watch_sync(rbd_dev, 0); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); @@ -3743,10 +4218,14 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - if (rbd_dev->open_count) { + spin_lock_irq(&rbd_dev->lock); + if (rbd_dev->open_count) ret = -EBUSY; + else + set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + spin_unlock_irq(&rbd_dev->lock); + if (ret < 0) goto done; - } rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); @@ -3782,10 +4261,15 @@ static void rbd_sysfs_cleanup(void) device_unregister(&rbd_root_dev); } -int __init rbd_init(void) +static int __init rbd_init(void) { int rc; + if (!libceph_compatible(NULL)) { + rbd_warn(NULL, "libceph incompatibility (quitting)"); + + return -EINVAL; + } rc = rbd_sysfs_init(); if (rc) return rc; @@ -3793,7 +4277,7 @@ int __init rbd_init(void) return 0; } -void __exit rbd_exit(void) +static void __exit rbd_exit(void) { rbd_sysfs_cleanup(); } diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile new file mode 100644 index 00000000000..b1c53c0aa45 --- /dev/null +++ b/drivers/block/rsxx/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o +rsxx-objs := config.o core.o cregs.o dev.o dma.o diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c new file mode 100644 index 00000000000..10cd530d3e1 --- /dev/null +++ b/drivers/block/rsxx/config.c @@ -0,0 +1,211 @@ +/* +* Filename: config.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/types.h> +#include <linux/crc32.h> +#include <linux/swab.h> + +#include "rsxx_priv.h" +#include "rsxx_cfg.h" + +static void initialize_config(struct rsxx_card_cfg *cfg) +{ + cfg->hdr.version = RSXX_CFG_VERSION; + + cfg->data.block_size = RSXX_HW_BLK_SIZE; + cfg->data.stripe_size = RSXX_HW_BLK_SIZE; + cfg->data.vendor_id = RSXX_VENDOR_ID_IBM; + cfg->data.cache_order = (-1); + cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED; + cfg->data.intr_coal.count = 0; + cfg->data.intr_coal.latency = 0; +} + +static u32 config_data_crc32(struct rsxx_card_cfg *cfg) +{ + /* + * Return the compliment of the CRC to ensure compatibility + * (i.e. this is how early rsxx drivers did it.) + */ + + return ~crc32(~0, &cfg->data, sizeof(cfg->data)); +} + + +/*----------------- Config Byte Swap Functions -------------------*/ +static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr) +{ + hdr->version = be32_to_cpu((__force __be32) hdr->version); + hdr->crc = be32_to_cpu((__force __be32) hdr->crc); +} + +static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr) +{ + hdr->version = (__force u32) cpu_to_be32(hdr->version); + hdr->crc = (__force u32) cpu_to_be32(hdr->crc); +} + +static void config_data_swab(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = swab32(data[i]); +} + +static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = le32_to_cpu((__force __le32) data[i]); +} + +static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = (__force u32) cpu_to_le32(data[i]); +} + + +/*----------------- Config Operations ------------------*/ +static int rsxx_save_config(struct rsxx_cardinfo *card) +{ + struct rsxx_card_cfg cfg; + int st; + + memcpy(&cfg, &card->config, sizeof(cfg)); + + if (unlikely(cfg.hdr.version != RSXX_CFG_VERSION)) { + dev_err(CARD_TO_DEV(card), + "Cannot save config with invalid version %d\n", + cfg.hdr.version); + return -EINVAL; + } + + /* Convert data to little endian for the CRC calculation. */ + config_data_cpu_to_le(&cfg); + + cfg.hdr.crc = config_data_crc32(&cfg); + + /* + * Swap the data from little endian to big endian so it can be + * stored. + */ + config_data_swab(&cfg); + config_hdr_cpu_to_be(&cfg.hdr); + + st = rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(cfg), &cfg, 1); + if (st) + return st; + + return 0; +} + +int rsxx_load_config(struct rsxx_cardinfo *card) +{ + int st; + u32 crc; + + st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config), + &card->config, 1); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed reading card config.\n"); + return st; + } + + config_hdr_be_to_cpu(&card->config.hdr); + + if (card->config.hdr.version == RSXX_CFG_VERSION) { + /* + * We calculate the CRC with the data in little endian, because + * early drivers did not take big endian CPUs into account. + * The data is always stored in big endian, so we need to byte + * swap it before calculating the CRC. + */ + + config_data_swab(&card->config); + + /* Check the CRC */ + crc = config_data_crc32(&card->config); + if (crc != card->config.hdr.crc) { + dev_err(CARD_TO_DEV(card), + "Config corruption detected!\n"); + dev_info(CARD_TO_DEV(card), + "CRC (sb x%08x is x%08x)\n", + card->config.hdr.crc, crc); + return -EIO; + } + + /* Convert the data to CPU byteorder */ + config_data_le_to_cpu(&card->config); + + } else if (card->config.hdr.version != 0) { + dev_err(CARD_TO_DEV(card), + "Invalid config version %d.\n", + card->config.hdr.version); + /* + * Config version changes require special handling from the + * user + */ + return -EINVAL; + } else { + dev_info(CARD_TO_DEV(card), + "Initializing card configuration.\n"); + initialize_config(&card->config); + st = rsxx_save_config(card); + if (st) + return st; + } + + card->config_valid = 1; + + dev_dbg(CARD_TO_DEV(card), "version: x%08x\n", + card->config.hdr.version); + dev_dbg(CARD_TO_DEV(card), "crc: x%08x\n", + card->config.hdr.crc); + dev_dbg(CARD_TO_DEV(card), "block_size: x%08x\n", + card->config.data.block_size); + dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n", + card->config.data.stripe_size); + dev_dbg(CARD_TO_DEV(card), "vendor_id: x%08x\n", + card->config.data.vendor_id); + dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n", + card->config.data.cache_order); + dev_dbg(CARD_TO_DEV(card), "mode: x%08x\n", + card->config.data.intr_coal.mode); + dev_dbg(CARD_TO_DEV(card), "count: x%08x\n", + card->config.data.intr_coal.count); + dev_dbg(CARD_TO_DEV(card), "latency: x%08x\n", + card->config.data.intr_coal.latency); + + return 0; +} + diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c new file mode 100644 index 00000000000..5af21f2db29 --- /dev/null +++ b/drivers/block/rsxx/core.c @@ -0,0 +1,860 @@ +/* +* Filename: core.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/reboot.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/delay.h> + +#include <linux/genhd.h> +#include <linux/idr.h> + +#include "rsxx_priv.h" +#include "rsxx_cfg.h" + +#define NO_LEGACY 0 + +MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver"); +MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRIVER_VERSION); + +static unsigned int force_legacy = NO_LEGACY; +module_param(force_legacy, uint, 0444); +MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); + +static DEFINE_IDA(rsxx_disk_ida); +static DEFINE_SPINLOCK(rsxx_ida_lock); + +/*----------------- Interrupt Control & Handling -------------------*/ + +static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) +{ + card->isr_mask = 0; + card->ier_mask = 0; +} + +static void __enable_intr(unsigned int *mask, unsigned int intr) +{ + *mask |= intr; +} + +static void __disable_intr(unsigned int *mask, unsigned int intr) +{ + *mask &= ~intr; +} + +/* + * NOTE: Disabling the IER will disable the hardware interrupt. + * Disabling the ISR will disable the software handling of the ISR bit. + * + * Enable/Disable interrupt functions assume the card->irq_lock + * is held by the caller. + */ +void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr) +{ + if (unlikely(card->halt) || + unlikely(card->eeh_state)) + return; + + __enable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr) +{ + if (unlikely(card->eeh_state)) + return; + + __disable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr) +{ + if (unlikely(card->halt) || + unlikely(card->eeh_state)) + return; + + __enable_intr(&card->isr_mask, intr); + __enable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} +void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr) +{ + if (unlikely(card->eeh_state)) + return; + + __disable_intr(&card->isr_mask, intr); + __disable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +static irqreturn_t rsxx_isr(int irq, void *pdata) +{ + struct rsxx_cardinfo *card = pdata; + unsigned int isr; + int handled = 0; + int reread_isr; + int i; + + spin_lock(&card->irq_lock); + + do { + reread_isr = 0; + + if (unlikely(card->eeh_state)) + break; + + isr = ioread32(card->regmap + ISR); + if (isr == 0xffffffff) { + /* + * A few systems seem to have an intermittent issue + * where PCI reads return all Fs, but retrying the read + * a little later will return as expected. + */ + dev_info(CARD_TO_DEV(card), + "ISR = 0xFFFFFFFF, retrying later\n"); + break; + } + + isr &= card->isr_mask; + if (!isr) + break; + + for (i = 0; i < card->n_targets; i++) { + if (isr & CR_INTR_DMA(i)) { + if (card->ier_mask & CR_INTR_DMA(i)) { + rsxx_disable_ier(card, CR_INTR_DMA(i)); + reread_isr = 1; + } + queue_work(card->ctrl[i].done_wq, + &card->ctrl[i].dma_done_work); + handled++; + } + } + + if (isr & CR_INTR_CREG) { + schedule_work(&card->creg_ctrl.done_work); + handled++; + } + + if (isr & CR_INTR_EVENT) { + schedule_work(&card->event_work); + rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); + handled++; + } + } while (reread_isr); + + spin_unlock(&card->irq_lock); + + return handled ? IRQ_HANDLED : IRQ_NONE; +} + +/*----------------- Card Event Handler -------------------*/ +static const char * const rsxx_card_state_to_str(unsigned int state) +{ + static const char * const state_strings[] = { + "Unknown", "Shutdown", "Starting", "Formatting", + "Uninitialized", "Good", "Shutting Down", + "Fault", "Read Only Fault", "dStroying" + }; + + return state_strings[ffs(state)]; +} + +static void card_state_change(struct rsxx_cardinfo *card, + unsigned int new_state) +{ + int st; + + dev_info(CARD_TO_DEV(card), + "card state change detected.(%s -> %s)\n", + rsxx_card_state_to_str(card->state), + rsxx_card_state_to_str(new_state)); + + card->state = new_state; + + /* Don't attach DMA interfaces if the card has an invalid config */ + if (!card->config_valid) + return; + + switch (new_state) { + case CARD_STATE_RD_ONLY_FAULT: + dev_crit(CARD_TO_DEV(card), + "Hardware has entered read-only mode!\n"); + /* + * Fall through so the DMA devices can be attached and + * the user can attempt to pull off their data. + */ + case CARD_STATE_GOOD: + st = rsxx_get_card_size8(card, &card->size8); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed attaching DMA devices\n"); + + if (card->config_valid) + set_capacity(card->gendisk, card->size8 >> 9); + break; + + case CARD_STATE_FAULT: + dev_crit(CARD_TO_DEV(card), + "Hardware Fault reported!\n"); + /* Fall through. */ + + /* Everything else, detach DMA interface if it's attached. */ + case CARD_STATE_SHUTDOWN: + case CARD_STATE_STARTING: + case CARD_STATE_FORMATTING: + case CARD_STATE_UNINITIALIZED: + case CARD_STATE_SHUTTING_DOWN: + /* + * dStroy is a term coined by marketing to represent the low level + * secure erase. + */ + case CARD_STATE_DSTROYING: + set_capacity(card->gendisk, 0); + break; + } +} + +static void card_event_handler(struct work_struct *work) +{ + struct rsxx_cardinfo *card; + unsigned int state; + unsigned long flags; + int st; + + card = container_of(work, struct rsxx_cardinfo, event_work); + + if (unlikely(card->halt)) + return; + + /* + * Enable the interrupt now to avoid any weird race conditions where a + * state change might occur while rsxx_get_card_state() is + * processing a returned creg cmd. + */ + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + st = rsxx_get_card_state(card, &state); + if (st) { + dev_info(CARD_TO_DEV(card), + "Failed reading state after event.\n"); + return; + } + + if (card->state != state) + card_state_change(card, state); + + if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING) + rsxx_read_hw_log(card); +} + +/*----------------- Card Operations -------------------*/ +static int card_shutdown(struct rsxx_cardinfo *card) +{ + unsigned int state; + signed long start; + const int timeout = msecs_to_jiffies(120000); + int st; + + /* We can't issue a shutdown if the card is in a transition state */ + start = jiffies; + do { + st = rsxx_get_card_state(card, &state); + if (st) + return st; + } while (state == CARD_STATE_STARTING && + (jiffies - start < timeout)); + + if (state == CARD_STATE_STARTING) + return -ETIMEDOUT; + + /* Only issue a shutdown if we need to */ + if ((state != CARD_STATE_SHUTTING_DOWN) && + (state != CARD_STATE_SHUTDOWN)) { + st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN); + if (st) + return st; + } + + start = jiffies; + do { + st = rsxx_get_card_state(card, &state); + if (st) + return st; + } while (state != CARD_STATE_SHUTDOWN && + (jiffies - start < timeout)); + + if (state != CARD_STATE_SHUTDOWN) + return -ETIMEDOUT; + + return 0; +} + +static int rsxx_eeh_frozen(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + int i; + int st; + + dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n"); + + card->eeh_state = 1; + rsxx_mask_interrupts(card); + + /* + * We need to guarantee that the write for eeh_state and masking + * interrupts does not become reordered. This will prevent a possible + * race condition with the EEH code. + */ + wmb(); + + pci_disable_device(dev); + + st = rsxx_eeh_save_issued_dmas(card); + if (st) + return st; + + rsxx_eeh_save_issued_creg(card); + + for (i = 0; i < card->n_targets; i++) { + if (card->ctrl[i].status.buf) + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); + if (card->ctrl[i].cmd.buf) + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); + } + + return 0; +} + +static void rsxx_eeh_failure(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + int i; + + dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n"); + + card->eeh_state = 1; + + for (i = 0; i < card->n_targets; i++) + del_timer_sync(&card->ctrl[i].activity_timer); + + rsxx_eeh_cancel_dmas(card); +} + +static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) +{ + unsigned int status; + int iter = 0; + + /* We need to wait for the hardware to reset */ + while (iter++ < 10) { + status = ioread32(card->regmap + PCI_RECONFIG); + + if (status & RSXX_FLUSH_BUSY) { + ssleep(1); + continue; + } + + if (status & RSXX_FLUSH_TIMEOUT) + dev_warn(CARD_TO_DEV(card), "HW: flash controller timeout\n"); + return 0; + } + + /* Hardware failed resetting itself. */ + return -1; +} + +static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev, + enum pci_channel_state error) +{ + int st; + + if (dev->revision < RSXX_EEH_SUPPORT) + return PCI_ERS_RESULT_NONE; + + if (error == pci_channel_io_perm_failure) { + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + } + + st = rsxx_eeh_frozen(dev); + if (st) { + dev_err(&dev->dev, "Slot reset setup failed\n"); + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int i; + int st; + + dev_warn(&dev->dev, + "IBM FlashSystem PCI: recovering from slot reset.\n"); + + st = pci_enable_device(dev); + if (st) + goto failed_hw_setup; + + pci_set_master(dev); + + st = rsxx_eeh_fifo_flush_poll(card); + if (st) + goto failed_hw_setup; + + rsxx_dma_queue_reset(card); + + for (i = 0; i < card->n_targets; i++) { + st = rsxx_hw_buffers_init(dev, &card->ctrl[i]); + if (st) + goto failed_hw_buffers_init; + } + + if (card->config_valid) + rsxx_dma_configure(card); + + /* Clears the ISR register from spurious interrupts */ + st = ioread32(card->regmap + ISR); + + card->eeh_state = 0; + + st = rsxx_eeh_remap_dmas(card); + if (st) + goto failed_remap_dmas; + + spin_lock_irqsave(&card->irq_lock, flags); + if (card->n_targets & RSXX_MAX_TARGETS) + rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); + else + rsxx_enable_ier_and_isr(card, CR_INTR_ALL_C); + spin_unlock_irqrestore(&card->irq_lock, flags); + + rsxx_kick_creg_queue(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock(&card->ctrl[i].queue_lock); + if (list_empty(&card->ctrl[i].queue)) { + spin_unlock(&card->ctrl[i].queue_lock); + continue; + } + spin_unlock(&card->ctrl[i].queue_lock); + + queue_work(card->ctrl[i].issue_wq, + &card->ctrl[i].issue_dma_work); + } + + dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n"); + + return PCI_ERS_RESULT_RECOVERED; + +failed_hw_buffers_init: +failed_remap_dmas: + for (i = 0; i < card->n_targets; i++) { + if (card->ctrl[i].status.buf) + pci_free_consistent(card->dev, + STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); + if (card->ctrl[i].cmd.buf) + pci_free_consistent(card->dev, + COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); + } +failed_hw_setup: + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + +} + +/*----------------- Driver Initialization & Setup -------------------*/ +/* Returns: 0 if the driver is compatible with the device + -1 if the driver is NOT compatible with the device */ +static int rsxx_compatibility_check(struct rsxx_cardinfo *card) +{ + unsigned char pci_rev; + + pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); + + if (pci_rev > RS70_PCI_REV_SUPPORTED) + return -1; + return 0; +} + +static int rsxx_pci_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + struct rsxx_cardinfo *card; + int st; + + dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); + + card = kzalloc(sizeof(*card), GFP_KERNEL); + if (!card) + return -ENOMEM; + + card->dev = dev; + pci_set_drvdata(dev, card); + + do { + if (!ida_pre_get(&rsxx_disk_ida, GFP_KERNEL)) { + st = -ENOMEM; + goto failed_ida_get; + } + + spin_lock(&rsxx_ida_lock); + st = ida_get_new(&rsxx_disk_ida, &card->disk_id); + spin_unlock(&rsxx_ida_lock); + } while (st == -EAGAIN); + + if (st) + goto failed_ida_get; + + st = pci_enable_device(dev); + if (st) + goto failed_enable; + + pci_set_master(dev); + pci_set_dma_max_seg_size(dev, RSXX_HW_BLK_SIZE); + + st = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (st) { + dev_err(CARD_TO_DEV(card), + "No usable DMA configuration,aborting\n"); + goto failed_dma_mask; + } + + st = pci_request_regions(dev, DRIVER_NAME); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed to request memory region\n"); + goto failed_request_regions; + } + + if (pci_resource_len(dev, 0) == 0) { + dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n"); + st = -ENOMEM; + goto failed_iomap; + } + + card->regmap = pci_iomap(dev, 0, 0); + if (!card->regmap) { + dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n"); + st = -ENOMEM; + goto failed_iomap; + } + + spin_lock_init(&card->irq_lock); + card->halt = 0; + card->eeh_state = 0; + + spin_lock_irq(&card->irq_lock); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irq(&card->irq_lock); + + if (!force_legacy) { + st = pci_enable_msi(dev); + if (st) + dev_warn(CARD_TO_DEV(card), + "Failed to enable MSI\n"); + } + + st = request_irq(dev->irq, rsxx_isr, IRQF_DISABLED | IRQF_SHARED, + DRIVER_NAME, card); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed requesting IRQ%d\n", dev->irq); + goto failed_irq; + } + + /************* Setup Processor Command Interface *************/ + rsxx_creg_setup(card); + + spin_lock_irq(&card->irq_lock); + rsxx_enable_ier_and_isr(card, CR_INTR_CREG); + spin_unlock_irq(&card->irq_lock); + + st = rsxx_compatibility_check(card); + if (st) { + dev_warn(CARD_TO_DEV(card), + "Incompatible driver detected. Please update the driver.\n"); + st = -EINVAL; + goto failed_compatiblity_check; + } + + /************* Load Card Config *************/ + st = rsxx_load_config(card); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed loading card config\n"); + + /************* Setup DMA Engine *************/ + st = rsxx_get_num_targets(card, &card->n_targets); + if (st) + dev_info(CARD_TO_DEV(card), + "Failed reading the number of DMA targets\n"); + + card->ctrl = kzalloc(card->n_targets * sizeof(*card->ctrl), GFP_KERNEL); + if (!card->ctrl) { + st = -ENOMEM; + goto failed_dma_setup; + } + + st = rsxx_dma_setup(card); + if (st) { + dev_info(CARD_TO_DEV(card), + "Failed to setup DMA engine\n"); + goto failed_dma_setup; + } + + /************* Setup Card Event Handler *************/ + INIT_WORK(&card->event_work, card_event_handler); + + st = rsxx_setup_dev(card); + if (st) + goto failed_create_dev; + + rsxx_get_card_state(card, &card->state); + + dev_info(CARD_TO_DEV(card), + "card state: %s\n", + rsxx_card_state_to_str(card->state)); + + /* + * Now that the DMA Engine and devices have been setup, + * we can enable the event interrupt(it kicks off actions in + * those layers so we couldn't enable it right away.) + */ + spin_lock_irq(&card->irq_lock); + rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irq(&card->irq_lock); + + if (card->state == CARD_STATE_SHUTDOWN) { + st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP); + if (st) + dev_crit(CARD_TO_DEV(card), + "Failed issuing card startup\n"); + } else if (card->state == CARD_STATE_GOOD || + card->state == CARD_STATE_RD_ONLY_FAULT) { + st = rsxx_get_card_size8(card, &card->size8); + if (st) + card->size8 = 0; + } + + rsxx_attach_dev(card); + + return 0; + +failed_create_dev: + rsxx_dma_destroy(card); +failed_dma_setup: +failed_compatiblity_check: + spin_lock_irq(&card->irq_lock); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irq(&card->irq_lock); + free_irq(dev->irq, card); + if (!force_legacy) + pci_disable_msi(dev); +failed_irq: + pci_iounmap(dev, card->regmap); +failed_iomap: + pci_release_regions(dev); +failed_request_regions: +failed_dma_mask: + pci_disable_device(dev); +failed_enable: + spin_lock(&rsxx_ida_lock); + ida_remove(&rsxx_disk_ida, card->disk_id); + spin_unlock(&rsxx_ida_lock); +failed_ida_get: + kfree(card); + + return st; +} + +static void rsxx_pci_remove(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int st; + int i; + + if (!card) + return; + + dev_info(CARD_TO_DEV(card), + "Removing PCI-Flash SSD.\n"); + + rsxx_detach_dev(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + st = card_shutdown(card); + if (st) + dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n"); + + /* Sync outstanding event handlers. */ + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + cancel_work_sync(&card->event_work); + + rsxx_destroy_dev(card); + rsxx_dma_destroy(card); + + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irqrestore(&card->irq_lock, flags); + + /* Prevent work_structs from re-queuing themselves. */ + card->halt = 1; + + free_irq(dev->irq, card); + + if (!force_legacy) + pci_disable_msi(dev); + + rsxx_creg_destroy(card); + + pci_iounmap(dev, card->regmap); + + pci_disable_device(dev); + pci_release_regions(dev); + + kfree(card); +} + +static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state) +{ + /* We don't support suspend at this time. */ + return -ENOSYS; +} + +static void rsxx_pci_shutdown(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int i; + + if (!card) + return; + + dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n"); + + rsxx_detach_dev(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + card_shutdown(card); +} + +static const struct pci_error_handlers rsxx_err_handler = { + .error_detected = rsxx_error_detected, + .slot_reset = rsxx_slot_reset, +}; + +static DEFINE_PCI_DEVICE_TABLE(rsxx_pci_ids) = { + {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)}, + {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)}, + {0,}, +}; + +MODULE_DEVICE_TABLE(pci, rsxx_pci_ids); + +static struct pci_driver rsxx_pci_driver = { + .name = DRIVER_NAME, + .id_table = rsxx_pci_ids, + .probe = rsxx_pci_probe, + .remove = rsxx_pci_remove, + .suspend = rsxx_pci_suspend, + .shutdown = rsxx_pci_shutdown, + .err_handler = &rsxx_err_handler, +}; + +static int __init rsxx_core_init(void) +{ + int st; + + st = rsxx_dev_init(); + if (st) + return st; + + st = rsxx_dma_init(); + if (st) + goto dma_init_failed; + + st = rsxx_creg_init(); + if (st) + goto creg_init_failed; + + return pci_register_driver(&rsxx_pci_driver); + +creg_init_failed: + rsxx_dma_cleanup(); +dma_init_failed: + rsxx_dev_cleanup(); + + return st; +} + +static void __exit rsxx_core_cleanup(void) +{ + pci_unregister_driver(&rsxx_pci_driver); + rsxx_creg_cleanup(); + rsxx_dma_cleanup(); + rsxx_dev_cleanup(); +} + +module_init(rsxx_core_init); +module_exit(rsxx_core_cleanup); diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c new file mode 100644 index 00000000000..4b5c020a0a6 --- /dev/null +++ b/drivers/block/rsxx/cregs.c @@ -0,0 +1,790 @@ +/* +* Filename: cregs.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/completion.h> +#include <linux/slab.h> + +#include "rsxx_priv.h" + +#define CREG_TIMEOUT_MSEC 10000 + +typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st); + +struct creg_cmd { + struct list_head list; + creg_cmd_cb cb; + void *cb_private; + unsigned int op; + unsigned int addr; + int cnt8; + void *buf; + unsigned int stream; + unsigned int status; +}; + +static struct kmem_cache *creg_cmd_pool; + + +/*------------ Private Functions --------------*/ + +#if defined(__LITTLE_ENDIAN) +#define LITTLE_ENDIAN 1 +#elif defined(__BIG_ENDIAN) +#define LITTLE_ENDIAN 0 +#else +#error Unknown endianess!!! Aborting... +#endif + +static int copy_to_creg_data(struct rsxx_cardinfo *card, + int cnt8, + void *buf, + unsigned int stream) +{ + int i = 0; + u32 *data = buf; + + if (unlikely(card->eeh_state)) + return -EIO; + + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { + /* + * Firmware implementation makes it necessary to byte swap on + * little endian processors. + */ + if (LITTLE_ENDIAN && stream) + iowrite32be(data[i], card->regmap + CREG_DATA(i)); + else + iowrite32(data[i], card->regmap + CREG_DATA(i)); + } + + return 0; +} + + +static int copy_from_creg_data(struct rsxx_cardinfo *card, + int cnt8, + void *buf, + unsigned int stream) +{ + int i = 0; + u32 *data = buf; + + if (unlikely(card->eeh_state)) + return -EIO; + + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { + /* + * Firmware implementation makes it necessary to byte swap on + * little endian processors. + */ + if (LITTLE_ENDIAN && stream) + data[i] = ioread32be(card->regmap + CREG_DATA(i)); + else + data[i] = ioread32(card->regmap + CREG_DATA(i)); + } + + return 0; +} + +static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) +{ + int st; + + if (unlikely(card->eeh_state)) + return; + + iowrite32(cmd->addr, card->regmap + CREG_ADD); + iowrite32(cmd->cnt8, card->regmap + CREG_CNT); + + if (cmd->op == CREG_OP_WRITE) { + if (cmd->buf) { + st = copy_to_creg_data(card, cmd->cnt8, + cmd->buf, cmd->stream); + if (st) + return; + } + } + + if (unlikely(card->eeh_state)) + return; + + /* Setting the valid bit will kick off the command. */ + iowrite32(cmd->op, card->regmap + CREG_CMD); +} + +static void creg_kick_queue(struct rsxx_cardinfo *card) +{ + if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue)) + return; + + card->creg_ctrl.active = 1; + card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue, + struct creg_cmd, list); + list_del(&card->creg_ctrl.active_cmd->list); + card->creg_ctrl.q_depth--; + + /* + * We have to set the timer before we push the new command. Otherwise, + * we could create a race condition that would occur if the timer + * was not canceled, and expired after the new command was pushed, + * but before the command was issued to hardware. + */ + mod_timer(&card->creg_ctrl.cmd_timer, + jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC)); + + creg_issue_cmd(card, card->creg_ctrl.active_cmd); +} + +static int creg_queue_cmd(struct rsxx_cardinfo *card, + unsigned int op, + unsigned int addr, + unsigned int cnt8, + void *buf, + int stream, + creg_cmd_cb callback, + void *cb_private) +{ + struct creg_cmd *cmd; + + /* Don't queue stuff up if we're halted. */ + if (unlikely(card->halt)) + return -EINVAL; + + if (card->creg_ctrl.reset) + return -EAGAIN; + + if (cnt8 > MAX_CREG_DATA8) + return -EINVAL; + + cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + INIT_LIST_HEAD(&cmd->list); + + cmd->op = op; + cmd->addr = addr; + cmd->cnt8 = cnt8; + cmd->buf = buf; + cmd->stream = stream; + cmd->cb = callback; + cmd->cb_private = cb_private; + cmd->status = 0; + + spin_lock_bh(&card->creg_ctrl.lock); + list_add_tail(&cmd->list, &card->creg_ctrl.queue); + card->creg_ctrl.q_depth++; + creg_kick_queue(card); + spin_unlock_bh(&card->creg_ctrl.lock); + + return 0; +} + +static void creg_cmd_timed_out(unsigned long data) +{ + struct rsxx_cardinfo *card = (struct rsxx_cardinfo *) data; + struct creg_cmd *cmd; + + spin_lock(&card->creg_ctrl.lock); + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + spin_unlock(&card->creg_ctrl.lock); + + if (cmd == NULL) { + card->creg_ctrl.creg_stats.creg_timeout++; + dev_warn(CARD_TO_DEV(card), + "No active command associated with timeout!\n"); + return; + } + + if (cmd->cb) + cmd->cb(card, cmd, -ETIMEDOUT); + + kmem_cache_free(creg_cmd_pool, cmd); + + + spin_lock(&card->creg_ctrl.lock); + card->creg_ctrl.active = 0; + creg_kick_queue(card); + spin_unlock(&card->creg_ctrl.lock); +} + + +static void creg_cmd_done(struct work_struct *work) +{ + struct rsxx_cardinfo *card; + struct creg_cmd *cmd; + int st = 0; + + card = container_of(work, struct rsxx_cardinfo, + creg_ctrl.done_work); + + /* + * The timer could not be cancelled for some reason, + * race to pop the active command. + */ + if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0) + card->creg_ctrl.creg_stats.failed_cancel_timer++; + + spin_lock_bh(&card->creg_ctrl.lock); + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + spin_unlock_bh(&card->creg_ctrl.lock); + + if (cmd == NULL) { + dev_err(CARD_TO_DEV(card), + "Spurious creg interrupt!\n"); + return; + } + + card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT); + cmd->status = card->creg_ctrl.creg_stats.stat; + if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) { + dev_err(CARD_TO_DEV(card), + "Invalid status on creg command\n"); + /* + * At this point we're probably reading garbage from HW. Don't + * do anything else that could mess up the system and let + * the sync function return an error. + */ + st = -EIO; + goto creg_done; + } else if (cmd->status & CREG_STAT_ERROR) { + st = -EIO; + } + + if ((cmd->op == CREG_OP_READ)) { + unsigned int cnt8 = ioread32(card->regmap + CREG_CNT); + + /* Paranoid Sanity Checks */ + if (!cmd->buf) { + dev_err(CARD_TO_DEV(card), + "Buffer not given for read.\n"); + st = -EIO; + goto creg_done; + } + if (cnt8 != cmd->cnt8) { + dev_err(CARD_TO_DEV(card), + "count mismatch\n"); + st = -EIO; + goto creg_done; + } + + st = copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream); + } + +creg_done: + if (cmd->cb) + cmd->cb(card, cmd, st); + + kmem_cache_free(creg_cmd_pool, cmd); + + spin_lock_bh(&card->creg_ctrl.lock); + card->creg_ctrl.active = 0; + creg_kick_queue(card); + spin_unlock_bh(&card->creg_ctrl.lock); +} + +static void creg_reset(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd = NULL; + struct creg_cmd *tmp; + unsigned long flags; + + /* + * mutex_trylock is used here because if reset_lock is taken then a + * reset is already happening. So, we can just go ahead and return. + */ + if (!mutex_trylock(&card->creg_ctrl.reset_lock)) + return; + + card->creg_ctrl.reset = 1; + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + dev_warn(CARD_TO_DEV(card), + "Resetting creg interface for recovery\n"); + + /* Cancel outstanding commands */ + spin_lock_bh(&card->creg_ctrl.lock); + list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { + list_del(&cmd->list); + card->creg_ctrl.q_depth--; + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + } + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + if (cmd) { + if (timer_pending(&card->creg_ctrl.cmd_timer)) + del_timer_sync(&card->creg_ctrl.cmd_timer); + + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + + card->creg_ctrl.active = 0; + } + spin_unlock_bh(&card->creg_ctrl.lock); + + card->creg_ctrl.reset = 0; + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + mutex_unlock(&card->creg_ctrl.reset_lock); +} + +/* Used for synchronous accesses */ +struct creg_completion { + struct completion *cmd_done; + int st; + u32 creg_status; +}; + +static void creg_cmd_done_cb(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st) +{ + struct creg_completion *cmd_completion; + + cmd_completion = cmd->cb_private; + BUG_ON(!cmd_completion); + + cmd_completion->st = st; + cmd_completion->creg_status = cmd->status; + complete(cmd_completion->cmd_done); +} + +static int __issue_creg_rw(struct rsxx_cardinfo *card, + unsigned int op, + unsigned int addr, + unsigned int cnt8, + void *buf, + int stream, + unsigned int *hw_stat) +{ + DECLARE_COMPLETION_ONSTACK(cmd_done); + struct creg_completion completion; + unsigned long timeout; + int st; + + completion.cmd_done = &cmd_done; + completion.st = 0; + completion.creg_status = 0; + + st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb, + &completion); + if (st) + return st; + + /* + * This timeout is necessary for unresponsive hardware. The additional + * 20 seconds to used to guarantee that each cregs requests has time to + * complete. + */ + timeout = msecs_to_jiffies(CREG_TIMEOUT_MSEC * + card->creg_ctrl.q_depth + 20000); + + /* + * The creg interface is guaranteed to complete. It has a timeout + * mechanism that will kick in if hardware does not respond. + */ + st = wait_for_completion_timeout(completion.cmd_done, timeout); + if (st == 0) { + /* + * This is really bad, because the kernel timer did not + * expire and notify us of a timeout! + */ + dev_crit(CARD_TO_DEV(card), + "cregs timer failed\n"); + creg_reset(card); + return -EIO; + } + + *hw_stat = completion.creg_status; + + if (completion.st) { + dev_warn(CARD_TO_DEV(card), + "creg command failed(%d x%08x)\n", + completion.st, addr); + return completion.st; + } + + return 0; +} + +static int issue_creg_rw(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int stream, + int read) +{ + unsigned int hw_stat; + unsigned int xfer; + unsigned int op; + int st; + + op = read ? CREG_OP_READ : CREG_OP_WRITE; + + do { + xfer = min_t(unsigned int, size8, MAX_CREG_DATA8); + + st = __issue_creg_rw(card, op, addr, xfer, + data, stream, &hw_stat); + if (st) + return st; + + data = (char *)data + xfer; + addr += xfer; + size8 -= xfer; + } while (size8); + + return 0; +} + +/* ---------------------------- Public API ---------------------------------- */ +int rsxx_creg_write(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream) +{ + return issue_creg_rw(card, addr, size8, data, byte_stream, 0); +} + +int rsxx_creg_read(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream) +{ + return issue_creg_rw(card, addr, size8, data, byte_stream, 1); +} + +int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state) +{ + return rsxx_creg_read(card, CREG_ADD_CARD_STATE, + sizeof(*state), state, 0); +} + +int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8) +{ + unsigned int size; + int st; + + st = rsxx_creg_read(card, CREG_ADD_CARD_SIZE, + sizeof(size), &size, 0); + if (st) + return st; + + *size8 = (u64)size * RSXX_HW_BLK_SIZE; + return 0; +} + +int rsxx_get_num_targets(struct rsxx_cardinfo *card, + unsigned int *n_targets) +{ + return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS, + sizeof(*n_targets), n_targets, 0); +} + +int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, + u32 *capabilities) +{ + return rsxx_creg_read(card, CREG_ADD_CAPABILITIES, + sizeof(*capabilities), capabilities, 0); +} + +int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd) +{ + return rsxx_creg_write(card, CREG_ADD_CARD_CMD, + sizeof(cmd), &cmd, 0); +} + + +/*----------------- HW Log Functions -------------------*/ +static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len) +{ + static char level; + + /* + * New messages start with "<#>", where # is the log level. Messages + * that extend past the log buffer will use the previous level + */ + if ((len > 3) && (str[0] == '<') && (str[2] == '>')) { + level = str[1]; + str += 3; /* Skip past the log level. */ + len -= 3; + } + + switch (level) { + case '0': + dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '1': + dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '2': + dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '3': + dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '4': + dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '5': + dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '6': + dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '7': + dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + default: + dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + } +} + +/* + * The substrncpy function copies the src string (which includes the + * terminating '\0' character), up to the count into the dest pointer. + * Returns the number of bytes copied to dest. + */ +static int substrncpy(char *dest, const char *src, int count) +{ + int max_cnt = count; + + while (count) { + count--; + *dest = *src; + if (*dest == '\0') + break; + src++; + dest++; + } + return max_cnt - count; +} + + +static void read_hw_log_done(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st) +{ + char *buf; + char *log_str; + int cnt; + int len; + int off; + + buf = cmd->buf; + off = 0; + + /* Failed getting the log message */ + if (st) + return; + + while (off < cmd->cnt8) { + log_str = &card->log.buf[card->log.buf_len]; + cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len); + len = substrncpy(log_str, &buf[off], cnt); + + off += len; + card->log.buf_len += len; + + /* + * Flush the log if we've hit the end of a message or if we've + * run out of buffer space. + */ + if ((log_str[len - 1] == '\0') || + (card->log.buf_len == LOG_BUF_SIZE8)) { + if (card->log.buf_len != 1) /* Don't log blank lines. */ + hw_log_msg(card, card->log.buf, + card->log.buf_len); + card->log.buf_len = 0; + } + + } + + if (cmd->status & CREG_STAT_LOG_PENDING) + rsxx_read_hw_log(card); +} + +int rsxx_read_hw_log(struct rsxx_cardinfo *card) +{ + int st; + + st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG, + sizeof(card->log.tmp), card->log.tmp, + 1, read_hw_log_done, NULL); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed getting log text\n"); + + return st; +} + +/*-------------- IOCTL REG Access ------------------*/ +static int issue_reg_cmd(struct rsxx_cardinfo *card, + struct rsxx_reg_access *cmd, + int read) +{ + unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE; + + return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data, + cmd->stream, &cmd->stat); +} + +int rsxx_reg_access(struct rsxx_cardinfo *card, + struct rsxx_reg_access __user *ucmd, + int read) +{ + struct rsxx_reg_access cmd; + int st; + + st = copy_from_user(&cmd, ucmd, sizeof(cmd)); + if (st) + return -EFAULT; + + if (cmd.cnt > RSXX_MAX_REG_CNT) + return -EFAULT; + + st = issue_reg_cmd(card, &cmd, read); + if (st) + return st; + + st = put_user(cmd.stat, &ucmd->stat); + if (st) + return -EFAULT; + + if (read) { + st = copy_to_user(ucmd->data, cmd.data, cmd.cnt); + if (st) + return -EFAULT; + } + + return 0; +} + +void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd = NULL; + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + + if (cmd) { + del_timer_sync(&card->creg_ctrl.cmd_timer); + + spin_lock_bh(&card->creg_ctrl.lock); + list_add(&cmd->list, &card->creg_ctrl.queue); + card->creg_ctrl.q_depth++; + card->creg_ctrl.active = 0; + spin_unlock_bh(&card->creg_ctrl.lock); + } +} + +void rsxx_kick_creg_queue(struct rsxx_cardinfo *card) +{ + spin_lock_bh(&card->creg_ctrl.lock); + if (!list_empty(&card->creg_ctrl.queue)) + creg_kick_queue(card); + spin_unlock_bh(&card->creg_ctrl.lock); +} + +/*------------ Initialization & Setup --------------*/ +int rsxx_creg_setup(struct rsxx_cardinfo *card) +{ + card->creg_ctrl.active_cmd = NULL; + + INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); + mutex_init(&card->creg_ctrl.reset_lock); + INIT_LIST_HEAD(&card->creg_ctrl.queue); + spin_lock_init(&card->creg_ctrl.lock); + setup_timer(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out, + (unsigned long) card); + + return 0; +} + +void rsxx_creg_destroy(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd; + struct creg_cmd *tmp; + int cnt = 0; + + /* Cancel outstanding commands */ + spin_lock_bh(&card->creg_ctrl.lock); + list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { + list_del(&cmd->list); + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + cnt++; + } + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Canceled %d queue creg commands\n", cnt); + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + if (cmd) { + if (timer_pending(&card->creg_ctrl.cmd_timer)) + del_timer_sync(&card->creg_ctrl.cmd_timer); + + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + dev_info(CARD_TO_DEV(card), + "Canceled active creg command\n"); + kmem_cache_free(creg_cmd_pool, cmd); + } + spin_unlock_bh(&card->creg_ctrl.lock); + + cancel_work_sync(&card->creg_ctrl.done_work); +} + + +int rsxx_creg_init(void) +{ + creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN); + if (!creg_cmd_pool) + return -ENOMEM; + + return 0; +} + +void rsxx_creg_cleanup(void) +{ + kmem_cache_destroy(creg_cmd_pool); +} diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c new file mode 100644 index 00000000000..4346d17d294 --- /dev/null +++ b/drivers/block/rsxx/dev.c @@ -0,0 +1,367 @@ +/* +* Filename: dev.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/slab.h> + +#include <linux/hdreg.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> +#include <linux/bio.h> + +#include <linux/fs.h> + +#include "rsxx_priv.h" + +static unsigned int blkdev_minors = 64; +module_param(blkdev_minors, uint, 0444); +MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)"); + +/* + * For now I'm making this tweakable in case any applications hit this limit. + * If you see a "bio too big" error in the log you will need to raise this + * value. + */ +static unsigned int blkdev_max_hw_sectors = 1024; +module_param(blkdev_max_hw_sectors, uint, 0444); +MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO"); + +static unsigned int enable_blkdev = 1; +module_param(enable_blkdev , uint, 0444); +MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces"); + + +struct rsxx_bio_meta { + struct bio *bio; + atomic_t pending_dmas; + atomic_t error; + unsigned long start_time; +}; + +static struct kmem_cache *bio_meta_pool; + +/*----------------- Block Device Operations -----------------*/ +static int rsxx_blkdev_ioctl(struct block_device *bdev, + fmode_t mode, + unsigned int cmd, + unsigned long arg) +{ + struct rsxx_cardinfo *card = bdev->bd_disk->private_data; + + switch (cmd) { + case RSXX_GETREG: + return rsxx_reg_access(card, (void __user *)arg, 1); + case RSXX_SETREG: + return rsxx_reg_access(card, (void __user *)arg, 0); + } + + return -ENOTTY; +} + +static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct rsxx_cardinfo *card = bdev->bd_disk->private_data; + u64 blocks = card->size8 >> 9; + + /* + * get geometry: Fake it. I haven't found any drivers that set + * geo->start, so we won't either. + */ + if (card->size8) { + geo->heads = 64; + geo->sectors = 16; + do_div(blocks, (geo->heads * geo->sectors)); + geo->cylinders = blocks; + } else { + geo->heads = 0; + geo->sectors = 0; + geo->cylinders = 0; + } + return 0; +} + +static const struct block_device_operations rsxx_fops = { + .owner = THIS_MODULE, + .getgeo = rsxx_getgeo, + .ioctl = rsxx_blkdev_ioctl, +}; + +static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) +{ + struct hd_struct *part0 = &card->gendisk->part0; + int rw = bio_data_dir(bio); + int cpu; + + cpu = part_stat_lock(); + + part_round_stats(cpu, part0); + part_inc_in_flight(part0, rw); + + part_stat_unlock(); +} + +static void disk_stats_complete(struct rsxx_cardinfo *card, + struct bio *bio, + unsigned long start_time) +{ + struct hd_struct *part0 = &card->gendisk->part0; + unsigned long duration = jiffies - start_time; + int rw = bio_data_dir(bio); + int cpu; + + cpu = part_stat_lock(); + + part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio)); + part_stat_inc(cpu, part0, ios[rw]); + part_stat_add(cpu, part0, ticks[rw], duration); + + part_round_stats(cpu, part0); + part_dec_in_flight(part0, rw); + + part_stat_unlock(); +} + +static void bio_dma_done_cb(struct rsxx_cardinfo *card, + void *cb_data, + unsigned int error) +{ + struct rsxx_bio_meta *meta = cb_data; + + if (error) + atomic_set(&meta->error, 1); + + if (atomic_dec_and_test(&meta->pending_dmas)) { + disk_stats_complete(card, meta->bio, meta->start_time); + + bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); + kmem_cache_free(bio_meta_pool, meta); + } +} + +static void rsxx_make_request(struct request_queue *q, struct bio *bio) +{ + struct rsxx_cardinfo *card = q->queuedata; + struct rsxx_bio_meta *bio_meta; + int st = -EINVAL; + + might_sleep(); + + if (unlikely(card->halt)) { + st = -EFAULT; + goto req_err; + } + + if (unlikely(card->dma_fault)) { + st = (-EFAULT); + goto req_err; + } + + if (bio->bi_size == 0) { + dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); + goto req_err; + } + + bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); + if (!bio_meta) { + st = -ENOMEM; + goto req_err; + } + + bio_meta->bio = bio; + atomic_set(&bio_meta->error, 0); + atomic_set(&bio_meta->pending_dmas, 0); + bio_meta->start_time = jiffies; + + disk_stats_start(card, bio); + + dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", + bio_data_dir(bio) ? 'W' : 'R', bio_meta, + (u64)bio->bi_sector << 9, bio->bi_size); + + st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas, + bio_dma_done_cb, bio_meta); + if (st) + goto queue_err; + + return; + +queue_err: + kmem_cache_free(bio_meta_pool, bio_meta); +req_err: + bio_endio(bio, st); +} + +/*----------------- Device Setup -------------------*/ +static bool rsxx_discard_supported(struct rsxx_cardinfo *card) +{ + unsigned char pci_rev; + + pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); + + return (pci_rev >= RSXX_DISCARD_SUPPORT); +} + +static unsigned short rsxx_get_logical_block_size( + struct rsxx_cardinfo *card) +{ + u32 capabilities = 0; + int st; + + st = rsxx_get_card_capabilities(card, &capabilities); + if (st) + dev_warn(CARD_TO_DEV(card), + "Failed reading card capabilities register\n"); + + /* Earlier firmware did not have support for 512 byte accesses */ + if (capabilities & CARD_CAP_SUBPAGE_WRITES) + return 512; + else + return RSXX_HW_BLK_SIZE; +} + +int rsxx_attach_dev(struct rsxx_cardinfo *card) +{ + mutex_lock(&card->dev_lock); + + /* The block device requires the stripe size from the config. */ + if (enable_blkdev) { + if (card->config_valid) + set_capacity(card->gendisk, card->size8 >> 9); + else + set_capacity(card->gendisk, 0); + add_disk(card->gendisk); + + card->bdev_attached = 1; + } + + mutex_unlock(&card->dev_lock); + + return 0; +} + +void rsxx_detach_dev(struct rsxx_cardinfo *card) +{ + mutex_lock(&card->dev_lock); + + if (card->bdev_attached) { + del_gendisk(card->gendisk); + card->bdev_attached = 0; + } + + mutex_unlock(&card->dev_lock); +} + +int rsxx_setup_dev(struct rsxx_cardinfo *card) +{ + unsigned short blk_size; + + mutex_init(&card->dev_lock); + + if (!enable_blkdev) + return 0; + + card->major = register_blkdev(0, DRIVER_NAME); + if (card->major < 0) { + dev_err(CARD_TO_DEV(card), "Failed to get major number\n"); + return -ENOMEM; + } + + card->queue = blk_alloc_queue(GFP_KERNEL); + if (!card->queue) { + dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); + unregister_blkdev(card->major, DRIVER_NAME); + return -ENOMEM; + } + + card->gendisk = alloc_disk(blkdev_minors); + if (!card->gendisk) { + dev_err(CARD_TO_DEV(card), "Failed disk alloc\n"); + blk_cleanup_queue(card->queue); + unregister_blkdev(card->major, DRIVER_NAME); + return -ENOMEM; + } + + blk_size = rsxx_get_logical_block_size(card); + + blk_queue_make_request(card->queue, rsxx_make_request); + blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); + blk_queue_dma_alignment(card->queue, blk_size - 1); + blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); + blk_queue_logical_block_size(card->queue, blk_size); + blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); + + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); + if (rsxx_discard_supported(card)) { + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue); + blk_queue_max_discard_sectors(card->queue, + RSXX_HW_BLK_SIZE >> 9); + card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE; + card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE; + card->queue->limits.discard_zeroes_data = 1; + } + + card->queue->queuedata = card; + + snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), + "rsxx%d", card->disk_id); + card->gendisk->driverfs_dev = &card->dev->dev; + card->gendisk->major = card->major; + card->gendisk->first_minor = 0; + card->gendisk->fops = &rsxx_fops; + card->gendisk->private_data = card; + card->gendisk->queue = card->queue; + + return 0; +} + +void rsxx_destroy_dev(struct rsxx_cardinfo *card) +{ + if (!enable_blkdev) + return; + + put_disk(card->gendisk); + card->gendisk = NULL; + + blk_cleanup_queue(card->queue); + unregister_blkdev(card->major, DRIVER_NAME); +} + +int rsxx_dev_init(void) +{ + bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN); + if (!bio_meta_pool) + return -ENOMEM; + + return 0; +} + +void rsxx_dev_cleanup(void) +{ + kmem_cache_destroy(bio_meta_pool); +} + + diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c new file mode 100644 index 00000000000..0607513cfb4 --- /dev/null +++ b/drivers/block/rsxx/dma.c @@ -0,0 +1,1095 @@ +/* +* Filename: dma.c +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/slab.h> +#include "rsxx_priv.h" + +struct rsxx_dma { + struct list_head list; + u8 cmd; + unsigned int laddr; /* Logical address */ + struct { + u32 off; + u32 cnt; + } sub_page; + dma_addr_t dma_addr; + struct page *page; + unsigned int pg_off; /* Page Offset */ + rsxx_dma_cb cb; + void *cb_data; +}; + +/* This timeout is used to detect a stalled DMA channel */ +#define DMA_ACTIVITY_TIMEOUT msecs_to_jiffies(10000) + +struct hw_status { + u8 status; + u8 tag; + __le16 count; + __le32 _rsvd2; + __le64 _rsvd3; +} __packed; + +enum rsxx_dma_status { + DMA_SW_ERR = 0x1, + DMA_HW_FAULT = 0x2, + DMA_CANCELLED = 0x4, +}; + +struct hw_cmd { + u8 command; + u8 tag; + u8 _rsvd; + u8 sub_page; /* Bit[0:2]: 512byte offset */ + /* Bit[4:6]: 512byte count */ + __le32 device_addr; + __le64 host_addr; +} __packed; + +enum rsxx_hw_cmd { + HW_CMD_BLK_DISCARD = 0x70, + HW_CMD_BLK_WRITE = 0x80, + HW_CMD_BLK_READ = 0xC0, + HW_CMD_BLK_RECON_READ = 0xE0, +}; + +enum rsxx_hw_status { + HW_STATUS_CRC = 0x01, + HW_STATUS_HARD_ERR = 0x02, + HW_STATUS_SOFT_ERR = 0x04, + HW_STATUS_FAULT = 0x08, +}; + +static struct kmem_cache *rsxx_dma_pool; + +struct dma_tracker { + int next_tag; + struct rsxx_dma *dma; +}; + +#define DMA_TRACKER_LIST_SIZE8 (sizeof(struct dma_tracker_list) + \ + (sizeof(struct dma_tracker) * RSXX_MAX_OUTSTANDING_CMDS)) + +struct dma_tracker_list { + spinlock_t lock; + int head; + struct dma_tracker list[0]; +}; + + +/*----------------- Misc Utility Functions -------------------*/ +static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card) +{ + unsigned long long tgt_addr8; + + tgt_addr8 = ((addr8 >> card->_stripe.upper_shift) & + card->_stripe.upper_mask) | + ((addr8) & card->_stripe.lower_mask); + do_div(tgt_addr8, RSXX_HW_BLK_SIZE); + return tgt_addr8; +} + +static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8) +{ + unsigned int tgt; + + tgt = (addr8 >> card->_stripe.target_shift) & card->_stripe.target_mask; + + return tgt; +} + +void rsxx_dma_queue_reset(struct rsxx_cardinfo *card) +{ + /* Reset all DMA Command/Status Queues */ + iowrite32(DMA_QUEUE_RESET, card->regmap + RESET); +} + +static unsigned int get_dma_size(struct rsxx_dma *dma) +{ + if (dma->sub_page.cnt) + return dma->sub_page.cnt << 9; + else + return RSXX_HW_BLK_SIZE; +} + + +/*----------------- DMA Tracker -------------------*/ +static void set_tracker_dma(struct dma_tracker_list *trackers, + int tag, + struct rsxx_dma *dma) +{ + trackers->list[tag].dma = dma; +} + +static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers, + int tag) +{ + return trackers->list[tag].dma; +} + +static int pop_tracker(struct dma_tracker_list *trackers) +{ + int tag; + + spin_lock(&trackers->lock); + tag = trackers->head; + if (tag != -1) { + trackers->head = trackers->list[tag].next_tag; + trackers->list[tag].next_tag = -1; + } + spin_unlock(&trackers->lock); + + return tag; +} + +static void push_tracker(struct dma_tracker_list *trackers, int tag) +{ + spin_lock(&trackers->lock); + trackers->list[tag].next_tag = trackers->head; + trackers->head = tag; + trackers->list[tag].dma = NULL; + spin_unlock(&trackers->lock); +} + + +/*----------------- Interrupt Coalescing -------------*/ +/* + * Interrupt Coalescing Register Format: + * Interrupt Timer (64ns units) [15:0] + * Interrupt Count [24:16] + * Reserved [31:25] +*/ +#define INTR_COAL_LATENCY_MASK (0x0000ffff) + +#define INTR_COAL_COUNT_SHIFT 16 +#define INTR_COAL_COUNT_BITS 9 +#define INTR_COAL_COUNT_MASK (((1 << INTR_COAL_COUNT_BITS) - 1) << \ + INTR_COAL_COUNT_SHIFT) +#define INTR_COAL_LATENCY_UNITS_NS 64 + + +static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency) +{ + u32 latency_units = latency / INTR_COAL_LATENCY_UNITS_NS; + + if (mode == RSXX_INTR_COAL_DISABLED) + return 0; + + return ((count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK) | + (latency_units & INTR_COAL_LATENCY_MASK); + +} + +static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) +{ + int i; + u32 q_depth = 0; + u32 intr_coal; + + if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE || + unlikely(card->eeh_state)) + return; + + for (i = 0; i < card->n_targets; i++) + q_depth += atomic_read(&card->ctrl[i].stats.hw_q_depth); + + intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, + q_depth / 2, + card->config.data.intr_coal.latency); + iowrite32(intr_coal, card->regmap + INTR_COAL); +} + +/*----------------- RSXX DMA Handling -------------------*/ +static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma, + unsigned int status) +{ + if (status & DMA_SW_ERR) + ctrl->stats.dma_sw_err++; + if (status & DMA_HW_FAULT) + ctrl->stats.dma_hw_fault++; + if (status & DMA_CANCELLED) + ctrl->stats.dma_cancelled++; + + if (dma->dma_addr) + pci_unmap_page(ctrl->card->dev, dma->dma_addr, + get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + + if (dma->cb) + dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); + + kmem_cache_free(rsxx_dma_pool, dma); +} + +static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma) +{ + /* + * Requeued DMAs go to the front of the queue so they are issued + * first. + */ + spin_lock(&ctrl->queue_lock); + list_add(&dma->list, &ctrl->queue); + spin_unlock(&ctrl->queue_lock); +} + +static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma, + u8 hw_st) +{ + unsigned int status = 0; + int requeue_cmd = 0; + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n", + dma->cmd, dma->laddr, hw_st); + + if (hw_st & HW_STATUS_CRC) + ctrl->stats.crc_errors++; + if (hw_st & HW_STATUS_HARD_ERR) + ctrl->stats.hard_errors++; + if (hw_st & HW_STATUS_SOFT_ERR) + ctrl->stats.soft_errors++; + + switch (dma->cmd) { + case HW_CMD_BLK_READ: + if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { + if (ctrl->card->scrub_hard) { + dma->cmd = HW_CMD_BLK_RECON_READ; + requeue_cmd = 1; + ctrl->stats.reads_retried++; + } else { + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + } else if (hw_st & HW_STATUS_FAULT) { + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + + break; + case HW_CMD_BLK_RECON_READ: + if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { + /* Data could not be reconstructed. */ + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + + break; + case HW_CMD_BLK_WRITE: + status |= DMA_HW_FAULT; + ctrl->stats.writes_failed++; + + break; + case HW_CMD_BLK_DISCARD: + status |= DMA_HW_FAULT; + ctrl->stats.discards_failed++; + + break; + default: + dev_err(CARD_TO_DEV(ctrl->card), + "Unknown command in DMA!(cmd: x%02x " + "laddr x%08x st: x%02x\n", + dma->cmd, dma->laddr, hw_st); + status |= DMA_SW_ERR; + + break; + } + + if (requeue_cmd) + rsxx_requeue_dma(ctrl, dma); + else + rsxx_complete_dma(ctrl, dma, status); +} + +static void dma_engine_stalled(unsigned long data) +{ + struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; + + if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || + unlikely(ctrl->card->eeh_state)) + return; + + if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) { + /* + * The dma engine was stalled because the SW_CMD_IDX write + * was lost. Issue it again to recover. + */ + dev_warn(CARD_TO_DEV(ctrl->card), + "SW_CMD_IDX write was lost, re-writing...\n"); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + } else { + dev_warn(CARD_TO_DEV(ctrl->card), + "DMA channel %d has stalled, faulting interface.\n", + ctrl->id); + ctrl->card->dma_fault = 1; + } +} + +static void rsxx_issue_dmas(struct work_struct *work) +{ + struct rsxx_dma_ctrl *ctrl; + struct rsxx_dma *dma; + int tag; + int cmds_pending = 0; + struct hw_cmd *hw_cmd_buf; + + ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); + hw_cmd_buf = ctrl->cmd.buf; + + if (unlikely(ctrl->card->halt) || + unlikely(ctrl->card->eeh_state)) + return; + + while (1) { + spin_lock(&ctrl->queue_lock); + if (list_empty(&ctrl->queue)) { + spin_unlock(&ctrl->queue_lock); + break; + } + spin_unlock(&ctrl->queue_lock); + + tag = pop_tracker(ctrl->trackers); + if (tag == -1) + break; + + spin_lock(&ctrl->queue_lock); + dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); + list_del(&dma->list); + ctrl->stats.sw_q_depth--; + spin_unlock(&ctrl->queue_lock); + + /* + * This will catch any DMAs that slipped in right before the + * fault, but was queued after all the other DMAs were + * cancelled. + */ + if (unlikely(ctrl->card->dma_fault)) { + push_tracker(ctrl->trackers, tag); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + continue; + } + + set_tracker_dma(ctrl->trackers, tag, dma); + hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; + hw_cmd_buf[ctrl->cmd.idx].tag = tag; + hw_cmd_buf[ctrl->cmd.idx]._rsvd = 0; + hw_cmd_buf[ctrl->cmd.idx].sub_page = + ((dma->sub_page.cnt & 0x7) << 4) | + (dma->sub_page.off & 0x7); + + hw_cmd_buf[ctrl->cmd.idx].device_addr = + cpu_to_le32(dma->laddr); + + hw_cmd_buf[ctrl->cmd.idx].host_addr = + cpu_to_le64(dma->dma_addr); + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Issue DMA%d(laddr %d tag %d) to idx %d\n", + ctrl->id, dma->laddr, tag, ctrl->cmd.idx); + + ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK; + cmds_pending++; + + if (dma->cmd == HW_CMD_BLK_WRITE) + ctrl->stats.writes_issued++; + else if (dma->cmd == HW_CMD_BLK_DISCARD) + ctrl->stats.discards_issued++; + else + ctrl->stats.reads_issued++; + } + + /* Let HW know we've queued commands. */ + if (cmds_pending) { + atomic_add(cmds_pending, &ctrl->stats.hw_q_depth); + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + + if (unlikely(ctrl->card->eeh_state)) { + del_timer_sync(&ctrl->activity_timer); + return; + } + + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + } +} + +static void rsxx_dma_done(struct work_struct *work) +{ + struct rsxx_dma_ctrl *ctrl; + struct rsxx_dma *dma; + unsigned long flags; + u16 count; + u8 status; + u8 tag; + struct hw_status *hw_st_buf; + + ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); + hw_st_buf = ctrl->status.buf; + + if (unlikely(ctrl->card->halt) || + unlikely(ctrl->card->dma_fault) || + unlikely(ctrl->card->eeh_state)) + return; + + count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); + + while (count == ctrl->e_cnt) { + /* + * The read memory-barrier is necessary to keep aggressive + * processors/optimizers (such as the PPC Apple G5) from + * reordering the following status-buffer tag & status read + * *before* the count read on subsequent iterations of the + * loop! + */ + rmb(); + + status = hw_st_buf[ctrl->status.idx].status; + tag = hw_st_buf[ctrl->status.idx].tag; + + dma = get_tracker_dma(ctrl->trackers, tag); + if (dma == NULL) { + spin_lock_irqsave(&ctrl->card->irq_lock, flags); + rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL); + spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); + + dev_err(CARD_TO_DEV(ctrl->card), + "No tracker for tag %d " + "(idx %d id %d)\n", + tag, ctrl->status.idx, ctrl->id); + return; + } + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Completing DMA%d" + "(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n", + ctrl->id, dma->laddr, tag, status, count, + ctrl->status.idx); + + atomic_dec(&ctrl->stats.hw_q_depth); + + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + + if (status) + rsxx_handle_dma_error(ctrl, dma, status); + else + rsxx_complete_dma(ctrl, dma, 0); + + push_tracker(ctrl->trackers, tag); + + ctrl->status.idx = (ctrl->status.idx + 1) & + RSXX_CS_IDX_MASK; + ctrl->e_cnt++; + + count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); + } + + dma_intr_coal_auto_tune(ctrl->card); + + if (atomic_read(&ctrl->stats.hw_q_depth) == 0) + del_timer_sync(&ctrl->activity_timer); + + spin_lock_irqsave(&ctrl->card->irq_lock, flags); + rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); + spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); + + spin_lock(&ctrl->queue_lock); + if (ctrl->stats.sw_q_depth) + queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); + spin_unlock(&ctrl->queue_lock); +} + +static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card, + struct list_head *q) +{ + struct rsxx_dma *dma; + struct rsxx_dma *tmp; + int cnt = 0; + + list_for_each_entry_safe(dma, tmp, q, list) { + list_del(&dma->list); + + if (dma->dma_addr) + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + (dma->cmd == HW_CMD_BLK_WRITE) ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + kmem_cache_free(rsxx_dma_pool, dma); + cnt++; + } + + return cnt; +} + +static int rsxx_queue_discard(struct rsxx_cardinfo *card, + struct list_head *q, + unsigned int laddr, + rsxx_dma_cb cb, + void *cb_data) +{ + struct rsxx_dma *dma; + + dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); + if (!dma) + return -ENOMEM; + + dma->cmd = HW_CMD_BLK_DISCARD; + dma->laddr = laddr; + dma->dma_addr = 0; + dma->sub_page.off = 0; + dma->sub_page.cnt = 0; + dma->page = NULL; + dma->pg_off = 0; + dma->cb = cb; + dma->cb_data = cb_data; + + dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", dma->laddr); + + list_add_tail(&dma->list, q); + + return 0; +} + +static int rsxx_queue_dma(struct rsxx_cardinfo *card, + struct list_head *q, + int dir, + unsigned int dma_off, + unsigned int dma_len, + unsigned int laddr, + struct page *page, + unsigned int pg_off, + rsxx_dma_cb cb, + void *cb_data) +{ + struct rsxx_dma *dma; + + dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); + if (!dma) + return -ENOMEM; + + dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len, + dir ? PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + if (!dma->dma_addr) { + kmem_cache_free(rsxx_dma_pool, dma); + return -ENOMEM; + } + + dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; + dma->laddr = laddr; + dma->sub_page.off = (dma_off >> 9); + dma->sub_page.cnt = (dma_len >> 9); + dma->page = page; + dma->pg_off = pg_off; + dma->cb = cb; + dma->cb_data = cb_data; + + dev_dbg(CARD_TO_DEV(card), + "Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n", + dir ? 'W' : 'R', dma->laddr, dma->sub_page.off, + dma->sub_page.cnt, dma->page, dma->pg_off); + + /* Queue the DMA */ + list_add_tail(&dma->list, q); + + return 0; +} + +int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, + struct bio *bio, + atomic_t *n_dmas, + rsxx_dma_cb cb, + void *cb_data) +{ + struct list_head dma_list[RSXX_MAX_TARGETS]; + struct bio_vec *bvec; + unsigned long long addr8; + unsigned int laddr; + unsigned int bv_len; + unsigned int bv_off; + unsigned int dma_off; + unsigned int dma_len; + int dma_cnt[RSXX_MAX_TARGETS]; + int tgt; + int st; + int i; + + addr8 = bio->bi_sector << 9; /* sectors are 512 bytes */ + atomic_set(n_dmas, 0); + + for (i = 0; i < card->n_targets; i++) { + INIT_LIST_HEAD(&dma_list[i]); + dma_cnt[i] = 0; + } + + if (bio->bi_rw & REQ_DISCARD) { + bv_len = bio->bi_size; + + while (bv_len > 0) { + tgt = rsxx_get_dma_tgt(card, addr8); + laddr = rsxx_addr8_to_laddr(addr8, card); + + st = rsxx_queue_discard(card, &dma_list[tgt], laddr, + cb, cb_data); + if (st) + goto bvec_err; + + dma_cnt[tgt]++; + atomic_inc(n_dmas); + addr8 += RSXX_HW_BLK_SIZE; + bv_len -= RSXX_HW_BLK_SIZE; + } + } else { + bio_for_each_segment(bvec, bio, i) { + bv_len = bvec->bv_len; + bv_off = bvec->bv_offset; + + while (bv_len > 0) { + tgt = rsxx_get_dma_tgt(card, addr8); + laddr = rsxx_addr8_to_laddr(addr8, card); + dma_off = addr8 & RSXX_HW_BLK_MASK; + dma_len = min(bv_len, + RSXX_HW_BLK_SIZE - dma_off); + + st = rsxx_queue_dma(card, &dma_list[tgt], + bio_data_dir(bio), + dma_off, dma_len, + laddr, bvec->bv_page, + bv_off, cb, cb_data); + if (st) + goto bvec_err; + + dma_cnt[tgt]++; + atomic_inc(n_dmas); + addr8 += dma_len; + bv_off += dma_len; + bv_len -= dma_len; + } + } + } + + for (i = 0; i < card->n_targets; i++) { + if (!list_empty(&dma_list[i])) { + spin_lock(&card->ctrl[i].queue_lock); + card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; + list_splice_tail(&dma_list[i], &card->ctrl[i].queue); + spin_unlock(&card->ctrl[i].queue_lock); + + queue_work(card->ctrl[i].issue_wq, + &card->ctrl[i].issue_dma_work); + } + } + + return 0; + +bvec_err: + for (i = 0; i < card->n_targets; i++) + rsxx_cleanup_dma_queue(card, &dma_list[i]); + + return st; +} + + +/*----------------- DMA Engine Initialization & Setup -------------------*/ +int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl) +{ + ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8, + &ctrl->status.dma_addr); + ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8, + &ctrl->cmd.dma_addr); + if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL) + return -ENOMEM; + + memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_LO); + iowrite32(upper_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_HI); + + memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO); + iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI); + + ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT); + if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading status cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT); + iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT); + + ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX); + if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + + return 0; +} + +static int rsxx_dma_ctrl_init(struct pci_dev *dev, + struct rsxx_dma_ctrl *ctrl) +{ + int i; + int st; + + memset(&ctrl->stats, 0, sizeof(ctrl->stats)); + + ctrl->trackers = vmalloc(DMA_TRACKER_LIST_SIZE8); + if (!ctrl->trackers) + return -ENOMEM; + + ctrl->trackers->head = 0; + for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { + ctrl->trackers->list[i].next_tag = i + 1; + ctrl->trackers->list[i].dma = NULL; + } + ctrl->trackers->list[RSXX_MAX_OUTSTANDING_CMDS-1].next_tag = -1; + spin_lock_init(&ctrl->trackers->lock); + + spin_lock_init(&ctrl->queue_lock); + INIT_LIST_HEAD(&ctrl->queue); + + setup_timer(&ctrl->activity_timer, dma_engine_stalled, + (unsigned long)ctrl); + + ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0); + if (!ctrl->issue_wq) + return -ENOMEM; + + ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0); + if (!ctrl->done_wq) + return -ENOMEM; + + INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas); + INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done); + + st = rsxx_hw_buffers_init(dev, ctrl); + if (st) + return st; + + return 0; +} + +static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card, + unsigned int stripe_size8) +{ + if (!is_power_of_2(stripe_size8)) { + dev_err(CARD_TO_DEV(card), + "stripe_size is NOT a power of 2!\n"); + return -EINVAL; + } + + card->_stripe.lower_mask = stripe_size8 - 1; + + card->_stripe.upper_mask = ~(card->_stripe.lower_mask); + card->_stripe.upper_shift = ffs(card->n_targets) - 1; + + card->_stripe.target_mask = card->n_targets - 1; + card->_stripe.target_shift = ffs(stripe_size8) - 1; + + dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask = x%016llx\n", + card->_stripe.lower_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift = x%016llx\n", + card->_stripe.upper_shift); + dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask = x%016llx\n", + card->_stripe.upper_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask = x%016llx\n", + card->_stripe.target_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n", + card->_stripe.target_shift); + + return 0; +} + +int rsxx_dma_configure(struct rsxx_cardinfo *card) +{ + u32 intr_coal; + + intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, + card->config.data.intr_coal.count, + card->config.data.intr_coal.latency); + iowrite32(intr_coal, card->regmap + INTR_COAL); + + return rsxx_dma_stripe_setup(card, card->config.data.stripe_size); +} + +int rsxx_dma_setup(struct rsxx_cardinfo *card) +{ + unsigned long flags; + int st; + int i; + + dev_info(CARD_TO_DEV(card), + "Initializing %d DMA targets\n", + card->n_targets); + + /* Regmap is divided up into 4K chunks. One for each DMA channel */ + for (i = 0; i < card->n_targets; i++) + card->ctrl[i].regmap = card->regmap + (i * 4096); + + card->dma_fault = 0; + + /* Reset the DMA queues */ + rsxx_dma_queue_reset(card); + + /************* Setup DMA Control *************/ + for (i = 0; i < card->n_targets; i++) { + st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]); + if (st) + goto failed_dma_setup; + + card->ctrl[i].card = card; + card->ctrl[i].id = i; + } + + card->scrub_hard = 1; + + if (card->config_valid) + rsxx_dma_configure(card); + + /* Enable the interrupts after all setup has completed. */ + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + return 0; + +failed_dma_setup: + for (i = 0; i < card->n_targets; i++) { + struct rsxx_dma_ctrl *ctrl = &card->ctrl[i]; + + if (ctrl->issue_wq) { + destroy_workqueue(ctrl->issue_wq); + ctrl->issue_wq = NULL; + } + + if (ctrl->done_wq) { + destroy_workqueue(ctrl->done_wq); + ctrl->done_wq = NULL; + } + + if (ctrl->trackers) + vfree(ctrl->trackers); + + if (ctrl->status.buf) + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + ctrl->status.buf, + ctrl->status.dma_addr); + if (ctrl->cmd.buf) + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + ctrl->cmd.buf, ctrl->cmd.dma_addr); + } + + return st; +} + + +void rsxx_dma_destroy(struct rsxx_cardinfo *card) +{ + struct rsxx_dma_ctrl *ctrl; + struct rsxx_dma *dma; + int i, j; + int cnt = 0; + + for (i = 0; i < card->n_targets; i++) { + ctrl = &card->ctrl[i]; + + if (ctrl->issue_wq) { + destroy_workqueue(ctrl->issue_wq); + ctrl->issue_wq = NULL; + } + + if (ctrl->done_wq) { + destroy_workqueue(ctrl->done_wq); + ctrl->done_wq = NULL; + } + + if (timer_pending(&ctrl->activity_timer)) + del_timer_sync(&ctrl->activity_timer); + + /* Clean up the DMA queue */ + spin_lock(&ctrl->queue_lock); + cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue); + spin_unlock(&ctrl->queue_lock); + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Freed %d queued DMAs on channel %d\n", + cnt, i); + + /* Clean up issued DMAs */ + for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { + dma = get_tracker_dma(ctrl->trackers, j); + if (dma) { + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + (dma->cmd == HW_CMD_BLK_WRITE) ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + kmem_cache_free(rsxx_dma_pool, dma); + cnt++; + } + } + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Freed %d pending DMAs on channel %d\n", + cnt, i); + + vfree(ctrl->trackers); + + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + ctrl->status.buf, ctrl->status.dma_addr); + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + ctrl->cmd.buf, ctrl->cmd.dma_addr); + } +} + +int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) +{ + int i; + int j; + int cnt; + struct rsxx_dma *dma; + struct list_head *issued_dmas; + + issued_dmas = kzalloc(sizeof(*issued_dmas) * card->n_targets, + GFP_KERNEL); + if (!issued_dmas) + return -ENOMEM; + + for (i = 0; i < card->n_targets; i++) { + INIT_LIST_HEAD(&issued_dmas[i]); + cnt = 0; + for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { + dma = get_tracker_dma(card->ctrl[i].trackers, j); + if (dma == NULL) + continue; + + if (dma->cmd == HW_CMD_BLK_WRITE) + card->ctrl[i].stats.writes_issued--; + else if (dma->cmd == HW_CMD_BLK_DISCARD) + card->ctrl[i].stats.discards_issued--; + else + card->ctrl[i].stats.reads_issued--; + + list_add_tail(&dma->list, &issued_dmas[i]); + push_tracker(card->ctrl[i].trackers, j); + cnt++; + } + + spin_lock(&card->ctrl[i].queue_lock); + list_splice(&issued_dmas[i], &card->ctrl[i].queue); + + atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); + card->ctrl[i].stats.sw_q_depth += cnt; + card->ctrl[i].e_cnt = 0; + + list_for_each_entry(dma, &card->ctrl[i].queue, list) { + if (dma->dma_addr) + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + } + spin_unlock(&card->ctrl[i].queue_lock); + } + + kfree(issued_dmas); + + return 0; +} + +void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) +{ + struct rsxx_dma *dma; + struct rsxx_dma *tmp; + int i; + + for (i = 0; i < card->n_targets; i++) { + spin_lock(&card->ctrl[i].queue_lock); + list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) { + list_del(&dma->list); + + rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED); + } + spin_unlock(&card->ctrl[i].queue_lock); + } +} + +int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) +{ + struct rsxx_dma *dma; + int i; + + for (i = 0; i < card->n_targets; i++) { + spin_lock(&card->ctrl[i].queue_lock); + list_for_each_entry(dma, &card->ctrl[i].queue, list) { + dma->dma_addr = pci_map_page(card->dev, dma->page, + dma->pg_off, get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + if (!dma->dma_addr) { + spin_unlock(&card->ctrl[i].queue_lock); + kmem_cache_free(rsxx_dma_pool, dma); + return -ENOMEM; + } + } + spin_unlock(&card->ctrl[i].queue_lock); + } + + return 0; +} + +int rsxx_dma_init(void) +{ + rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); + if (!rsxx_dma_pool) + return -ENOMEM; + + return 0; +} + + +void rsxx_dma_cleanup(void) +{ + kmem_cache_destroy(rsxx_dma_pool); +} + diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h new file mode 100644 index 00000000000..24ba3642bd8 --- /dev/null +++ b/drivers/block/rsxx/rsxx.h @@ -0,0 +1,47 @@ +/* +* Filename: rsxx.h +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_H__ +#define __RSXX_H__ + +/*----------------- IOCTL Definitions -------------------*/ + +#define RSXX_MAX_DATA 8 + +struct rsxx_reg_access { + __u32 addr; + __u32 cnt; + __u32 stat; + __u32 stream; + __u32 data[RSXX_MAX_DATA]; +}; + +#define RSXX_MAX_REG_CNT (RSXX_MAX_DATA * (sizeof(__u32))) + +#define RSXX_IOC_MAGIC 'r' + +#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access) +#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access) + +#endif /* __RSXX_H_ */ diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h new file mode 100644 index 00000000000..f384c943846 --- /dev/null +++ b/drivers/block/rsxx/rsxx_cfg.h @@ -0,0 +1,72 @@ +/* +* Filename: rsXX_cfg.h +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_CFG_H__ +#define __RSXX_CFG_H__ + +/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */ +#include <linux/types.h> + +/* + * The card config version must match the driver's expected version. If it does + * not, the DMA interfaces will not be attached and the user will need to + * initialize/upgrade the card configuration using the card config utility. + */ +#define RSXX_CFG_VERSION 4 + +struct card_cfg_hdr { + __u32 version; + __u32 crc; +}; + +struct card_cfg_data { + __u32 block_size; + __u32 stripe_size; + __u32 vendor_id; + __u32 cache_order; + struct { + __u32 mode; /* Disabled, manual, auto-tune... */ + __u32 count; /* Number of intr to coalesce */ + __u32 latency;/* Max wait time (in ns) */ + } intr_coal; +}; + +struct rsxx_card_cfg { + struct card_cfg_hdr hdr; + struct card_cfg_data data; +}; + +/* Vendor ID Values */ +#define RSXX_VENDOR_ID_IBM 0 +#define RSXX_VENDOR_ID_DSI 1 +#define RSXX_VENDOR_COUNT 2 + +/* Interrupt Coalescing Values */ +#define RSXX_INTR_COAL_DISABLED 0 +#define RSXX_INTR_COAL_EXPLICIT 1 +#define RSXX_INTR_COAL_AUTO_TUNE 2 + + +#endif /* __RSXX_CFG_H__ */ + diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h new file mode 100644 index 00000000000..382e8bf5c03 --- /dev/null +++ b/drivers/block/rsxx/rsxx_priv.h @@ -0,0 +1,419 @@ +/* +* Filename: rsxx_priv.h +* +* +* Authors: Joshua Morris <josh.h.morris@us.ibm.com> +* Philip Kelleher <pjk1939@linux.vnet.ibm.com> +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_PRIV_H__ +#define __RSXX_PRIV_H__ + +#include <linux/version.h> +#include <linux/semaphore.h> + +#include <linux/fs.h> +#include <linux/interrupt.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include <linux/sysfs.h> +#include <linux/workqueue.h> +#include <linux/bio.h> +#include <linux/vmalloc.h> +#include <linux/timer.h> +#include <linux/ioctl.h> + +#include "rsxx.h" +#include "rsxx_cfg.h" + +struct proc_cmd; + +#define PCI_DEVICE_ID_FS70_FLASH 0x04A9 +#define PCI_DEVICE_ID_FS80_FLASH 0x04AA + +#define RS70_PCI_REV_SUPPORTED 4 + +#define DRIVER_NAME "rsxx" +#define DRIVER_VERSION "4.0" + +/* Block size is 4096 */ +#define RSXX_HW_BLK_SHIFT 12 +#define RSXX_HW_BLK_SIZE (1 << RSXX_HW_BLK_SHIFT) +#define RSXX_HW_BLK_MASK (RSXX_HW_BLK_SIZE - 1) + +#define MAX_CREG_DATA8 32 +#define LOG_BUF_SIZE8 128 + +#define RSXX_MAX_OUTSTANDING_CMDS 255 +#define RSXX_CS_IDX_MASK 0xff + +#define STATUS_BUFFER_SIZE8 4096 +#define COMMAND_BUFFER_SIZE8 4096 + +#define RSXX_MAX_TARGETS 8 + +struct dma_tracker_list; + +/* DMA Command/Status Buffer structure */ +struct rsxx_cs_buffer { + dma_addr_t dma_addr; + void *buf; + u32 idx; +}; + +struct rsxx_dma_stats { + u32 crc_errors; + u32 hard_errors; + u32 soft_errors; + u32 writes_issued; + u32 writes_failed; + u32 reads_issued; + u32 reads_failed; + u32 reads_retried; + u32 discards_issued; + u32 discards_failed; + u32 done_rescheduled; + u32 issue_rescheduled; + u32 dma_sw_err; + u32 dma_hw_fault; + u32 dma_cancelled; + u32 sw_q_depth; /* Number of DMAs on the SW queue. */ + atomic_t hw_q_depth; /* Number of DMAs queued to HW. */ +}; + +struct rsxx_dma_ctrl { + struct rsxx_cardinfo *card; + int id; + void __iomem *regmap; + struct rsxx_cs_buffer status; + struct rsxx_cs_buffer cmd; + u16 e_cnt; + spinlock_t queue_lock; + struct list_head queue; + struct workqueue_struct *issue_wq; + struct work_struct issue_dma_work; + struct workqueue_struct *done_wq; + struct work_struct dma_done_work; + struct timer_list activity_timer; + struct dma_tracker_list *trackers; + struct rsxx_dma_stats stats; +}; + +struct rsxx_cardinfo { + struct pci_dev *dev; + unsigned int halt; + unsigned int eeh_state; + + void __iomem *regmap; + spinlock_t irq_lock; + unsigned int isr_mask; + unsigned int ier_mask; + + struct rsxx_card_cfg config; + int config_valid; + + /* Embedded CPU Communication */ + struct { + spinlock_t lock; + bool active; + struct creg_cmd *active_cmd; + struct work_struct done_work; + struct list_head queue; + unsigned int q_depth; + /* Cache the creg status to prevent ioreads */ + struct { + u32 stat; + u32 failed_cancel_timer; + u32 creg_timeout; + } creg_stats; + struct timer_list cmd_timer; + struct mutex reset_lock; + int reset; + } creg_ctrl; + + struct { + char tmp[MAX_CREG_DATA8]; + char buf[LOG_BUF_SIZE8]; /* terminated */ + int buf_len; + } log; + + struct work_struct event_work; + unsigned int state; + u64 size8; + + /* Lock the device attach/detach function */ + struct mutex dev_lock; + + /* Block Device Variables */ + bool bdev_attached; + int disk_id; + int major; + struct request_queue *queue; + struct gendisk *gendisk; + struct { + /* Used to convert a byte address to a device address. */ + u64 lower_mask; + u64 upper_shift; + u64 upper_mask; + u64 target_mask; + u64 target_shift; + } _stripe; + unsigned int dma_fault; + + int scrub_hard; + + int n_targets; + struct rsxx_dma_ctrl *ctrl; +}; + +enum rsxx_pci_regmap { + HWID = 0x00, /* Hardware Identification Register */ + SCRATCH = 0x04, /* Scratch/Debug Register */ + RESET = 0x08, /* Reset Register */ + ISR = 0x10, /* Interrupt Status Register */ + IER = 0x14, /* Interrupt Enable Register */ + IPR = 0x18, /* Interrupt Poll Register */ + CB_ADD_LO = 0x20, /* Command Host Buffer Address [31:0] */ + CB_ADD_HI = 0x24, /* Command Host Buffer Address [63:32]*/ + HW_CMD_IDX = 0x28, /* Hardware Processed Command Index */ + SW_CMD_IDX = 0x2C, /* Software Processed Command Index */ + SB_ADD_LO = 0x30, /* Status Host Buffer Address [31:0] */ + SB_ADD_HI = 0x34, /* Status Host Buffer Address [63:32] */ + HW_STATUS_CNT = 0x38, /* Hardware Status Counter */ + SW_STATUS_CNT = 0x3C, /* Deprecated */ + CREG_CMD = 0x40, /* CPU Command Register */ + CREG_ADD = 0x44, /* CPU Address Register */ + CREG_CNT = 0x48, /* CPU Count Register */ + CREG_STAT = 0x4C, /* CPU Status Register */ + CREG_DATA0 = 0x50, /* CPU Data Registers */ + CREG_DATA1 = 0x54, + CREG_DATA2 = 0x58, + CREG_DATA3 = 0x5C, + CREG_DATA4 = 0x60, + CREG_DATA5 = 0x64, + CREG_DATA6 = 0x68, + CREG_DATA7 = 0x6c, + INTR_COAL = 0x70, /* Interrupt Coalescing Register */ + HW_ERROR = 0x74, /* Card Error Register */ + PCI_DEBUG0 = 0x78, /* PCI Debug Registers */ + PCI_DEBUG1 = 0x7C, + PCI_DEBUG2 = 0x80, + PCI_DEBUG3 = 0x84, + PCI_DEBUG4 = 0x88, + PCI_DEBUG5 = 0x8C, + PCI_DEBUG6 = 0x90, + PCI_DEBUG7 = 0x94, + PCI_POWER_THROTTLE = 0x98, + PERF_CTRL = 0x9c, + PERF_TIMER_LO = 0xa0, + PERF_TIMER_HI = 0xa4, + PERF_RD512_LO = 0xa8, + PERF_RD512_HI = 0xac, + PERF_WR512_LO = 0xb0, + PERF_WR512_HI = 0xb4, + PCI_RECONFIG = 0xb8, +}; + +enum rsxx_intr { + CR_INTR_DMA0 = 0x00000001, + CR_INTR_CREG = 0x00000002, + CR_INTR_DMA1 = 0x00000004, + CR_INTR_EVENT = 0x00000008, + CR_INTR_DMA2 = 0x00000010, + CR_INTR_DMA3 = 0x00000020, + CR_INTR_DMA4 = 0x00000040, + CR_INTR_DMA5 = 0x00000080, + CR_INTR_DMA6 = 0x00000100, + CR_INTR_DMA7 = 0x00000200, + CR_INTR_ALL_C = 0x0000003f, + CR_INTR_ALL_G = 0x000003ff, + CR_INTR_DMA_ALL = 0x000003f5, + CR_INTR_ALL = 0xffffffff, +}; + +static inline int CR_INTR_DMA(int N) +{ + static const unsigned int _CR_INTR_DMA[] = { + CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3, + CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7 + }; + return _CR_INTR_DMA[N]; +} +enum rsxx_pci_reset { + DMA_QUEUE_RESET = 0x00000001, +}; + +enum rsxx_hw_fifo_flush { + RSXX_FLUSH_BUSY = 0x00000002, + RSXX_FLUSH_TIMEOUT = 0x00000004, +}; + +enum rsxx_pci_revision { + RSXX_DISCARD_SUPPORT = 2, + RSXX_EEH_SUPPORT = 3, +}; + +enum rsxx_creg_cmd { + CREG_CMD_TAG_MASK = 0x0000FF00, + CREG_OP_WRITE = 0x000000C0, + CREG_OP_READ = 0x000000E0, +}; + +enum rsxx_creg_addr { + CREG_ADD_CARD_CMD = 0x80001000, + CREG_ADD_CARD_STATE = 0x80001004, + CREG_ADD_CARD_SIZE = 0x8000100c, + CREG_ADD_CAPABILITIES = 0x80001050, + CREG_ADD_LOG = 0x80002000, + CREG_ADD_NUM_TARGETS = 0x80003000, + CREG_ADD_CONFIG = 0xB0000000, +}; + +enum rsxx_creg_card_cmd { + CARD_CMD_STARTUP = 1, + CARD_CMD_SHUTDOWN = 2, + CARD_CMD_LOW_LEVEL_FORMAT = 3, + CARD_CMD_FPGA_RECONFIG_BR = 4, + CARD_CMD_FPGA_RECONFIG_MAIN = 5, + CARD_CMD_BACKUP = 6, + CARD_CMD_RESET = 7, + CARD_CMD_deprecated = 8, + CARD_CMD_UNINITIALIZE = 9, + CARD_CMD_DSTROY_EMERGENCY = 10, + CARD_CMD_DSTROY_NORMAL = 11, + CARD_CMD_DSTROY_EXTENDED = 12, + CARD_CMD_DSTROY_ABORT = 13, +}; + +enum rsxx_card_state { + CARD_STATE_SHUTDOWN = 0x00000001, + CARD_STATE_STARTING = 0x00000002, + CARD_STATE_FORMATTING = 0x00000004, + CARD_STATE_UNINITIALIZED = 0x00000008, + CARD_STATE_GOOD = 0x00000010, + CARD_STATE_SHUTTING_DOWN = 0x00000020, + CARD_STATE_FAULT = 0x00000040, + CARD_STATE_RD_ONLY_FAULT = 0x00000080, + CARD_STATE_DSTROYING = 0x00000100, +}; + +enum rsxx_led { + LED_DEFAULT = 0x0, + LED_IDENTIFY = 0x1, + LED_SOAK = 0x2, +}; + +enum rsxx_creg_flash_lock { + CREG_FLASH_LOCK = 1, + CREG_FLASH_UNLOCK = 2, +}; + +enum rsxx_card_capabilities { + CARD_CAP_SUBPAGE_WRITES = 0x00000080, +}; + +enum rsxx_creg_stat { + CREG_STAT_STATUS_MASK = 0x00000003, + CREG_STAT_SUCCESS = 0x1, + CREG_STAT_ERROR = 0x2, + CREG_STAT_CHAR_PENDING = 0x00000004, /* Character I/O pending bit */ + CREG_STAT_LOG_PENDING = 0x00000008, /* HW log message pending bit */ + CREG_STAT_TAG_MASK = 0x0000ff00, +}; + +static inline unsigned int CREG_DATA(int N) +{ + return CREG_DATA0 + (N << 2); +} + +/*----------------- Convenient Log Wrappers -------------------*/ +#define CARD_TO_DEV(__CARD) (&(__CARD)->dev->dev) + +/***** config.c *****/ +int rsxx_load_config(struct rsxx_cardinfo *card); + +/***** core.c *****/ +void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr); +void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr); +void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr); +void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr); + +/***** dev.c *****/ +int rsxx_attach_dev(struct rsxx_cardinfo *card); +void rsxx_detach_dev(struct rsxx_cardinfo *card); +int rsxx_setup_dev(struct rsxx_cardinfo *card); +void rsxx_destroy_dev(struct rsxx_cardinfo *card); +int rsxx_dev_init(void); +void rsxx_dev_cleanup(void); + +/***** dma.c ****/ +typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, + void *cb_data, + unsigned int status); +int rsxx_dma_setup(struct rsxx_cardinfo *card); +void rsxx_dma_destroy(struct rsxx_cardinfo *card); +int rsxx_dma_init(void); +void rsxx_dma_cleanup(void); +void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); +int rsxx_dma_configure(struct rsxx_cardinfo *card); +int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, + struct bio *bio, + atomic_t *n_dmas, + rsxx_dma_cb cb, + void *cb_data); +int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); +int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); +void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card); +int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); + +/***** cregs.c *****/ +int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr, + unsigned int size8, + void *data, + int byte_stream); +int rsxx_creg_read(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream); +int rsxx_read_hw_log(struct rsxx_cardinfo *card); +int rsxx_get_card_state(struct rsxx_cardinfo *card, + unsigned int *state); +int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8); +int rsxx_get_num_targets(struct rsxx_cardinfo *card, + unsigned int *n_targets); +int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, + u32 *capabilities); +int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd); +int rsxx_creg_setup(struct rsxx_cardinfo *card); +void rsxx_creg_destroy(struct rsxx_cardinfo *card); +int rsxx_creg_init(void); +void rsxx_creg_cleanup(void); +int rsxx_reg_access(struct rsxx_cardinfo *card, + struct rsxx_reg_access __user *ucmd, + int read); +void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card); +void rsxx_kick_creg_queue(struct rsxx_cardinfo *card); + + + +#endif /* __DRIVERS_BLOCK_RSXX_H__ */ diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 57763c54363..758f2ac878c 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1090,10 +1090,13 @@ static const struct block_device_operations floppy_fops = { static void swim3_mb_event(struct macio_dev* mdev, int mb_state) { struct floppy_state *fs = macio_get_drvdata(mdev); - struct swim3 __iomem *sw = fs->swim3; + struct swim3 __iomem *sw; if (!fs) return; + + sw = fs->swim3; + if (mb_state != MB_FD) return; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 8ad21a25bc0..64723953e1c 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, return vbr; } -static void virtblk_add_buf_wait(struct virtio_blk *vblk, - struct virtblk_req *vbr, - unsigned long out, - unsigned long in) +static int __virtblk_add_req(struct virtqueue *vq, + struct virtblk_req *vbr, + struct scatterlist *data_sg, + bool have_data) { - DEFINE_WAIT(wait); + struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; + unsigned int num_out = 0, num_in = 0; + int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; - for (;;) { - prepare_to_wait_exclusive(&vblk->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); + sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sgs[num_out++] = &hdr; - spin_lock_irq(vblk->disk->queue->queue_lock); - if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0) { - spin_unlock_irq(vblk->disk->queue->queue_lock); - io_schedule(); - } else { - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); - break; - } + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information. + */ + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); + sgs[num_out++] = &cmd; + } + if (have_data) { + if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) + sgs[num_out++] = data_sg; + else + sgs[num_out + num_in++] = data_sg; } - finish_wait(&vblk->queue_wait, &wait); + if (type == VIRTIO_BLK_T_SCSI_CMD) { + sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); + sgs[num_out + num_in++] = &sense; + sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); + sgs[num_out + num_in++] = &inhdr; + } + + sg_init_one(&status, &vbr->status, sizeof(vbr->status)); + sgs[num_out + num_in++] = &status; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static inline void virtblk_add_req(struct virtblk_req *vbr, - unsigned int out, unsigned int in) +static void virtblk_add_req(struct virtblk_req *vbr, bool have_data) { struct virtio_blk *vblk = vbr->vblk; + DEFINE_WAIT(wait); + int ret; spin_lock_irq(vblk->disk->queue->queue_lock); - if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, - GFP_ATOMIC) < 0)) { + while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, + have_data)) < 0)) { + prepare_to_wait_exclusive(&vblk->queue_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(vblk->disk->queue->queue_lock); - virtblk_add_buf_wait(vblk, vbr, out, in); - return; + io_schedule(); + spin_lock_irq(vblk->disk->queue->queue_lock); + + finish_wait(&vblk->queue_wait, &wait); } + virtqueue_kick(vblk->vq); spin_unlock_irq(vblk->disk->queue->queue_lock); } -static int virtblk_bio_send_flush(struct virtblk_req *vbr) +static void virtblk_bio_send_flush(struct virtblk_req *vbr) { - unsigned int out = 0, in = 0; - vbr->flags |= VBLK_IS_FLUSH; vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = 0; - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status)); - - virtblk_add_req(vbr, out, in); - return 0; + virtblk_add_req(vbr, false); } -static int virtblk_bio_send_data(struct virtblk_req *vbr) +static void virtblk_bio_send_data(struct virtblk_req *vbr) { struct virtio_blk *vblk = vbr->vblk; - unsigned int num, out = 0, in = 0; struct bio *bio = vbr->bio; + bool have_data; vbr->flags &= ~VBLK_IS_FLUSH; vbr->out_hdr.type = 0; vbr->out_hdr.sector = bio->bi_sector; vbr->out_hdr.ioprio = bio_prio(bio); - sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); - - sg_set_buf(&vbr->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - - if (num) { - if (bio->bi_rw & REQ_WRITE) { + if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { + have_data = true; + if (bio->bi_rw & REQ_WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } - } + } else + have_data = false; - virtblk_add_req(vbr, out, in); - - return 0; + virtblk_add_req(vbr, have_data); } static void virtblk_bio_send_data_work(struct work_struct *work) @@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq) static bool do_req(struct request_queue *q, struct virtio_blk *vblk, struct request *req) { - unsigned long num, out = 0, in = 0; + unsigned int num; struct virtblk_req *vbr; vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); @@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, } } - sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - - /* - * If this is a packet command we need a couple of additional headers. - * Behind the normal outhdr we put a segment with the scsi command - * block, and before the normal inhdr we put the sense data and the - * inhdr with additional status information before the normal inhdr. - */ - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) - sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); - - num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); - - if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { - sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE); - sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, - sizeof(vbr->in_hdr)); - } - - sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, - sizeof(vbr->status)); - + num = blk_rq_map_sg(q, vbr->req, vblk->sg); if (num) { - if (rq_data_dir(vbr->req) == WRITE) { + if (rq_data_dir(vbr->req) == WRITE) vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out += num; - } else { + else vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - in += num; - } } - if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, - GFP_ATOMIC) < 0) { + if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) { mempool_free(vbr, vblk->pool); return false; } @@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work) struct virtio_device *vdev = vblk->vdev; struct request_queue *q = vblk->disk->queue; char cap_str_2[10], cap_str_10[10]; + char *envp[] = { "RESIZE=1", NULL }; u64 capacity, size; mutex_lock(&vblk->config_lock); @@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work) set_capacity(vblk->disk, capacity); revalidate_disk(vblk->disk); + kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); done: mutex_unlock(&vblk->config_lock); } diff --git a/drivers/block/xd.c b/drivers/block/xd.c deleted file mode 100644 index ff540520bad..00000000000 --- a/drivers/block/xd.c +++ /dev/null @@ -1,1123 +0,0 @@ -/* - * This file contains the driver for an XT hard disk controller - * (at least the DTC 5150X) for Linux. - * - * Author: Pat Mackinlay, pat@it.com.au - * Date: 29/09/92 - * - * Revised: 01/01/93, ... - * - * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler, - * kevinf@agora.rain.com) - * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and - * Wim Van Dorst. - * - * Revised: 04/04/94 by Risto Kankkunen - * Moved the detection code from xd_init() to xd_geninit() as it needed - * interrupts enabled and Linus didn't want to enable them in that first - * phase. xd_geninit() is the place to do these kinds of things anyway, - * he says. - * - * Modularized: 04/10/96 by Todd Fries, tfries@umr.edu - * - * Revised: 13/12/97 by Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl - * Fixed some problems with disk initialization and module initiation. - * Added support for manual geometry setting (except Seagate controllers) - * in form: - * xd_geo=<cyl_xda>,<head_xda>,<sec_xda>[,<cyl_xdb>,<head_xdb>,<sec_xdb>] - * Recovered DMA access. Abridged messages. Added support for DTC5051CX, - * WD1002-27X & XEBEC controllers. Driver uses now some jumper settings. - * Extended ioctl() support. - * - * Bugfix: 15/02/01, Paul G. - inform queue layer of tiny xd_maxsect. - * - */ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/genhd.h> -#include <linux/hdreg.h> -#include <linux/ioport.h> -#include <linux/init.h> -#include <linux/wait.h> -#include <linux/blkdev.h> -#include <linux/mutex.h> -#include <linux/blkpg.h> -#include <linux/delay.h> -#include <linux/io.h> -#include <linux/gfp.h> - -#include <asm/uaccess.h> -#include <asm/dma.h> - -#include "xd.h" - -static DEFINE_MUTEX(xd_mutex); -static void __init do_xd_setup (int *integers); -#ifdef MODULE -static int xd[5] = { -1,-1,-1,-1, }; -#endif - -#define XD_DONT_USE_DMA 0 /* Initial value. may be overriden using - "nodma" module option */ -#define XD_INIT_DISK_DELAY (30) /* 30 ms delay during disk initialization */ - -/* Above may need to be increased if a problem with the 2nd drive detection - (ST11M controller) or resetting a controller (WD) appears */ - -static XD_INFO xd_info[XD_MAXDRIVES]; - -/* If you try this driver and find that your card is not detected by the driver at bootup, you need to add your BIOS - signature and details to the following list of signatures. A BIOS signature is a string embedded into the first - few bytes of your controller's on-board ROM BIOS. To find out what yours is, use something like MS-DOS's DEBUG - command. Run DEBUG, and then you can examine your BIOS signature with: - - d xxxx:0000 - - where xxxx is the segment of your controller (like C800 or D000 or something). On the ASCII dump at the right, you should - be able to see a string mentioning the manufacturer's copyright etc. Add this string into the table below. The parameters - in the table are, in order: - - offset ; this is the offset (in bytes) from the start of your ROM where the signature starts - signature ; this is the actual text of the signature - xd_?_init_controller ; this is the controller init routine used by your controller - xd_?_init_drive ; this is the drive init routine used by your controller - - The controllers directly supported at the moment are: DTC 5150x, WD 1004A27X, ST11M/R and override. If your controller is - made by the same manufacturer as one of these, try using the same init routines as they do. If that doesn't work, your - best bet is to use the "override" routines. These routines use a "portable" method of getting the disk's geometry, and - may work with your card. If none of these seem to work, try sending me some email and I'll see what I can do <grin>. - - NOTE: You can now specify your XT controller's parameters from the command line in the form xd=TYPE,IRQ,IO,DMA. The driver - should be able to detect your drive's geometry from this info. (eg: xd=0,5,0x320,3 is the "standard"). */ - -#include <asm/page.h> -#define xd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL,get_order(size)) -#define xd_dma_mem_free(addr, size) free_pages(addr, get_order(size)) -static char *xd_dma_buffer; - -static XD_SIGNATURE xd_sigs[] __initdata = { - { 0x0000,"Override geometry handler",NULL,xd_override_init_drive,"n unknown" }, /* Pat Mackinlay, pat@it.com.au */ - { 0x0008,"[BXD06 (C) DTC 17-MAY-1985]",xd_dtc_init_controller,xd_dtc5150cx_init_drive," DTC 5150CX" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */ - { 0x000B,"CRD18A Not an IBM rom. (C) Copyright Data Technology Corp. 05/31/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Todd Fries, tfries@umr.edu */ - { 0x000B,"CXD23A Not an IBM ROM (C)Copyright Data Technology Corp 12/03/88",xd_dtc_init_controller,xd_dtc_init_drive," DTC 5150X" }, /* Pat Mackinlay, pat@it.com.au */ - { 0x0008,"07/15/86(C) Copyright 1986 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. 1002-27X" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */ - { 0x0008,"06/24/88(C) Copyright 1988 Western Digital Corp.",xd_wd_init_controller,xd_wd_init_drive," Western Dig. WDXT-GEN2" }, /* Dan Newcombe, newcombe@aa.csc.peachnet.edu */ - { 0x0015,"SEAGATE ST11 BIOS REVISION",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Salvador Abreu, spa@fct.unl.pt */ - { 0x0010,"ST11R BIOS",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11M/R" }, /* Risto Kankkunen, risto.kankkunen@cs.helsinki.fi */ - { 0x0010,"ST11 BIOS v1.7",xd_seagate_init_controller,xd_seagate_init_drive," Seagate ST11R" }, /* Alan Hourihane, alanh@fairlite.demon.co.uk */ - { 0x1000,"(c)Copyright 1987 SMS",xd_omti_init_controller,xd_omti_init_drive,"n OMTI 5520" }, /* Dirk Melchers, dirk@merlin.nbg.sub.org */ - { 0x0006,"COPYRIGHT XEBEC (C) 1984",xd_xebec_init_controller,xd_xebec_init_drive," XEBEC" }, /* Andrzej Krzysztofowicz, ankry@mif.pg.gda.pl */ - { 0x0008,"(C) Copyright 1984 Western Digital Corp", xd_wd_init_controller, xd_wd_init_drive," Western Dig. 1002s-wx2" }, - { 0x0008,"(C) Copyright 1986 Western Digital Corporation", xd_wd_init_controller, xd_wd_init_drive," 1986 Western Digital" }, /* jfree@sovereign.org */ -}; - -static unsigned int xd_bases[] __initdata = -{ - 0xC8000, 0xCA000, 0xCC000, - 0xCE000, 0xD0000, 0xD2000, - 0xD4000, 0xD6000, 0xD8000, - 0xDA000, 0xDC000, 0xDE000, - 0xE0000 -}; - -static DEFINE_SPINLOCK(xd_lock); - -static struct gendisk *xd_gendisk[2]; - -static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo); - -static const struct block_device_operations xd_fops = { - .owner = THIS_MODULE, - .ioctl = xd_ioctl, - .getgeo = xd_getgeo, -}; -static DECLARE_WAIT_QUEUE_HEAD(xd_wait_int); -static u_char xd_drives, xd_irq = 5, xd_dma = 3, xd_maxsectors; -static u_char xd_override __initdata = 0, xd_type __initdata = 0; -static u_short xd_iobase = 0x320; -static int xd_geo[XD_MAXDRIVES*3] __initdata = { 0, }; - -static volatile int xdc_busy; -static struct timer_list xd_watchdog_int; - -static volatile u_char xd_error; -static bool nodma = XD_DONT_USE_DMA; - -static struct request_queue *xd_queue; - -/* xd_init: register the block device number and set up pointer tables */ -static int __init xd_init(void) -{ - u_char i,controller; - unsigned int address; - int err; - -#ifdef MODULE - { - u_char count = 0; - for (i = 4; i > 0; i--) - if (((xd[i] = xd[i-1]) >= 0) && !count) - count = i; - if ((xd[0] = count)) - do_xd_setup(xd); - } -#endif - - init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog; - - err = -EBUSY; - if (register_blkdev(XT_DISK_MAJOR, "xd")) - goto out1; - - err = -ENOMEM; - xd_queue = blk_init_queue(do_xd_request, &xd_lock); - if (!xd_queue) - goto out1a; - - if (xd_detect(&controller,&address)) { - - printk("Detected a%s controller (type %d) at address %06x\n", - xd_sigs[controller].name,controller,address); - if (!request_region(xd_iobase,4,"xd")) { - printk("xd: Ports at 0x%x are not available\n", - xd_iobase); - goto out2; - } - if (controller) - xd_sigs[controller].init_controller(address); - xd_drives = xd_initdrives(xd_sigs[controller].init_drive); - - printk("Detected %d hard drive%s (using IRQ%d & DMA%d)\n", - xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma); - } - - /* - * With the drive detected, xd_maxsectors should now be known. - * If xd_maxsectors is 0, nothing was detected and we fall through - * to return -ENODEV - */ - if (!xd_dma_buffer && xd_maxsectors) { - xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); - if (!xd_dma_buffer) { - printk(KERN_ERR "xd: Out of memory.\n"); - goto out3; - } - } - - err = -ENODEV; - if (!xd_drives) - goto out3; - - for (i = 0; i < xd_drives; i++) { - XD_INFO *p = &xd_info[i]; - struct gendisk *disk = alloc_disk(64); - if (!disk) - goto Enomem; - p->unit = i; - disk->major = XT_DISK_MAJOR; - disk->first_minor = i<<6; - sprintf(disk->disk_name, "xd%c", i+'a'); - disk->fops = &xd_fops; - disk->private_data = p; - disk->queue = xd_queue; - set_capacity(disk, p->heads * p->cylinders * p->sectors); - printk(" %s: CHS=%d/%d/%d\n", disk->disk_name, - p->cylinders, p->heads, p->sectors); - xd_gendisk[i] = disk; - } - - err = -EBUSY; - if (request_irq(xd_irq,xd_interrupt_handler, 0, "XT hard disk", NULL)) { - printk("xd: unable to get IRQ%d\n",xd_irq); - goto out4; - } - - if (request_dma(xd_dma,"xd")) { - printk("xd: unable to get DMA%d\n",xd_dma); - goto out5; - } - - /* xd_maxsectors depends on controller - so set after detection */ - blk_queue_max_hw_sectors(xd_queue, xd_maxsectors); - - for (i = 0; i < xd_drives; i++) - add_disk(xd_gendisk[i]); - - return 0; - -out5: - free_irq(xd_irq, NULL); -out4: - for (i = 0; i < xd_drives; i++) - put_disk(xd_gendisk[i]); -out3: - if (xd_maxsectors) - release_region(xd_iobase,4); - - if (xd_dma_buffer) - xd_dma_mem_free((unsigned long)xd_dma_buffer, - xd_maxsectors * 0x200); -out2: - blk_cleanup_queue(xd_queue); -out1a: - unregister_blkdev(XT_DISK_MAJOR, "xd"); -out1: - return err; -Enomem: - err = -ENOMEM; - while (i--) - put_disk(xd_gendisk[i]); - goto out3; -} - -/* xd_detect: scan the possible BIOS ROM locations for the signature strings */ -static u_char __init xd_detect (u_char *controller, unsigned int *address) -{ - int i, j; - - if (xd_override) - { - *controller = xd_type; - *address = 0; - return(1); - } - - for (i = 0; i < ARRAY_SIZE(xd_bases); i++) { - void __iomem *p = ioremap(xd_bases[i], 0x2000); - if (!p) - continue; - for (j = 1; j < ARRAY_SIZE(xd_sigs); j++) { - const char *s = xd_sigs[j].string; - if (check_signature(p + xd_sigs[j].offset, s, strlen(s))) { - *controller = j; - xd_type = j; - *address = xd_bases[i]; - iounmap(p); - return 1; - } - } - iounmap(p); - } - return 0; -} - -/* do_xd_request: handle an incoming request */ -static void do_xd_request (struct request_queue * q) -{ - struct request *req; - - if (xdc_busy) - return; - - req = blk_fetch_request(q); - while (req) { - unsigned block = blk_rq_pos(req); - unsigned count = blk_rq_cur_sectors(req); - XD_INFO *disk = req->rq_disk->private_data; - int res = -EIO; - int retry; - - if (req->cmd_type != REQ_TYPE_FS) - goto done; - if (block + count > get_capacity(req->rq_disk)) - goto done; - for (retry = 0; (retry < XD_RETRIES) && !res; retry++) - res = xd_readwrite(rq_data_dir(req), disk, req->buffer, - block, count); - done: - /* wrap up, 0 = success, -errno = fail */ - if (!__blk_end_request_cur(req, res)) - req = blk_fetch_request(q); - } -} - -static int xd_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - XD_INFO *p = bdev->bd_disk->private_data; - - geo->heads = p->heads; - geo->sectors = p->sectors; - geo->cylinders = p->cylinders; - return 0; -} - -/* xd_ioctl: handle device ioctl's */ -static int xd_locked_ioctl(struct block_device *bdev, fmode_t mode, u_int cmd, u_long arg) -{ - switch (cmd) { - case HDIO_SET_DMA: - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (xdc_busy) return -EBUSY; - nodma = !arg; - if (nodma && xd_dma_buffer) { - xd_dma_mem_free((unsigned long)xd_dma_buffer, - xd_maxsectors * 0x200); - xd_dma_buffer = NULL; - } else if (!nodma && !xd_dma_buffer) { - xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); - if (!xd_dma_buffer) { - nodma = XD_DONT_USE_DMA; - return -ENOMEM; - } - } - return 0; - case HDIO_GET_DMA: - return put_user(!nodma, (long __user *) arg); - case HDIO_GET_MULTCOUNT: - return put_user(xd_maxsectors, (long __user *) arg); - default: - return -EINVAL; - } -} - -static int xd_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long param) -{ - int ret; - - mutex_lock(&xd_mutex); - ret = xd_locked_ioctl(bdev, mode, cmd, param); - mutex_unlock(&xd_mutex); - - return ret; -} - -/* xd_readwrite: handle a read/write request */ -static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_int count) -{ - int drive = p->unit; - u_char cmdblk[6],sense[4]; - u_short track,cylinder; - u_char head,sector,control,mode = PIO_MODE,temp; - char **real_buffer; - register int i; - -#ifdef DEBUG_READWRITE - printk("xd_readwrite: operation = %s, drive = %d, buffer = 0x%X, block = %d, count = %d\n",operation == READ ? "read" : "write",drive,buffer,block,count); -#endif /* DEBUG_READWRITE */ - - spin_unlock_irq(&xd_lock); - - control = p->control; - if (!xd_dma_buffer) - xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200); - while (count) { - temp = count < xd_maxsectors ? count : xd_maxsectors; - - track = block / p->sectors; - head = track % p->heads; - cylinder = track / p->heads; - sector = block % p->sectors; - -#ifdef DEBUG_READWRITE - printk("xd_readwrite: drive = %d, head = %d, cylinder = %d, sector = %d, count = %d\n",drive,head,cylinder,sector,temp); -#endif /* DEBUG_READWRITE */ - - if (xd_dma_buffer) { - mode = xd_setup_dma(operation == READ ? DMA_MODE_READ : DMA_MODE_WRITE,(u_char *)(xd_dma_buffer),temp * 0x200); - real_buffer = &xd_dma_buffer; - for (i=0; i < (temp * 0x200); i++) - xd_dma_buffer[i] = buffer[i]; - } - else - real_buffer = &buffer; - - xd_build(cmdblk,operation == READ ? CMD_READ : CMD_WRITE,drive,head,cylinder,sector,temp & 0xFF,control); - - switch (xd_command(cmdblk,mode,(u_char *)(*real_buffer),(u_char *)(*real_buffer),sense,XD_TIMEOUT)) { - case 1: - printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write")); - xd_recalibrate(drive); - spin_lock_irq(&xd_lock); - return -EIO; - case 2: - if (sense[0] & 0x30) { - printk("xd%c: %s - ",'a'+drive,(operation == READ ? "reading" : "writing")); - switch ((sense[0] & 0x30) >> 4) { - case 0: printk("drive error, code = 0x%X",sense[0] & 0x0F); - break; - case 1: printk("controller error, code = 0x%X",sense[0] & 0x0F); - break; - case 2: printk("command error, code = 0x%X",sense[0] & 0x0F); - break; - case 3: printk("miscellaneous error, code = 0x%X",sense[0] & 0x0F); - break; - } - } - if (sense[0] & 0x80) - printk(" - CHS = %d/%d/%d\n",((sense[2] & 0xC0) << 2) | sense[3],sense[1] & 0x1F,sense[2] & 0x3F); - /* reported drive number = (sense[1] & 0xE0) >> 5 */ - else - printk(" - no valid disk address\n"); - spin_lock_irq(&xd_lock); - return -EIO; - } - if (xd_dma_buffer) - for (i=0; i < (temp * 0x200); i++) - buffer[i] = xd_dma_buffer[i]; - - count -= temp, buffer += temp * 0x200, block += temp; - } - spin_lock_irq(&xd_lock); - return 0; -} - -/* xd_recalibrate: recalibrate a given drive and reset controller if necessary */ -static void xd_recalibrate (u_char drive) -{ - u_char cmdblk[6]; - - xd_build(cmdblk,CMD_RECALIBRATE,drive,0,0,0,0,0); - if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 8)) - printk("xd%c: warning! error recalibrating, controller may be unstable\n", 'a'+drive); -} - -/* xd_interrupt_handler: interrupt service routine */ -static irqreturn_t xd_interrupt_handler(int irq, void *dev_id) -{ - if (inb(XD_STATUS) & STAT_INTERRUPT) { /* check if it was our device */ -#ifdef DEBUG_OTHER - printk("xd_interrupt_handler: interrupt detected\n"); -#endif /* DEBUG_OTHER */ - outb(0,XD_CONTROL); /* acknowledge interrupt */ - wake_up(&xd_wait_int); /* and wake up sleeping processes */ - return IRQ_HANDLED; - } - else - printk("xd: unexpected interrupt\n"); - return IRQ_NONE; -} - -/* xd_setup_dma: set up the DMA controller for a data transfer */ -static u_char xd_setup_dma (u_char mode,u_char *buffer,u_int count) -{ - unsigned long f; - - if (nodma) - return (PIO_MODE); - if (((unsigned long) buffer & 0xFFFF0000) != (((unsigned long) buffer + count) & 0xFFFF0000)) { -#ifdef DEBUG_OTHER - printk("xd_setup_dma: using PIO, transfer overlaps 64k boundary\n"); -#endif /* DEBUG_OTHER */ - return (PIO_MODE); - } - - f=claim_dma_lock(); - disable_dma(xd_dma); - clear_dma_ff(xd_dma); - set_dma_mode(xd_dma,mode); - set_dma_addr(xd_dma, (unsigned long) buffer); - set_dma_count(xd_dma,count); - - release_dma_lock(f); - - return (DMA_MODE); /* use DMA and INT */ -} - -/* xd_build: put stuff into an array in a format suitable for the controller */ -static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control) -{ - cmdblk[0] = command; - cmdblk[1] = ((drive & 0x07) << 5) | (head & 0x1F); - cmdblk[2] = ((cylinder & 0x300) >> 2) | (sector & 0x3F); - cmdblk[3] = cylinder & 0xFF; - cmdblk[4] = count; - cmdblk[5] = control; - - return (cmdblk); -} - -static void xd_watchdog (unsigned long unused) -{ - xd_error = 1; - wake_up(&xd_wait_int); -} - -/* xd_waitport: waits until port & mask == flags or a timeout occurs. return 1 for a timeout */ -static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout) -{ - u_long expiry = jiffies + timeout; - int success; - - xdc_busy = 1; - while ((success = ((inb(port) & mask) != flags)) && time_before(jiffies, expiry)) - schedule_timeout_uninterruptible(1); - xdc_busy = 0; - return (success); -} - -static inline u_int xd_wait_for_IRQ (void) -{ - unsigned long flags; - xd_watchdog_int.expires = jiffies + 8 * HZ; - add_timer(&xd_watchdog_int); - - flags=claim_dma_lock(); - enable_dma(xd_dma); - release_dma_lock(flags); - - sleep_on(&xd_wait_int); - del_timer(&xd_watchdog_int); - xdc_busy = 0; - - flags=claim_dma_lock(); - disable_dma(xd_dma); - release_dma_lock(flags); - - if (xd_error) { - printk("xd: missed IRQ - command aborted\n"); - xd_error = 0; - return (1); - } - return (0); -} - -/* xd_command: handle all data transfers necessary for a single command */ -static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout) -{ - u_char cmdblk[6],csb,complete = 0; - -#ifdef DEBUG_COMMAND - printk("xd_command: command = 0x%X, mode = 0x%X, indata = 0x%X, outdata = 0x%X, sense = 0x%X\n",command,mode,indata,outdata,sense); -#endif /* DEBUG_COMMAND */ - - outb(0,XD_SELECT); - outb(mode,XD_CONTROL); - - if (xd_waitport(XD_STATUS,STAT_SELECT,STAT_SELECT,timeout)) - return (1); - - while (!complete) { - if (xd_waitport(XD_STATUS,STAT_READY,STAT_READY,timeout)) - return (1); - - switch (inb(XD_STATUS) & (STAT_COMMAND | STAT_INPUT)) { - case 0: - if (mode == DMA_MODE) { - if (xd_wait_for_IRQ()) - return (1); - } else - outb(outdata ? *outdata++ : 0,XD_DATA); - break; - case STAT_INPUT: - if (mode == DMA_MODE) { - if (xd_wait_for_IRQ()) - return (1); - } else - if (indata) - *indata++ = inb(XD_DATA); - else - inb(XD_DATA); - break; - case STAT_COMMAND: - outb(command ? *command++ : 0,XD_DATA); - break; - case STAT_COMMAND | STAT_INPUT: - complete = 1; - break; - } - } - csb = inb(XD_DATA); - - if (xd_waitport(XD_STATUS,0,STAT_SELECT,timeout)) /* wait until deselected */ - return (1); - - if (csb & CSB_ERROR) { /* read sense data if error */ - xd_build(cmdblk,CMD_SENSE,(csb & CSB_LUN) >> 5,0,0,0,0,0); - if (xd_command(cmdblk,0,sense,NULL,NULL,XD_TIMEOUT)) - printk("xd: warning! sense command failed!\n"); - } - -#ifdef DEBUG_COMMAND - printk("xd_command: completed with csb = 0x%X\n",csb); -#endif /* DEBUG_COMMAND */ - - return (csb & CSB_ERROR); -} - -static u_char __init xd_initdrives (void (*init_drive)(u_char drive)) -{ - u_char cmdblk[6],i,count = 0; - - for (i = 0; i < XD_MAXDRIVES; i++) { - xd_build(cmdblk,CMD_TESTREADY,i,0,0,0,0,0); - if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT*8)) { - msleep_interruptible(XD_INIT_DISK_DELAY); - - init_drive(count); - count++; - - msleep_interruptible(XD_INIT_DISK_DELAY); - } - } - return (count); -} - -static void __init xd_manual_geo_set (u_char drive) -{ - xd_info[drive].heads = (u_char)(xd_geo[3 * drive + 1]); - xd_info[drive].cylinders = (u_short)(xd_geo[3 * drive]); - xd_info[drive].sectors = (u_char)(xd_geo[3 * drive + 2]); -} - -static void __init xd_dtc_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xCA000: xd_iobase = 0x324; - case 0xD0000: /*5150CX*/ - case 0xD8000: break; /*5150CX & 5150XL*/ - default: printk("xd_dtc_init_controller: unsupported BIOS address %06x\n",address); - break; - } - xd_maxsectors = 0x01; /* my card seems to have trouble doing multi-block transfers? */ - - outb(0,XD_RESET); /* reset the controller */ -} - - -static void __init xd_dtc5150cx_init_drive (u_char drive) -{ - /* values from controller's BIOS - BIOS chip may be removed */ - static u_short geometry_table[][4] = { - {0x200,8,0x200,0x100}, - {0x267,2,0x267,0x267}, - {0x264,4,0x264,0x80}, - {0x132,4,0x132,0x0}, - {0x132,2,0x80, 0x132}, - {0x177,8,0x177,0x0}, - {0x132,8,0x84, 0x0}, - {}, /* not used */ - {0x132,6,0x80, 0x100}, - {0x200,6,0x100,0x100}, - {0x264,2,0x264,0x80}, - {0x280,4,0x280,0x100}, - {0x2B9,3,0x2B9,0x2B9}, - {0x2B9,5,0x2B9,0x2B9}, - {0x280,6,0x280,0x100}, - {0x132,4,0x132,0x0}}; - u_char n; - - n = inb(XD_JUMPER); - n = (drive ? n : (n >> 2)) & 0x33; - n = (n | (n >> 2)) & 0x0F; - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); - else - if (n != 7) { - xd_info[drive].heads = (u_char)(geometry_table[n][1]); /* heads */ - xd_info[drive].cylinders = geometry_table[n][0]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ -#if 0 - xd_info[drive].rwrite = geometry_table[n][2]; /* reduced write */ - xd_info[drive].precomp = geometry_table[n][3] /* write precomp */ - xd_info[drive].ecc = 0x0B; /* ecc length */ -#endif /* 0 */ - } - else { - printk("xd%c: undetermined drive geometry\n",'a'+drive); - return; - } - xd_info[drive].control = 5; /* control byte */ - xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B); - xd_recalibrate(drive); -} - -static void __init xd_dtc_init_drive (u_char drive) -{ - u_char cmdblk[6],buf[64]; - - xd_build(cmdblk,CMD_DTCGETGEOM,drive,0,0,0,0,0); - if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) { - xd_info[drive].heads = buf[0x0A]; /* heads */ - xd_info[drive].cylinders = ((u_short *) (buf))[0x04]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); -#if 0 - xd_info[drive].rwrite = ((u_short *) (buf + 1))[0x05]; /* reduced write */ - xd_info[drive].precomp = ((u_short *) (buf + 1))[0x06]; /* write precomp */ - xd_info[drive].ecc = buf[0x0F]; /* ecc length */ -#endif /* 0 */ - xd_info[drive].control = 0; /* control byte */ - - xd_setparam(CMD_DTCSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,((u_short *) (buf + 1))[0x05],((u_short *) (buf + 1))[0x06],buf[0x0F]); - xd_build(cmdblk,CMD_DTCSETSTEP,drive,0,0,0,0,7); - if (xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2)) - printk("xd_dtc_init_drive: error setting step rate for xd%c\n", 'a'+drive); - } - else - printk("xd_dtc_init_drive: error reading geometry for xd%c\n", 'a'+drive); -} - -static void __init xd_wd_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xCA000: xd_iobase = 0x324; break; - case 0xCC000: xd_iobase = 0x328; break; - case 0xCE000: xd_iobase = 0x32C; break; - case 0xD0000: xd_iobase = 0x328; break; /* ? */ - case 0xD8000: xd_iobase = 0x32C; break; /* ? */ - default: printk("xd_wd_init_controller: unsupported BIOS address %06x\n",address); - break; - } - xd_maxsectors = 0x01; /* this one doesn't wrap properly either... */ - - outb(0,XD_RESET); /* reset the controller */ - - msleep(XD_INIT_DISK_DELAY); -} - -static void __init xd_wd_init_drive (u_char drive) -{ - /* values from controller's BIOS - BIOS may be disabled */ - static u_short geometry_table[][4] = { - {0x264,4,0x1C2,0x1C2}, /* common part */ - {0x132,4,0x099,0x0}, - {0x267,2,0x1C2,0x1C2}, - {0x267,4,0x1C2,0x1C2}, - - {0x334,6,0x335,0x335}, /* 1004 series RLL */ - {0x30E,4,0x30F,0x3DC}, - {0x30E,2,0x30F,0x30F}, - {0x267,4,0x268,0x268}, - - {0x3D5,5,0x3D6,0x3D6}, /* 1002 series RLL */ - {0x3DB,7,0x3DC,0x3DC}, - {0x264,4,0x265,0x265}, - {0x267,4,0x268,0x268}}; - - u_char cmdblk[6],buf[0x200]; - u_char n = 0,rll,jumper_state,use_jumper_geo; - u_char wd_1002 = (xd_sigs[xd_type].string[7] == '6'); - - jumper_state = ~(inb(0x322)); - if (jumper_state & 0x40) - xd_irq = 9; - rll = (jumper_state & 0x30) ? (0x04 << wd_1002) : 0; - xd_build(cmdblk,CMD_READ,drive,0,0,0,1,0); - if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) { - xd_info[drive].heads = buf[0x1AF]; /* heads */ - xd_info[drive].cylinders = ((u_short *) (buf + 1))[0xD6]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); -#if 0 - xd_info[drive].rwrite = ((u_short *) (buf))[0xD8]; /* reduced write */ - xd_info[drive].wprecomp = ((u_short *) (buf))[0xDA]; /* write precomp */ - xd_info[drive].ecc = buf[0x1B4]; /* ecc length */ -#endif /* 0 */ - xd_info[drive].control = buf[0x1B5]; /* control byte */ - use_jumper_geo = !(xd_info[drive].heads) || !(xd_info[drive].cylinders); - if (xd_geo[3*drive]) { - xd_manual_geo_set(drive); - xd_info[drive].control = rll ? 7 : 5; - } - else if (use_jumper_geo) { - n = (((jumper_state & 0x0F) >> (drive << 1)) & 0x03) | rll; - xd_info[drive].cylinders = geometry_table[n][0]; - xd_info[drive].heads = (u_char)(geometry_table[n][1]); - xd_info[drive].control = rll ? 7 : 5; -#if 0 - xd_info[drive].rwrite = geometry_table[n][2]; - xd_info[drive].wprecomp = geometry_table[n][3]; - xd_info[drive].ecc = 0x0B; -#endif /* 0 */ - } - if (!wd_1002) { - if (use_jumper_geo) - xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders, - geometry_table[n][2],geometry_table[n][3],0x0B); - else - xd_setparam(CMD_WDSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders, - ((u_short *) (buf))[0xD8],((u_short *) (buf))[0xDA],buf[0x1B4]); - } - /* 1002 based RLL controller requests converted addressing, but reports physical - (physical 26 sec., logical 17 sec.) - 1004 based ???? */ - if (rll & wd_1002) { - if ((xd_info[drive].cylinders *= 26, - xd_info[drive].cylinders /= 17) > 1023) - xd_info[drive].cylinders = 1023; /* 1024 ? */ -#if 0 - xd_info[drive].rwrite *= 26; - xd_info[drive].rwrite /= 17; - xd_info[drive].wprecomp *= 26 - xd_info[drive].wprecomp /= 17; -#endif /* 0 */ - } - } - else - printk("xd_wd_init_drive: error reading geometry for xd%c\n",'a'+drive); - -} - -static void __init xd_seagate_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xD0000: xd_iobase = 0x324; break; - case 0xD8000: xd_iobase = 0x328; break; - case 0xE0000: xd_iobase = 0x32C; break; - default: printk("xd_seagate_init_controller: unsupported BIOS address %06x\n",address); - break; - } - xd_maxsectors = 0x40; - - outb(0,XD_RESET); /* reset the controller */ -} - -static void __init xd_seagate_init_drive (u_char drive) -{ - u_char cmdblk[6],buf[0x200]; - - xd_build(cmdblk,CMD_ST11GETGEOM,drive,0,0,0,1,0); - if (!xd_command(cmdblk,PIO_MODE,buf,NULL,NULL,XD_TIMEOUT * 2)) { - xd_info[drive].heads = buf[0x04]; /* heads */ - xd_info[drive].cylinders = (buf[0x02] << 8) | buf[0x03]; /* cylinders */ - xd_info[drive].sectors = buf[0x05]; /* sectors */ - xd_info[drive].control = 0; /* control byte */ - } - else - printk("xd_seagate_init_drive: error reading geometry from xd%c\n", 'a'+drive); -} - -/* Omti support courtesy Dirk Melchers */ -static void __init xd_omti_init_controller (unsigned int address) -{ - switch (address) { - case 0x00000: - case 0xC8000: break; /*initial: 0x320 */ - case 0xD0000: xd_iobase = 0x324; break; - case 0xD8000: xd_iobase = 0x328; break; - case 0xE0000: xd_iobase = 0x32C; break; - default: printk("xd_omti_init_controller: unsupported BIOS address %06x\n",address); - break; - } - - xd_maxsectors = 0x40; - - outb(0,XD_RESET); /* reset the controller */ -} - -static void __init xd_omti_init_drive (u_char drive) -{ - /* gets infos from drive */ - xd_override_init_drive(drive); - - /* set other parameters, Hardcoded, not that nice :-) */ - xd_info[drive].control = 2; -} - -/* Xebec support (AK) */ -static void __init xd_xebec_init_controller (unsigned int address) -{ -/* iobase may be set manually in range 0x300 - 0x33C - irq may be set manually to 2(9),3,4,5,6,7 - dma may be set manually to 1,2,3 - (How to detect them ???) -BIOS address may be set manually in range 0x0 - 0xF8000 -If you need non-standard settings use the xd=... command */ - - switch (address) { - case 0x00000: - case 0xC8000: /* initially: xd_iobase==0x320 */ - case 0xD0000: - case 0xD2000: - case 0xD4000: - case 0xD6000: - case 0xD8000: - case 0xDA000: - case 0xDC000: - case 0xDE000: - case 0xE0000: break; - default: printk("xd_xebec_init_controller: unsupported BIOS address %06x\n",address); - break; - } - - xd_maxsectors = 0x01; - outb(0,XD_RESET); /* reset the controller */ - - msleep(XD_INIT_DISK_DELAY); -} - -static void __init xd_xebec_init_drive (u_char drive) -{ - /* values from controller's BIOS - BIOS chip may be removed */ - static u_short geometry_table[][5] = { - {0x132,4,0x080,0x080,0x7}, - {0x132,4,0x080,0x080,0x17}, - {0x264,2,0x100,0x100,0x7}, - {0x264,2,0x100,0x100,0x17}, - {0x132,8,0x080,0x080,0x7}, - {0x132,8,0x080,0x080,0x17}, - {0x264,4,0x100,0x100,0x6}, - {0x264,4,0x100,0x100,0x17}, - {0x2BC,5,0x2BC,0x12C,0x6}, - {0x3A5,4,0x3A5,0x3A5,0x7}, - {0x26C,6,0x26C,0x26C,0x7}, - {0x200,8,0x200,0x100,0x17}, - {0x400,5,0x400,0x400,0x7}, - {0x400,6,0x400,0x400,0x7}, - {0x264,8,0x264,0x200,0x17}, - {0x33E,7,0x33E,0x200,0x7}}; - u_char n; - - n = inb(XD_JUMPER) & 0x0F; /* BIOS's drive number: same geometry - is assumed for BOTH drives */ - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); - else { - xd_info[drive].heads = (u_char)(geometry_table[n][1]); /* heads */ - xd_info[drive].cylinders = geometry_table[n][0]; /* cylinders */ - xd_info[drive].sectors = 17; /* sectors */ -#if 0 - xd_info[drive].rwrite = geometry_table[n][2]; /* reduced write */ - xd_info[drive].precomp = geometry_table[n][3] /* write precomp */ - xd_info[drive].ecc = 0x0B; /* ecc length */ -#endif /* 0 */ - } - xd_info[drive].control = geometry_table[n][4]; /* control byte */ - xd_setparam(CMD_XBSETPARAM,drive,xd_info[drive].heads,xd_info[drive].cylinders,geometry_table[n][2],geometry_table[n][3],0x0B); - xd_recalibrate(drive); -} - -/* xd_override_init_drive: this finds disk geometry in a "binary search" style, narrowing in on the "correct" number of heads - etc. by trying values until it gets the highest successful value. Idea courtesy Salvador Abreu (spa@fct.unl.pt). */ -static void __init xd_override_init_drive (u_char drive) -{ - u_short min[] = { 0,0,0 },max[] = { 16,1024,64 },test[] = { 0,0,0 }; - u_char cmdblk[6],i; - - if (xd_geo[3*drive]) - xd_manual_geo_set(drive); - else { - for (i = 0; i < 3; i++) { - while (min[i] != max[i] - 1) { - test[i] = (min[i] + max[i]) / 2; - xd_build(cmdblk,CMD_SEEK,drive,(u_char) test[0],(u_short) test[1],(u_char) test[2],0,0); - if (!xd_command(cmdblk,PIO_MODE,NULL,NULL,NULL,XD_TIMEOUT * 2)) - min[i] = test[i]; - else - max[i] = test[i]; - } - test[i] = min[i]; - } - xd_info[drive].heads = (u_char) min[0] + 1; - xd_info[drive].cylinders = (u_short) min[1] + 1; - xd_info[drive].sectors = (u_char) min[2] + 1; - } - xd_info[drive].control = 0; -} - -/* xd_setup: initialise controller from command line parameters */ -static void __init do_xd_setup (int *integers) -{ - switch (integers[0]) { - case 4: if (integers[4] < 0) - nodma = 1; - else if (integers[4] < 8) - xd_dma = integers[4]; - case 3: if ((integers[3] > 0) && (integers[3] <= 0x3FC)) - xd_iobase = integers[3]; - case 2: if ((integers[2] > 0) && (integers[2] < 16)) - xd_irq = integers[2]; - case 1: xd_override = 1; - if ((integers[1] >= 0) && (integers[1] < ARRAY_SIZE(xd_sigs))) - xd_type = integers[1]; - case 0: break; - default:printk("xd: too many parameters for xd\n"); - } - xd_maxsectors = 0x01; -} - -/* xd_setparam: set the drive characteristics */ -static void __init xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc) -{ - u_char cmdblk[14]; - - xd_build(cmdblk,command,drive,0,0,0,0,0); - cmdblk[6] = (u_char) (cylinders >> 8) & 0x03; - cmdblk[7] = (u_char) (cylinders & 0xFF); - cmdblk[8] = heads & 0x1F; - cmdblk[9] = (u_char) (rwrite >> 8) & 0x03; - cmdblk[10] = (u_char) (rwrite & 0xFF); - cmdblk[11] = (u_char) (wprecomp >> 8) & 0x03; - cmdblk[12] = (u_char) (wprecomp & 0xFF); - cmdblk[13] = ecc; - - /* Some controllers require geometry info as data, not command */ - - if (xd_command(cmdblk,PIO_MODE,NULL,&cmdblk[6],NULL,XD_TIMEOUT * 2)) - printk("xd: error setting characteristics for xd%c\n", 'a'+drive); -} - - -#ifdef MODULE - -module_param_array(xd, int, NULL, 0); -module_param_array(xd_geo, int, NULL, 0); -module_param(nodma, bool, 0); - -MODULE_LICENSE("GPL"); - -void cleanup_module(void) -{ - int i; - unregister_blkdev(XT_DISK_MAJOR, "xd"); - for (i = 0; i < xd_drives; i++) { - del_gendisk(xd_gendisk[i]); - put_disk(xd_gendisk[i]); - } - blk_cleanup_queue(xd_queue); - release_region(xd_iobase,4); - if (xd_drives) { - free_irq(xd_irq, NULL); - free_dma(xd_dma); - if (xd_dma_buffer) - xd_dma_mem_free((unsigned long)xd_dma_buffer, xd_maxsectors * 0x200); - } -} -#else - -static int __init xd_setup (char *str) -{ - int ints[5]; - get_options (str, ARRAY_SIZE (ints), ints); - do_xd_setup (ints); - return 1; -} - -/* xd_manual_geo_init: initialise drive geometry from command line parameters - (used only for WD drives) */ -static int __init xd_manual_geo_init (char *str) -{ - int i, integers[1 + 3*XD_MAXDRIVES]; - - get_options (str, ARRAY_SIZE (integers), integers); - if (integers[0]%3 != 0) { - printk("xd: incorrect number of parameters for xd_geo\n"); - return 1; - } - for (i = 0; (i < integers[0]) && (i < 3*XD_MAXDRIVES); i++) - xd_geo[i] = integers[i+1]; - return 1; -} - -__setup ("xd=", xd_setup); -__setup ("xd_geo=", xd_manual_geo_init); - -#endif /* MODULE */ - -module_init(xd_init); -MODULE_ALIAS_BLOCKDEV_MAJOR(XT_DISK_MAJOR); diff --git a/drivers/block/xd.h b/drivers/block/xd.h deleted file mode 100644 index 37cacef16e9..00000000000 --- a/drivers/block/xd.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef _LINUX_XD_H -#define _LINUX_XD_H - -/* - * This file contains the definitions for the IO ports and errors etc. for XT hard disk controllers (at least the DTC 5150X). - * - * Author: Pat Mackinlay, pat@it.com.au - * Date: 29/09/92 - * - * Revised: 01/01/93, ... - * - * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler, kevinf@agora.rain.com) - * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and Wim Van Dorst. - */ - -#include <linux/interrupt.h> - -/* XT hard disk controller registers */ -#define XD_DATA (xd_iobase + 0x00) /* data RW register */ -#define XD_RESET (xd_iobase + 0x01) /* reset WO register */ -#define XD_STATUS (xd_iobase + 0x01) /* status RO register */ -#define XD_SELECT (xd_iobase + 0x02) /* select WO register */ -#define XD_JUMPER (xd_iobase + 0x02) /* jumper RO register */ -#define XD_CONTROL (xd_iobase + 0x03) /* DMAE/INTE WO register */ -#define XD_RESERVED (xd_iobase + 0x03) /* reserved */ - -/* XT hard disk controller commands (incomplete list) */ -#define CMD_TESTREADY 0x00 /* test drive ready */ -#define CMD_RECALIBRATE 0x01 /* recalibrate drive */ -#define CMD_SENSE 0x03 /* request sense */ -#define CMD_FORMATDRV 0x04 /* format drive */ -#define CMD_VERIFY 0x05 /* read verify */ -#define CMD_FORMATTRK 0x06 /* format track */ -#define CMD_FORMATBAD 0x07 /* format bad track */ -#define CMD_READ 0x08 /* read */ -#define CMD_WRITE 0x0A /* write */ -#define CMD_SEEK 0x0B /* seek */ - -/* Controller specific commands */ -#define CMD_DTCSETPARAM 0x0C /* set drive parameters (DTC 5150X & CX only?) */ -#define CMD_DTCGETECC 0x0D /* get ecc error length (DTC 5150X only?) */ -#define CMD_DTCREADBUF 0x0E /* read sector buffer (DTC 5150X only?) */ -#define CMD_DTCWRITEBUF 0x0F /* write sector buffer (DTC 5150X only?) */ -#define CMD_DTCREMAPTRK 0x11 /* assign alternate track (DTC 5150X only?) */ -#define CMD_DTCGETPARAM 0xFB /* get drive parameters (DTC 5150X only?) */ -#define CMD_DTCSETSTEP 0xFC /* set step rate (DTC 5150X only?) */ -#define CMD_DTCSETGEOM 0xFE /* set geometry data (DTC 5150X only?) */ -#define CMD_DTCGETGEOM 0xFF /* get geometry data (DTC 5150X only?) */ -#define CMD_ST11GETGEOM 0xF8 /* get geometry data (Seagate ST11R/M only?) */ -#define CMD_WDSETPARAM 0x0C /* set drive parameters (WD 1004A27X only?) */ -#define CMD_XBSETPARAM 0x0C /* set drive parameters (XEBEC only?) */ - -/* Bits for command status byte */ -#define CSB_ERROR 0x02 /* error */ -#define CSB_LUN 0x20 /* logical Unit Number */ - -/* XT hard disk controller status bits */ -#define STAT_READY 0x01 /* controller is ready */ -#define STAT_INPUT 0x02 /* data flowing from controller to host */ -#define STAT_COMMAND 0x04 /* controller in command phase */ -#define STAT_SELECT 0x08 /* controller is selected */ -#define STAT_REQUEST 0x10 /* controller requesting data */ -#define STAT_INTERRUPT 0x20 /* controller requesting interrupt */ - -/* XT hard disk controller control bits */ -#define PIO_MODE 0x00 /* control bits to set for PIO */ -#define DMA_MODE 0x03 /* control bits to set for DMA & interrupt */ - -#define XD_MAXDRIVES 2 /* maximum 2 drives */ -#define XD_TIMEOUT HZ /* 1 second timeout */ -#define XD_RETRIES 4 /* maximum 4 retries */ - -#undef DEBUG /* define for debugging output */ - -#ifdef DEBUG - #define DEBUG_STARTUP /* debug driver initialisation */ - #define DEBUG_OVERRIDE /* debug override geometry detection */ - #define DEBUG_READWRITE /* debug each read/write command */ - #define DEBUG_OTHER /* debug misc. interrupt/DMA stuff */ - #define DEBUG_COMMAND /* debug each controller command */ -#endif /* DEBUG */ - -/* this structure defines the XT drives and their types */ -typedef struct { - u_char heads; - u_short cylinders; - u_char sectors; - u_char control; - int unit; -} XD_INFO; - -/* this structure defines a ROM BIOS signature */ -typedef struct { - unsigned int offset; - const char *string; - void (*init_controller)(unsigned int address); - void (*init_drive)(u_char drive); - const char *name; -} XD_SIGNATURE; - -#ifndef MODULE -static int xd_manual_geo_init (char *command); -#endif /* MODULE */ -static u_char xd_detect (u_char *controller, unsigned int *address); -static u_char xd_initdrives (void (*init_drive)(u_char drive)); - -static void do_xd_request (struct request_queue * q); -static int xd_ioctl (struct block_device *bdev,fmode_t mode,unsigned int cmd,unsigned long arg); -static int xd_readwrite (u_char operation,XD_INFO *disk,char *buffer,u_int block,u_int count); -static void xd_recalibrate (u_char drive); - -static irqreturn_t xd_interrupt_handler(int irq, void *dev_id); -static u_char xd_setup_dma (u_char opcode,u_char *buffer,u_int count); -static u_char *xd_build (u_char *cmdblk,u_char command,u_char drive,u_char head,u_short cylinder,u_char sector,u_char count,u_char control); -static void xd_watchdog (unsigned long unused); -static inline u_char xd_waitport (u_short port,u_char flags,u_char mask,u_long timeout); -static u_int xd_command (u_char *command,u_char mode,u_char *indata,u_char *outdata,u_char *sense,u_long timeout); - -/* card specific setup and geometry gathering code */ -static void xd_dtc_init_controller (unsigned int address); -static void xd_dtc5150cx_init_drive (u_char drive); -static void xd_dtc_init_drive (u_char drive); -static void xd_wd_init_controller (unsigned int address); -static void xd_wd_init_drive (u_char drive); -static void xd_seagate_init_controller (unsigned int address); -static void xd_seagate_init_drive (u_char drive); -static void xd_omti_init_controller (unsigned int address); -static void xd_omti_init_drive (u_char drive); -static void xd_xebec_init_controller (unsigned int address); -static void xd_xebec_init_drive (u_char drive); -static void xd_setparam (u_char command,u_char drive,u_char heads,u_short cylinders,u_short rwrite,u_short wprecomp,u_char ecc); -static void xd_override_init_drive (u_char drive); - -#endif /* _LINUX_XD_H */ diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 5ac841ff6cc..dd5b2fed97e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -46,6 +46,7 @@ #include <xen/xen.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/balloon.h> #include "common.h" /* @@ -163,7 +164,7 @@ static void make_response(struct xen_blkif *blkif, u64 id, #define foreach_grant_safe(pos, n, rbtree, node) \ for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \ - (n) = rb_next(&(pos)->node); \ + (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \ &(pos)->node != NULL; \ (pos) = container_of(n, typeof(*(pos)), node), \ (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) @@ -239,6 +240,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); BUG_ON(ret); + free_xenballooned_pages(segs_to_unmap, pages); segs_to_unmap = 0; } @@ -379,8 +381,8 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct xen_blkif *blkif) { - pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d" - " | ds %4d\n", + pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" + " | ds %4llu\n", current->comm, blkif->st_oo_req, blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req, blkif->st_ds_req); @@ -440,7 +442,7 @@ int xen_blkif_schedule(void *arg) } struct seg_buf { - unsigned long buf; + unsigned int offset; unsigned int nsec; }; /* @@ -527,8 +529,8 @@ static int xen_blkbk_map(struct blkif_request *req, GFP_KERNEL); if (!persistent_gnt) return -ENOMEM; - persistent_gnt->page = alloc_page(GFP_KERNEL); - if (!persistent_gnt->page) { + if (alloc_xenballooned_pages(1, &persistent_gnt->page, + false)) { kfree(persistent_gnt); return -ENOMEM; } @@ -619,30 +621,21 @@ static int xen_blkbk_map(struct blkif_request *req, * If this is a new persistent grant * save the handler */ - persistent_gnts[i]->handle = map[j].handle; - persistent_gnts[i]->dev_bus_addr = - map[j++].dev_bus_addr; + persistent_gnts[i]->handle = map[j++].handle; } pending_handle(pending_req, i) = persistent_gnts[i]->handle; if (ret) continue; - - seg[i].buf = persistent_gnts[i]->dev_bus_addr | - (req->u.rw.seg[i].first_sect << 9); } else { - pending_handle(pending_req, i) = map[j].handle; + pending_handle(pending_req, i) = map[j++].handle; bitmap_set(pending_req->unmap_seg, i, 1); - if (ret) { - j++; + if (ret) continue; - } - - seg[i].buf = map[j++].dev_bus_addr | - (req->u.rw.seg[i].first_sect << 9); } + seg[i].offset = (req->u.rw.seg[i].first_sect << 9); } return ret; } @@ -677,6 +670,16 @@ static int dispatch_discard_io(struct xen_blkif *blkif, return err; } +static int dispatch_other_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req) +{ + free_req(pending_req); + make_response(blkif, req->u.other.id, req->operation, + BLKIF_RSP_EOPNOTSUPP); + return -EIO; +} + static void xen_blk_drain_io(struct xen_blkif *blkif) { atomic_set(&blkif->drain, 1); @@ -798,17 +801,30 @@ __do_block_io_op(struct xen_blkif *blkif) /* Apply all sanity checks to /private copy/ of request. */ barrier(); - if (unlikely(req.operation == BLKIF_OP_DISCARD)) { + + switch (req.operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + if (dispatch_rw_block_io(blkif, &req, pending_req)) + goto done; + break; + case BLKIF_OP_DISCARD: free_req(pending_req); if (dispatch_discard_io(blkif, &req)) - break; - } else if (dispatch_rw_block_io(blkif, &req, pending_req)) + goto done; break; + default: + if (dispatch_other_io(blkif, &req, pending_req)) + goto done; + break; + } /* Yield point for this unbounded loop. */ cond_resched(); } - +done: return more_to_do; } @@ -879,7 +895,6 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - preq.dev = req->u.rw.handle; preq.sector_number = req->u.rw.sector_number; preq.nr_sects = 0; @@ -903,7 +918,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, - preq.sector_number + preq.nr_sects, preq.dev); + preq.sector_number + preq.nr_sects, + blkif->vbd.pdevice); goto fail_response; } @@ -946,7 +962,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, (bio_add_page(bio, pages[i], seg[i].nsec << 9, - seg[i].buf & ~PAGE_MASK) == 0)) { + seg[i].offset) == 0)) { bio = bio_alloc(GFP_KERNEL, nseg-i); if (unlikely(bio == NULL)) @@ -976,13 +992,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, bio->bi_end_io = end_block_io_op; } - /* - * We set it one so that the last submit_bio does not have to call - * atomic_inc. - */ atomic_set(&pending_req->pendcnt, nbio); - - /* Get a reference count for the disk queue and start sending I/O */ blk_start_plug(&plug); for (i = 0; i < nbio; i++) @@ -1010,6 +1020,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, fail_put_bio: for (i = 0; i < nbio; i++) bio_put(biolist[i]); + atomic_set(&pending_req->pendcnt, 1); __end_block_io_op(pending_req, -EINVAL); msleep(1); /* back off a bit */ return -EIO; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 6072390c7f5..60103e2517b 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -77,11 +77,18 @@ struct blkif_x86_32_request_discard { uint64_t nr_sectors; } __attribute__((__packed__)); +struct blkif_x86_32_request_other { + uint8_t _pad1; + blkif_vdev_t _pad2; + uint64_t id; /* private guest value, echoed in resp */ +} __attribute__((__packed__)); + struct blkif_x86_32_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_x86_32_request_rw rw; struct blkif_x86_32_request_discard discard; + struct blkif_x86_32_request_other other; } u; } __attribute__((__packed__)); @@ -113,11 +120,19 @@ struct blkif_x86_64_request_discard { uint64_t nr_sectors; } __attribute__((__packed__)); +struct blkif_x86_64_request_other { + uint8_t _pad1; + blkif_vdev_t _pad2; + uint32_t _pad3; /* offsetof(blkif_..,u.discard.id)==8 */ + uint64_t id; /* private guest value, echoed in resp */ +} __attribute__((__packed__)); + struct blkif_x86_64_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_x86_64_request_rw rw; struct blkif_x86_64_request_discard discard; + struct blkif_x86_64_request_other other; } u; } __attribute__((__packed__)); @@ -172,7 +187,6 @@ struct persistent_gnt { struct page *page; grant_ref_t gnt; grant_handle_t handle; - uint64_t dev_bus_addr; struct rb_node node; }; @@ -208,13 +222,13 @@ struct xen_blkif { /* statistics */ unsigned long st_print; - int st_rd_req; - int st_wr_req; - int st_oo_req; - int st_f_req; - int st_ds_req; - int st_rd_sect; - int st_wr_sect; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; wait_queue_head_t waiting_to_free; }; @@ -278,6 +292,11 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. + */ + dst->u.other.id = src->u.other.id; break; } } @@ -309,6 +328,11 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. + */ + dst->u.other.id = src->u.other.id; break; } } diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 63980722db4..8bfd1bcf95e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -230,13 +230,13 @@ int __init xen_blkif_interface_init(void) } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) -VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); -VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); -VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); -VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); -VBD_SHOW(ds_req, "%d\n", be->blkif->st_ds_req); -VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); -VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); +VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); +VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); +VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); +VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, @@ -367,6 +367,7 @@ static int xen_blkbk_remove(struct xenbus_device *dev) be->blkif = NULL; } + kfree(be->mode); kfree(be); dev_set_drvdata(&dev->dev, NULL); return 0; @@ -502,6 +503,7 @@ static void backend_changed(struct xenbus_watch *watch, = container_of(watch, struct backend_info, backend_watch); struct xenbus_device *dev = be->dev; int cdrom = 0; + unsigned long handle; char *device_type; DPRINTK(""); @@ -521,10 +523,10 @@ static void backend_changed(struct xenbus_watch *watch, return; } - if ((be->major || be->minor) && - ((be->major != major) || (be->minor != minor))) { - pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", - be->major, be->minor, major, minor); + if (be->major | be->minor) { + if (be->major != major || be->minor != minor) + pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", + be->major, be->minor, major, minor); return; } @@ -542,36 +544,33 @@ static void backend_changed(struct xenbus_watch *watch, kfree(device_type); } - if (be->major == 0 && be->minor == 0) { - /* Front end dir is a number, which is used as the handle. */ - - char *p = strrchr(dev->otherend, '/') + 1; - long handle; - err = strict_strtoul(p, 0, &handle); - if (err) - return; + /* Front end dir is a number, which is used as the handle. */ + err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); + if (err) + return; - be->major = major; - be->minor = minor; + be->major = major; + be->minor = minor; - err = xen_vbd_create(be->blkif, handle, major, minor, - (NULL == strchr(be->mode, 'w')), cdrom); - if (err) { - be->major = 0; - be->minor = 0; - xenbus_dev_fatal(dev, err, "creating vbd structure"); - return; - } + err = xen_vbd_create(be->blkif, handle, major, minor, + !strchr(be->mode, 'w'), cdrom); + if (err) + xenbus_dev_fatal(dev, err, "creating vbd structure"); + else { err = xenvbd_sysfs_addif(dev); if (err) { xen_vbd_free(&be->blkif->vbd); - be->major = 0; - be->minor = 0; xenbus_dev_fatal(dev, err, "creating sysfs entries"); - return; } + } + if (err) { + kfree(be->mode); + be->mode = NULL; + be->major = 0; + be->minor = 0; + } else { /* We're potentially connected now */ xen_update_blkif_status(be->blkif); } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 11043c18ac5..a894f88762d 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -44,7 +44,7 @@ #include <linux/mutex.h> #include <linux/scatterlist.h> #include <linux/bitmap.h> -#include <linux/llist.h> +#include <linux/list.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -68,13 +68,12 @@ enum blkif_state { struct grant { grant_ref_t gref; unsigned long pfn; - struct llist_node node; + struct list_head node; }; struct blk_shadow { struct blkif_request req; struct request *request; - unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; @@ -105,7 +104,7 @@ struct blkfront_info struct work_struct work; struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; - struct llist_head persistent_gnts; + struct list_head persistent_gnts; unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; @@ -165,6 +164,69 @@ static int add_id_to_freelist(struct blkfront_info *info, return 0; } +static int fill_grant_buffer(struct blkfront_info *info, int num) +{ + struct page *granted_page; + struct grant *gnt_list_entry, *n; + int i = 0; + + while(i < num) { + gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); + if (!gnt_list_entry) + goto out_of_memory; + + granted_page = alloc_page(GFP_NOIO); + if (!granted_page) { + kfree(gnt_list_entry); + goto out_of_memory; + } + + gnt_list_entry->pfn = page_to_pfn(granted_page); + gnt_list_entry->gref = GRANT_INVALID_REF; + list_add(&gnt_list_entry->node, &info->persistent_gnts); + i++; + } + + return 0; + +out_of_memory: + list_for_each_entry_safe(gnt_list_entry, n, + &info->persistent_gnts, node) { + list_del(&gnt_list_entry->node); + __free_page(pfn_to_page(gnt_list_entry->pfn)); + kfree(gnt_list_entry); + i--; + } + BUG_ON(i != 0); + return -ENOMEM; +} + +static struct grant *get_grant(grant_ref_t *gref_head, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry; + unsigned long buffer_mfn; + + BUG_ON(list_empty(&info->persistent_gnts)); + gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant, + node); + list_del(&gnt_list_entry->node); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) { + info->persistent_gnts_c--; + return gnt_list_entry; + } + + /* Assign a gref to this page */ + gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); + BUG_ON(gnt_list_entry->gref == -ENOSPC); + buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); + gnttab_grant_foreign_access_ref(gnt_list_entry->gref, + info->xbdev->otherend_id, + buffer_mfn, 0); + return gnt_list_entry; +} + static const char *op_name(int op) { static const char *const names[] = { @@ -293,7 +355,6 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, static int blkif_queue_request(struct request *req) { struct blkfront_info *info = req->rq_disk->private_data; - unsigned long buffer_mfn; struct blkif_request *ring_req; unsigned long id; unsigned int fsect, lsect; @@ -306,7 +367,6 @@ static int blkif_queue_request(struct request *req) */ bool new_persistent_gnts; grant_ref_t gref_head; - struct page *granted_page; struct grant *gnt_list_entry = NULL; struct scatterlist *sg; @@ -370,41 +430,8 @@ static int blkif_queue_request(struct request *req) fsect = sg->offset >> 9; lsect = fsect + (sg->length >> 9) - 1; - if (info->persistent_gnts_c) { - BUG_ON(llist_empty(&info->persistent_gnts)); - gnt_list_entry = llist_entry( - llist_del_first(&info->persistent_gnts), - struct grant, node); - - ref = gnt_list_entry->gref; - buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); - info->persistent_gnts_c--; - } else { - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); - - gnt_list_entry = - kmalloc(sizeof(struct grant), - GFP_ATOMIC); - if (!gnt_list_entry) - return -ENOMEM; - - granted_page = alloc_page(GFP_ATOMIC); - if (!granted_page) { - kfree(gnt_list_entry); - return -ENOMEM; - } - - gnt_list_entry->pfn = - page_to_pfn(granted_page); - gnt_list_entry->gref = ref; - - buffer_mfn = pfn_to_mfn(page_to_pfn( - granted_page)); - gnttab_grant_foreign_access_ref(ref, - info->xbdev->otherend_id, - buffer_mfn, 0); - } + gnt_list_entry = get_grant(&gref_head, info); + ref = gnt_list_entry->gref; info->shadow[id].grants_used[i] = gnt_list_entry; @@ -435,7 +462,6 @@ static int blkif_queue_request(struct request *req) kunmap_atomic(shared_data); } - info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); ring_req->u.rw.seg[i] = (struct blkif_request_segment) { .gref = ref, @@ -790,9 +816,8 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free(struct blkfront_info *info, int suspend) { - struct llist_node *all_gnts; struct grant *persistent_gnt; - struct llist_node *n; + struct grant *n; /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); @@ -803,15 +828,20 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_stop_queue(info->rq); /* Remove all persistent grants */ - if (info->persistent_gnts_c) { - all_gnts = llist_del_all(&info->persistent_gnts); - llist_for_each_entry_safe(persistent_gnt, n, all_gnts, node) { - gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + if (!list_empty(&info->persistent_gnts)) { + list_for_each_entry_safe(persistent_gnt, n, + &info->persistent_gnts, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + info->persistent_gnts_c--; + } __free_page(pfn_to_page(persistent_gnt->pfn)); kfree(persistent_gnt); } - info->persistent_gnts_c = 0; } + BUG_ON(info->persistent_gnts_c != 0); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); @@ -868,7 +898,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, } /* Add the persistent grant into the list of free grants */ for (i = 0; i < s->req.u.rw.nr_segments; i++) { - llist_add(&s->grants_used[i]->node, &info->persistent_gnts); + list_add(&s->grants_used[i]->node, &info->persistent_gnts); info->persistent_gnts_c++; } } @@ -1006,6 +1036,12 @@ static int setup_blkring(struct xenbus_device *dev, sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); + /* Allocate memory for grants */ + err = fill_grant_buffer(info, BLK_RING_SIZE * + BLKIF_MAX_SEGMENTS_PER_REQUEST); + if (err) + goto fail; + err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); if (err < 0) { free_page((unsigned long)sring); @@ -1164,7 +1200,7 @@ static int blkfront_probe(struct xenbus_device *dev, spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; - init_llist_head(&info->persistent_gnts); + INIT_LIST_HEAD(&info->persistent_gnts); info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); @@ -1196,11 +1232,10 @@ static int blkif_recover(struct blkfront_info *info) int j; /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmalloc(sizeof(info->shadow), + copy = kmemdup(info->shadow, sizeof(info->shadow), GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); if (!copy) return -ENOMEM; - memcpy(copy, info->shadow, sizeof(info->shadow)); /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); @@ -1229,7 +1264,7 @@ static int blkif_recover(struct blkfront_info *info) gnttab_grant_foreign_access_ref( req->u.rw.seg[j].gref, info->xbdev->otherend_id, - pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), + pfn_to_mfn(copy[i].grants_used[j]->pfn), 0); } info->shadow[req->u.rw.id].req = *req; |