[SCSI] Handle disk devices which can not process medium access commands

We have experienced several devices which fail in a fashion we do not currently handle gracefully in SCSI. After a failure these devices will respond to the SCSI primary command set (INQUIRY, TEST UNIT READY, etc.) but any command accessing the storage medium will time out. The following patch adds an callback that can be used by upper level drivers to inspect the results of an error handling command. This in turn has been used to implement additional checking in the SCSI disk driver. If a medium access command fails twice but TEST UNIT READY succeeds both times in the subsequent error handling we will offline the device. The maximum number of failed commands required to take a device offline can be tweaked in sysfs. Also add a new error flag to scsi_debug which allows this scenario to be easily reproduced. [jejb: fix up integer parsing to use kstrtouint] Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> Signed-off-by: James Bottomley <JBottomley@Parallels.com>
author: Martin K. Petersen <martin.petersen@oracle.com> 2012-02-09 13:48:53 -0500
committer: James Bottomley <JBottomley@Parallels.com> 2012-02-19 10:14:52 -0600
commit: 18a4d0a22ed6c54b67af7718c305cd010f09ddf8 (patch)
tree: 06e22a92290ff84b2c1d5abb09424493de384c4b /drivers/scsi/sd.c
parent: a78e21dc5e9f896ecee5b1fbe189690dfcca38e1 (diff)
1 files changed, 82 insertions, 0 deletions
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 8c525aa1b85..bd17cf8af01 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -107,6 +107,7 @@ static int sd_suspend(struct device *, pm_message_t state);
 static int sd_resume(struct device *);
 static void sd_rescan(struct device *);
 static int sd_done(struct scsi_cmnd *);
+static int sd_eh_action(struct scsi_cmnd *, unsigned char *, int, int);
 static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
 static void scsi_disk_release(struct device *cdev);
 static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *);
@@ -346,6 +347,31 @@ sd_store_provisioning_mode(struct device *dev, struct device_attribute *attr,
 	return count;
 }
 
+static ssize_t
+sd_show_max_medium_access_timeouts(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+	return snprintf(buf, 20, "%u\n", sdkp->max_medium_access_timeouts);
+}
+
+static ssize_t
+sd_store_max_medium_access_timeouts(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	err = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts);
+
+	return err ? err : count;
+}
+
 static struct device_attribute sd_disk_attrs[] = {
 	__ATTR(cache_type, S_IRUGO|S_IWUSR, sd_show_cache_type,
 	       sd_store_cache_type),
@@ -360,6 +386,9 @@ static struct device_attribute sd_disk_attrs[] = {
 	__ATTR(thin_provisioning, S_IRUGO, sd_show_thin_provisioning, NULL),
 	__ATTR(provisioning_mode, S_IRUGO|S_IWUSR, sd_show_provisioning_mode,
 	       sd_store_provisioning_mode),
+	__ATTR(max_medium_access_timeouts, S_IRUGO|S_IWUSR,
+	       sd_show_max_medium_access_timeouts,
+	       sd_store_max_medium_access_timeouts),
 	__ATTR_NULL,
 };
 
@@ -382,6 +411,7 @@ static struct scsi_driver sd_template = {
 	},
 	.rescan			= sd_rescan,
 	.done			= sd_done,
+	.eh_action		= sd_eh_action,
 };
 
 /*
@@ -1313,6 +1343,55 @@ static const struct block_device_operations sd_fops = {
 	.unlock_native_capacity	= sd_unlock_native_capacity,
 };
 
+/**
+ *	sd_eh_action - error handling callback
+ *	@scmd:		sd-issued command that has failed
+ *	@eh_cmnd:	The command that was sent during error handling
+ *	@eh_cmnd_len:	Length of eh_cmnd in bytes
+ *	@eh_disp:	The recovery disposition suggested by the midlayer
+ *
+ *	This function is called by the SCSI midlayer upon completion of
+ *	an error handling command (TEST UNIT READY, START STOP UNIT,
+ *	etc.) The command sent to the device by the error handler is
+ *	stored in eh_cmnd. The result of sending the eh command is
+ *	passed in eh_disp.
+ **/
+static int sd_eh_action(struct scsi_cmnd *scmd, unsigned char *eh_cmnd,
+			int eh_cmnd_len, int eh_disp)
+{
+	struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk);
+
+	if (!scsi_device_online(scmd->device) ||
+	    !scsi_medium_access_command(scmd))
+		return eh_disp;
+
+	/*
+	 * The device has timed out executing a medium access command.
+	 * However, the TEST UNIT READY command sent during error
+	 * handling completed successfully. Either the device is in the
+	 * process of recovering or has it suffered an internal failure
+	 * that prevents access to the storage medium.
+	 */
+	if (host_byte(scmd->result) == DID_TIME_OUT && eh_disp == SUCCESS &&
+	    eh_cmnd_len && eh_cmnd[0] == TEST_UNIT_READY)
+		sdkp->medium_access_timed_out++;
+
+	/*
+	 * If the device keeps failing read/write commands but TEST UNIT
+	 * READY always completes successfully we assume that medium
+	 * access is no longer possible and take the device offline.
+	 */
+	if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) {
+		scmd_printk(KERN_ERR, scmd,
+			    "Medium access timeout failure. Offlining disk!\n");
+		scsi_device_set_state(scmd->device, SDEV_OFFLINE);
+
+		return FAILED;
+	}
+
+	return eh_disp;
+}
+
 static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
 {
 	u64 start_lba = blk_rq_pos(scmd->request);
@@ -1402,6 +1481,8 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 	    (!sense_valid || sense_deferred))
 		goto out;
 
+	sdkp->medium_access_timed_out = 0;
+
 	switch (sshdr.sense_key) {
 	case HARDWARE_ERROR:
 	case MEDIUM_ERROR:
@@ -2523,6 +2604,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
 	sdkp->RCD = 0;
 	sdkp->ATO = 0;
 	sdkp->first_scan = 1;
+	sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;
 
 	sd_revalidate_disk(gd);
author	Martin K. Petersen <martin.petersen@oracle.com>	2012-02-09 13:48:53 -0500
committer	James Bottomley <JBottomley@Parallels.com>	2012-02-19 10:14:52 -0600
commit	18a4d0a22ed6c54b67af7718c305cd010f09ddf8 (patch)
tree	06e22a92290ff84b2c1d5abb09424493de384c4b /drivers/scsi/sd.c
parent	a78e21dc5e9f896ecee5b1fbe189690dfcca38e1 (diff)