diff options
Diffstat (limited to 'drivers/block')
41 files changed, 1585 insertions, 1368 deletions
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 748dea4f34d..758da2287d9 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1406,7 +1406,7 @@ next_segment: track = block / (floppy->dtype->sects * floppy->type->sect_mult); sector = block % (floppy->dtype->sects * floppy->type->sect_mult); - data = rq->buffer + 512 * cnt; + data = bio_data(rq->bio) + 512 * cnt; #ifdef DEBUG printk("access to track %d, sector %d, with buffer at " "0x%08lx\n", track, sector, data); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 96b629e1f0c..2104b1b4ccd 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1484,7 +1484,7 @@ repeat: ReqCnt = 0; ReqCmd = rq_data_dir(fd_request); ReqBlock = blk_rq_pos(fd_request); - ReqBuffer = fd_request->buffer; + ReqBuffer = bio_data(fd_request->bio); setup_req_params( drive ); do_fd_action( drive ); @@ -1952,7 +1952,7 @@ static int __init atari_floppy_init (void) goto Enomem; } TrackBuffer = DMABuffer + 512; - PhysDMABuffer = virt_to_phys(DMABuffer); + PhysDMABuffer = atari_stram_to_phys(DMABuffer); PhysTrackBuffer = virt_to_phys(TrackBuffer); BufferDrive = BufferSide = BufferTrack = -1; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index e73b85cf075..c7d138eca73 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -200,11 +200,11 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) copy = min_t(size_t, n, PAGE_SIZE - offset); if (!brd_insert_page(brd, sector)) - return -ENOMEM; + return -ENOSPC; if (copy < n) { sector += copy >> SECTOR_SHIFT; if (!brd_insert_page(brd, sector)) - return -ENOMEM; + return -ENOSPC; } return 0; } @@ -360,6 +360,15 @@ out: bio_endio(bio, err); } +static int brd_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct brd_device *brd = bdev->bd_disk->private_data; + int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, err); + return err; +} + #ifdef CONFIG_BLK_DEV_XIP static int brd_direct_access(struct block_device *bdev, sector_t sector, void **kaddr, unsigned long *pfn) @@ -375,7 +384,7 @@ static int brd_direct_access(struct block_device *bdev, sector_t sector, return -ERANGE; page = brd_insert_page(brd, sector); if (!page) - return -ENOMEM; + return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn(page); @@ -419,6 +428,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode, static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, + .rw_page = brd_rw_page, .ioctl = brd_ioctl, #ifdef CONFIG_BLK_DEV_XIP .direct_access = brd_direct_access, diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 73894ca3395..4595c22f33f 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4080,7 +4080,7 @@ static void cciss_interrupt_mode(ctlr_info_t *h) goto default_int_mode; if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) { - err = pci_enable_msix(h->pdev, cciss_msix_entries, 4); + err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4); if (!err) { h->intr[0] = cciss_msix_entries[0].vector; h->intr[1] = cciss_msix_entries[1].vector; @@ -4088,10 +4088,6 @@ static void cciss_interrupt_mode(ctlr_info_t *h) h->intr[3] = cciss_msix_entries[3].vector; h->msix_vector = 1; return; - } - if (err > 0) { - dev_warn(&h->pdev->dev, - "only %d MSI-X vectors available\n", err); } else { dev_warn(&h->pdev->dev, "MSI-X init failed %d\n", err); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 90ae4ba8f9e..05a1780ffa8 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -29,7 +29,6 @@ #include <linux/drbd_limits.h> #include <linux/dynamic_debug.h> #include "drbd_int.h" -#include "drbd_wrappers.h" enum al_transaction_types { @@ -204,7 +203,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd BUG_ON(!bdev->md_bdev); - drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", + dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", (void*)_RET_IP_ ); @@ -276,7 +275,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval return _al_get(device, first, true); } -static bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, @@ -846,7 +844,7 @@ void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size, int wake_up = 0; unsigned long flags; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -920,7 +918,7 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size if (size == 0) return 0; - if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; @@ -1023,8 +1021,7 @@ int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) unsigned int enr = BM_SECT_TO_EXT(sector); struct bm_extent *bm_ext; int i, sig; - int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. - 200 times -> 20 seconds. */ + bool sa; retry: sig = wait_event_interruptible(device->al_wait, @@ -1035,12 +1032,15 @@ retry: if (test_bit(BME_LOCKED, &bm_ext->flags)) return 0; + /* step aside only while we are above c-min-rate; unless disabled. */ + sa = drbd_rs_c_min_rate_throttle(device); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(device->al_wait, !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || - test_bit(BME_PRIORITY, &bm_ext->flags)); + (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); - if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { + if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { spin_lock_irq(&device->al_lock); if (lc_put(device->resync, &bm_ext->lce) == 0) { bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ @@ -1052,9 +1052,6 @@ retry: return -EINTR; if (schedule_timeout_interruptible(HZ/10)) return -EINTR; - if (sa && --sa == 0) - drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec." - "Resync stalled?\n"); goto retry; } } @@ -1288,7 +1285,7 @@ void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) sector_t esector, nr_sectors; int wake_up = 0; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e7093d4291f..a76ceb344d6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -382,6 +382,12 @@ enum { __EE_CALL_AL_COMPLETE_IO, __EE_MAY_SET_IN_SYNC, + /* is this a TRIM aka REQ_DISCARD? */ + __EE_IS_TRIM, + /* our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_IS_TRIM_USE_ZEROOUT, + /* In case a barrier failed, * we need to resubmit without the barrier flag. */ __EE_RESUBMITTED, @@ -405,7 +411,9 @@ enum { }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) -#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) #define EE_WAS_ERROR (1<<__EE_WAS_ERROR) #define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) @@ -579,6 +587,7 @@ struct drbd_resource { struct list_head resources; struct res_opts res_opts; struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + struct mutex adm_mutex; /* mutex to serialize administrative requests */ spinlock_t req_lock; unsigned susp:1; /* IO suspended by user */ @@ -609,6 +618,7 @@ struct drbd_connection { struct drbd_socket data; /* data/barrier/cstate/parameter packets */ struct drbd_socket meta; /* ping/ack (metadata) packets */ int agreed_pro_version; /* actually used protocol version */ + u32 agreed_features; unsigned long last_received; /* in jiffies, either socket */ unsigned int ko_count; @@ -814,6 +824,28 @@ struct drbd_device { struct submit_worker submit; }; +struct drbd_config_context { + /* assigned from drbd_genlmsghdr */ + unsigned int minor; + /* assigned from request attributes, if present */ + unsigned int volume; +#define VOLUME_UNSPECIFIED (-1U) + /* pointer into the request skb, + * limited lifetime! */ + char *resource_name; + struct nlattr *my_addr; + struct nlattr *peer_addr; + + /* reply buffer */ + struct sk_buff *reply_skb; + /* pointer into reply buffer */ + struct drbd_genlmsghdr *reply_dh; + /* resolved from attributes, if possible */ + struct drbd_device *device; + struct drbd_resource *resource; + struct drbd_connection *connection; +}; + static inline struct drbd_device *minor_to_device(unsigned int minor) { return (struct drbd_device *)idr_find(&drbd_devices, minor); @@ -821,7 +853,7 @@ static inline struct drbd_device *minor_to_device(unsigned int minor) static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device) { - return list_first_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } #define for_each_resource(resource, _resources) \ @@ -1139,6 +1171,12 @@ struct bm_extent { #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ +/* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ +#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE +#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) + extern int drbd_bm_init(struct drbd_device *device); extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); extern void drbd_bm_cleanup(struct drbd_device *device); @@ -1229,9 +1267,9 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); extern rwlock_t global_state_lock; extern int conn_lowest_minor(struct drbd_connection *connection); -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr); +extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); extern void drbd_destroy_device(struct kref *kref); -extern void drbd_delete_device(struct drbd_device *mdev); +extern void drbd_delete_device(struct drbd_device *device); extern struct drbd_resource *drbd_create_resource(const char *name); extern void drbd_free_resource(struct drbd_resource *resource); @@ -1257,7 +1295,7 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(const char *info); +extern int drbd_msg_put_info(struct sk_buff *skb, const char *info); extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1283,6 +1321,10 @@ extern void conn_try_outdate_peer_async(struct drbd_connection *connection); extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ +/* bi_end_io handlers */ +extern void drbd_md_io_complete(struct bio *bio, int error); +extern void drbd_peer_request_endio(struct bio *bio, int error); +extern void drbd_request_endio(struct bio *bio, int error); extern int drbd_worker(struct drbd_thread *thi); enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); void drbd_resync_after_changed(struct drbd_device *device); @@ -1332,16 +1374,20 @@ extern int w_start_resync(struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); extern void start_resync_timer_fn(unsigned long data); +extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); + /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_asender(struct drbd_thread *thi); -extern int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); +extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); extern int drbd_submit_peer_request(struct drbd_device *, struct drbd_peer_request *, const unsigned, const int); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, + bool, gfp_t) __must_hold(local); extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, int); @@ -1401,6 +1447,37 @@ static inline void drbd_tcp_quickack(struct socket *sock) (char*)&val, sizeof(val)); } +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_device *device, + sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_device *device, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + printk(KERN_ERR "drbd%d: drbd_generic_make_request: " + "bio->bi_bdev == NULL\n", + device_to_minor(device)); + dump_stack(); + bio_endio(bio, -ENODEV); + return; + } + + if (drbd_insert_fault(device, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); /* drbd_proc.c */ @@ -1410,6 +1487,7 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ +extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); @@ -2144,7 +2222,7 @@ static inline void drbd_md_flush(struct drbd_device *device) static inline struct drbd_connection *first_connection(struct drbd_resource *resource) { - return list_first_entry(&resource->connections, + return list_first_entry_or_null(&resource->connections, struct drbd_connection, connections); } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 331e5cc1227..960645c26e6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1607,8 +1607,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long b return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; } -/* Used to send write requests - * R_PRIMARY -> Peer (P_DATA) +/* Used to send write or TRIM aka REQ_DISCARD requests + * R_PRIMARY -> Peer (P_DATA, P_TRIM) */ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) { @@ -1640,6 +1640,16 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * dp_flags |= DP_SEND_WRITE_ACK; } p->dp_flags = cpu_to_be32(dp_flags); + + if (dp_flags & DP_DISCARD) { + struct p_trim *t = (struct p_trim*)p; + t->size = cpu_to_be32(req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + goto out; + } + + /* our digest is still only over the payload. + * TRIM does not carry any payload. */ if (dgs) drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); @@ -1675,6 +1685,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * ... Be noisy about digest too large ... } */ } +out: mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ return err; @@ -2570,6 +2581,7 @@ struct drbd_resource *drbd_create_resource(const char *name) INIT_LIST_HEAD(&resource->connections); list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); + mutex_init(&resource->adm_mutex); spin_lock_init(&resource->req_lock); return resource; @@ -2687,14 +2699,16 @@ static int init_submitter(struct drbd_device *device) return 0; } -enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr) +enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor) { + struct drbd_resource *resource = adm_ctx->resource; struct drbd_connection *connection; struct drbd_device *device; struct drbd_peer_device *peer_device, *tmp_peer_device; struct gendisk *disk; struct request_queue *q; int id; + int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; device = minor_to_device(minor); @@ -2763,7 +2777,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_no_minor_idr; } @@ -2773,7 +2787,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_MINOR_EXISTS; - drbd_msg_put_info("requested minor exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_idr_remove_minor; } @@ -2794,7 +2808,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (id < 0) { if (id == -ENOSPC) { err = ERR_INVALID_REQUEST; - drbd_msg_put_info("requested volume exists already"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already"); } goto out_idr_remove_from_resource; } @@ -2803,7 +2817,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i if (init_submitter(device)) { err = ERR_NOMEM; - drbd_msg_put_info("unable to create submit workqueue"); + drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue"); goto out_idr_remove_vol; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 526414bc2ca..1b35c45c92b 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -34,7 +34,6 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" -#include "drbd_wrappers.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -82,32 +81,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; -/* Configuration is strictly serialized, because generic netlink message - * processing is strictly serialized by the genl_lock(). - * Which means we can use one static global drbd_config_context struct. - */ -static struct drbd_config_context { - /* assigned from drbd_genlmsghdr */ - unsigned int minor; - /* assigned from request attributes, if present */ - unsigned int volume; -#define VOLUME_UNSPECIFIED (-1U) - /* pointer into the request skb, - * limited lifetime! */ - char *resource_name; - struct nlattr *my_addr; - struct nlattr *peer_addr; - - /* reply buffer */ - struct sk_buff *reply_skb; - /* pointer into reply buffer */ - struct drbd_genlmsghdr *reply_dh; - /* resolved from attributes, if possible */ - struct drbd_device *device; - struct drbd_resource *resource; - struct drbd_connection *connection; -} adm_ctx; - static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) { genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); @@ -117,9 +90,8 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only * reason it could fail was no space in skb, and there are 4k available. */ -int drbd_msg_put_info(const char *info) +int drbd_msg_put_info(struct sk_buff *skb, const char *info) { - struct sk_buff *skb = adm_ctx.reply_skb; struct nlattr *nla; int err = -EMSGSIZE; @@ -143,42 +115,46 @@ int drbd_msg_put_info(const char *info) * and per-family private info->pointers. * But we need to stay compatible with older kernels. * If it returns successfully, adm_ctx members are valid. + * + * At this point, we still rely on the global genl_lock(). + * If we want to avoid that, and allow "genl_family.parallel_ops", we may need + * to add additional synchronization against object destruction/modification. */ #define DRBD_ADM_NEED_MINOR 1 #define DRBD_ADM_NEED_RESOURCE 2 #define DRBD_ADM_NEED_CONNECTION 4 -static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, - unsigned flags) +static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, + struct sk_buff *skb, struct genl_info *info, unsigned flags) { struct drbd_genlmsghdr *d_in = info->userhdr; const u8 cmd = info->genlhdr->cmd; int err; - memset(&adm_ctx, 0, sizeof(adm_ctx)); + memset(adm_ctx, 0, sizeof(*adm_ctx)); /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) return -EPERM; - adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); - if (!adm_ctx.reply_skb) { + adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx->reply_skb) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, + adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb, info, &drbd_genl_family, 0, cmd); /* put of a few bytes into a fresh skb of >= 4k will always succeed. * but anyways */ - if (!adm_ctx.reply_dh) { + if (!adm_ctx->reply_dh) { err = -ENOMEM; goto fail; } - adm_ctx.reply_dh->minor = d_in->minor; - adm_ctx.reply_dh->ret_code = NO_ERROR; + adm_ctx->reply_dh->minor = d_in->minor; + adm_ctx->reply_dh->ret_code = NO_ERROR; - adm_ctx.volume = VOLUME_UNSPECIFIED; + adm_ctx->volume = VOLUME_UNSPECIFIED; if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { struct nlattr *nla; /* parse and validate only */ @@ -188,111 +164,131 @@ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, /* It was present, and valid, * copy it over to the reply skb. */ - err = nla_put_nohdr(adm_ctx.reply_skb, + err = nla_put_nohdr(adm_ctx->reply_skb, info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, info->attrs[DRBD_NLA_CFG_CONTEXT]); if (err) goto fail; - /* and assign stuff to the global adm_ctx */ + /* and assign stuff to the adm_ctx */ nla = nested_attr_tb[__nla_type(T_ctx_volume)]; if (nla) - adm_ctx.volume = nla_get_u32(nla); + adm_ctx->volume = nla_get_u32(nla); nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; if (nla) - adm_ctx.resource_name = nla_data(nla); - adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; - adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; - if ((adm_ctx.my_addr && - nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) || - (adm_ctx.peer_addr && - nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) { + adm_ctx->resource_name = nla_data(nla); + adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx->my_addr && + nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || + (adm_ctx->peer_addr && + nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) { err = -EINVAL; goto fail; } } - adm_ctx.minor = d_in->minor; - adm_ctx.device = minor_to_device(d_in->minor); - if (adm_ctx.resource_name) { - adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name); + adm_ctx->minor = d_in->minor; + adm_ctx->device = minor_to_device(d_in->minor); + + /* We are protected by the global genl_lock(). + * But we may explicitly drop it/retake it in drbd_adm_set_role(), + * so make sure this object stays around. */ + if (adm_ctx->device) + kref_get(&adm_ctx->device->kref); + + if (adm_ctx->resource_name) { + adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name); } - if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) { - drbd_msg_put_info("unknown minor"); + if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); return ERR_MINOR_INVALID; } - if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) { - drbd_msg_put_info("unknown resource"); - if (adm_ctx.resource_name) + if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); + if (adm_ctx->resource_name) return ERR_RES_NOT_KNOWN; return ERR_INVALID_REQUEST; } if (flags & DRBD_ADM_NEED_CONNECTION) { - if (adm_ctx.resource) { - drbd_msg_put_info("no resource name expected"); + if (adm_ctx->resource) { + drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device) { - drbd_msg_put_info("no minor number expected"); + if (adm_ctx->device) { + drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); return ERR_INVALID_REQUEST; } - if (adm_ctx.my_addr && adm_ctx.peer_addr) - adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr), - nla_len(adm_ctx.my_addr), - nla_data(adm_ctx.peer_addr), - nla_len(adm_ctx.peer_addr)); - if (!adm_ctx.connection) { - drbd_msg_put_info("unknown connection"); + if (adm_ctx->my_addr && adm_ctx->peer_addr) + adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr), + nla_len(adm_ctx->my_addr), + nla_data(adm_ctx->peer_addr), + nla_len(adm_ctx->peer_addr)); + if (!adm_ctx->connection) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); return ERR_INVALID_REQUEST; } } /* some more paranoia, if the request was over-determined */ - if (adm_ctx.device && adm_ctx.resource && - adm_ctx.device->resource != adm_ctx.resource) { + if (adm_ctx->device && adm_ctx->resource && + adm_ctx->device->resource != adm_ctx->resource) { pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", - adm_ctx.minor, adm_ctx.resource->name, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists in different resource"); + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); return ERR_INVALID_REQUEST; } - if (adm_ctx.device && - adm_ctx.volume != VOLUME_UNSPECIFIED && - adm_ctx.volume != adm_ctx.device->vnr) { + if (adm_ctx->device && + adm_ctx->volume != VOLUME_UNSPECIFIED && + adm_ctx->volume != adm_ctx->device->vnr) { pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", - adm_ctx.minor, adm_ctx.volume, - adm_ctx.device->vnr, - adm_ctx.device->resource->name); - drbd_msg_put_info("minor exists as different volume"); + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); return ERR_INVALID_REQUEST; } + /* still, provide adm_ctx->resource always, if possible. */ + if (!adm_ctx->resource) { + adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource + : adm_ctx->connection ? adm_ctx->connection->resource : NULL; + if (adm_ctx->resource) + kref_get(&adm_ctx->resource->kref); + } + return NO_ERROR; fail: - nlmsg_free(adm_ctx.reply_skb); - adm_ctx.reply_skb = NULL; + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } -static int drbd_adm_finish(struct genl_info *info, int retcode) +static int drbd_adm_finish(struct drbd_config_context *adm_ctx, + struct genl_info *info, int retcode) { - if (adm_ctx.connection) { - kref_put(&adm_ctx.connection->kref, drbd_destroy_connection); - adm_ctx.connection = NULL; + if (adm_ctx->device) { + kref_put(&adm_ctx->device->kref, drbd_destroy_device); + adm_ctx->device = NULL; } - if (adm_ctx.resource) { - kref_put(&adm_ctx.resource->kref, drbd_destroy_resource); - adm_ctx.resource = NULL; + if (adm_ctx->connection) { + kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection); + adm_ctx->connection = NULL; + } + if (adm_ctx->resource) { + kref_put(&adm_ctx->resource->kref, drbd_destroy_resource); + adm_ctx->resource = NULL; } - if (!adm_ctx.reply_skb) + if (!adm_ctx->reply_skb) return -ENOMEM; - adm_ctx.reply_dh->ret_code = retcode; - drbd_adm_send_reply(adm_ctx.reply_skb, info); + adm_ctx->reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx->reply_skb, info); return 0; } @@ -426,6 +422,14 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec } rcu_read_unlock(); + if (fp == FP_NOT_AVAIL) { + /* IO Suspending works on the whole resource. + Do it only for one device. */ + vnr = 0; + peer_device = idr_get_next(&connection->peer_devices, &vnr); + drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); + } + return fp; } @@ -438,12 +442,13 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) char *ex_to_string; int r; + spin_lock_irq(&connection->resource->req_lock); if (connection->cstate >= C_WF_REPORT_PARAMS) { drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); + spin_unlock_irq(&connection->resource->req_lock); return false; } - spin_lock_irq(&connection->resource->req_lock); connect_cnt = connection->connect_cnt; spin_unlock_irq(&connection->resource->req_lock); @@ -654,11 +659,11 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) put_ldev(device); } } else { - mutex_lock(&device->resource->conf_update); + /* Called from drbd_adm_set_role only. + * We are still holding the conf_update mutex. */ nc = first_peer_device(device)->connection->net_conf; if (nc) nc->discard_my_data = 0; /* without copy; single bit op is atomic */ - mutex_unlock(&device->resource->conf_update); set_disk_ro(device->vdisk, false); if (get_ldev(device)) { @@ -700,11 +705,12 @@ static const char *from_attrs_err_to_txt(int err) int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct set_role_parms parms; int err; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -715,17 +721,22 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) err = set_role_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + genl_unlock(); + mutex_lock(&adm_ctx.resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); else retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + + mutex_unlock(&adm_ctx.resource->adm_mutex); + genl_lock(); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1104,15 +1115,18 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ struct request_queue * const q = device->rq_queue; unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; + struct request_queue *b = NULL; if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + b = device->ldev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; rcu_read_unlock(); - put_ldev(device); + + blk_set_stacking_limits(&q->limits); + blk_queue_max_write_same_sectors(q, 0); } blk_queue_logical_block_size(q, 512); @@ -1121,8 +1135,25 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); - if (get_ldev_if_state(device, D_ATTACHING)) { - struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; + if (b) { + struct drbd_connection *connection = first_peer_device(device)->connection; + + if (blk_queue_discard(b) && + (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { + /* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ + q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + /* REALLY? Is stacking secdiscard "legal"? */ + if (blk_queue_secdiscard(b)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + } else { + q->limits.max_discard_sectors = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + } blk_queue_stack_limits(q, b); @@ -1164,8 +1195,14 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device) peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ else peer = DRBD_MAX_BIO_SIZE; - } + /* We may later detach and re-attach on a disconnected Primary. + * Avoid this setting to jump back in that case. + * We want to store what we know the peer DRBD can handle, + * not what the peer IO backend can handle. */ + if (peer > device->peer_max_bio_size) + device->peer_max_bio_size = peer; + } new = min(local, peer); if (device->state.role == R_PRIMARY && new < now) @@ -1258,19 +1295,21 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_device *device; struct disk_conf *new_disk_conf, *old_disk_conf; struct fifo_buffer *old_plan = NULL, *new_plan = NULL; int err, fifo_size; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); /* we also need a disk * to change the options on */ @@ -1294,7 +1333,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs_for_change(new_disk_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_unlock; } @@ -1385,12 +1424,15 @@ fail_unlock: success: put_ldev(device); out: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int err; enum drbd_ret_code retcode; @@ -1406,13 +1448,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) enum drbd_state_rv rv; struct net_conf *nc; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto finish; device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); conn_reconfig_start(first_peer_device(device)->connection); /* if you want to reconfigure, please tear down first */ @@ -1455,7 +1498,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs(new_disk_conf, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -1619,7 +1662,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } if (device->state.conn < C_CONNECTED && - device->state.role == R_PRIMARY && + device->state.role == R_PRIMARY && device->ed_uuid && (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { drbd_err(device, "Can only attach to data with current UUID=%016llX\n", (unsigned long long)device->ed_uuid); @@ -1797,7 +1840,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); put_ldev(device); conn_reconfig_done(first_peer_device(device)->connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; force_diskless_dec: @@ -1819,9 +1863,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kfree(new_disk_conf); lc_destroy(resync_lru); kfree(new_plan); - + mutex_unlock(&adm_ctx.resource->adm_mutex); finish: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -1860,11 +1904,12 @@ out: * Only then we have finally detached. */ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct detach_parms parms = { }; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -1874,14 +1919,16 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) err = detach_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_detach(adm_ctx.device, parms.force_detach); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2055,6 +2102,7 @@ static void free_crypto(struct crypto *crypto) int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct drbd_connection *connection; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2063,13 +2111,14 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) int rsr; /* re-sync running */ struct crypto crypto = { }; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; connection = adm_ctx.connection; + mutex_lock(&adm_ctx.resource->adm_mutex); new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); if (!new_net_conf) { @@ -2084,7 +2133,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) old_net_conf = connection->net_conf; if (!old_net_conf) { - drbd_msg_put_info("net conf missing, try connect"); + drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); retcode = ERR_INVALID_REQUEST; goto fail; } @@ -2096,7 +2145,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs_for_change(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2167,12 +2216,15 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) done: conn_reconfig_done(connection); out: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; struct crypto crypto = { }; @@ -2182,14 +2234,14 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) int i; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { - drbd_msg_put_info("connection endpoint(s) missing"); + drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -2215,6 +2267,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } } + mutex_lock(&adm_ctx.resource->adm_mutex); connection = first_connection(adm_ctx.resource); conn_reconfig_start(connection); @@ -2235,7 +2288,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2284,7 +2337,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail: @@ -2292,8 +2346,9 @@ fail: kfree(new_net_conf); conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2356,13 +2411,14 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disconnect_parms parms; struct drbd_connection *connection; enum drbd_state_rv rv; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2374,18 +2430,20 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) err = disconnect_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } } + mutex_lock(&adm_ctx.resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); if (rv < SS_SUCCESS) retcode = rv; /* FIXME: Type mismatch. */ else retcode = NO_ERROR; + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2407,6 +2465,7 @@ void resync_after_online_grow(struct drbd_device *device) int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct disk_conf *old_disk_conf, *new_disk_conf = NULL; struct resize_parms rs; struct drbd_device *device; @@ -2417,12 +2476,13 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) sector_t u_size; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto fail; + goto finish; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; @@ -2436,7 +2496,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) err = resize_parms_from_attrs(&rs, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail_ldev; } } @@ -2482,7 +2542,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) goto fail_ldev; } - if (device->state.conn != C_CONNECTED) { + if (device->state.conn != C_CONNECTED && !rs.resize_force) { retcode = ERR_MD_LAYOUT_CONNECTED; goto fail_ldev; } @@ -2528,7 +2588,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } fail: - drbd_adm_finish(info, retcode); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; fail_ldev: @@ -2538,11 +2600,12 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2555,33 +2618,37 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto fail; } + mutex_lock(&adm_ctx.resource->adm_mutex); err = set_resource_options(adm_ctx.resource, &res_opts); if (err) { retcode = ERR_INVALID_REQUEST; if (err == -ENOMEM) retcode = ERR_NOMEM; } + mutex_unlock(&adm_ctx.resource->adm_mutex); fail: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2605,26 +2672,29 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, union drbd_state mask, union drbd_state val) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_request_state(adm_ctx.device, mask, val); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2639,15 +2709,17 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device) int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; int retcode; /* drbd_ret_code, drbd_state_rv */ struct drbd_device *device; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; /* If there is still bitmap IO pending, probably because of a previous @@ -2674,40 +2746,45 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) retcode = ERR_PAUSE_IS_SET; + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; union drbd_dev_state s; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { s = adm_ctx.device->state; if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { @@ -2717,9 +2794,9 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) retcode = ERR_PAUSE_IS_CLEAR; } } - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2730,15 +2807,17 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { drbd_uuid_new_current(device); @@ -2753,9 +2832,9 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); } drbd_resume_io(device); - + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -2931,10 +3010,11 @@ nla_put_failure: int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -2946,7 +3026,7 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3133,11 +3213,12 @@ dump: int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct timeout_parms tp; int err; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3154,17 +3235,18 @@ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) return err; } out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; struct start_ov_parms parms; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3179,10 +3261,12 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) int err = start_ov_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } } + mutex_lock(&adm_ctx.resource->adm_mutex); + /* w_make_ov_request expects position to be aligned */ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); device->ov_stop_sector = parms.ov_stop_sector; @@ -3193,21 +3277,24 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); drbd_resume_io(device); + + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_device *device; enum drbd_ret_code retcode; int skip_initial_sync = 0; int err; struct new_c_uuid_parms args; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3219,11 +3306,12 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) err = new_c_uuid_parms_from_attrs(&args, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out_nolock; } } + mutex_lock(&adm_ctx.resource->adm_mutex); mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ if (!get_ldev(device)) { @@ -3268,22 +3356,24 @@ out_dec: put_ldev(device); out: mutex_unlock(device->state_mutex); + mutex_unlock(&adm_ctx.resource->adm_mutex); out_nolock: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } static enum drbd_ret_code -drbd_check_resource_name(const char *name) +drbd_check_resource_name(struct drbd_config_context *adm_ctx) { + const char *name = adm_ctx->resource_name; if (!name || !name[0]) { - drbd_msg_put_info("resource name missing"); + drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing"); return ERR_MANDATORY_TAG; } /* if we want to use these in sysfs/configfs/debugfs some day, * we must not allow slashes */ if (strchr(name, '/')) { - drbd_msg_put_info("invalid resource name"); + drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name"); return ERR_INVALID_REQUEST; } return NO_ERROR; @@ -3291,11 +3381,12 @@ drbd_check_resource_name(const char *name) int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(skb, info, 0); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) @@ -3305,48 +3396,50 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); goto out; } - retcode = drbd_check_resource_name(adm_ctx.resource_name); + retcode = drbd_check_resource_name(&adm_ctx); if (retcode != NO_ERROR) goto out; if (adm_ctx.resource) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { retcode = ERR_INVALID_REQUEST; - drbd_msg_put_info("resource exists"); + drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); } /* else: still NO_ERROR */ goto out; } + /* not yet safe for genl_family.parallel_ops */ if (!conn_create(adm_ctx.resource_name, &res_opts)) retcode = ERR_NOMEM; out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_genlmsghdr *dh = info->userhdr; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; if (dh->minor > MINORMASK) { - drbd_msg_put_info("requested minor out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); retcode = ERR_INVALID_REQUEST; goto out; } if (adm_ctx.volume > DRBD_VOLUME_MAX) { - drbd_msg_put_info("requested volume id out of range"); + drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -3360,9 +3453,11 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) goto out; } - retcode = drbd_create_device(adm_ctx.resource, dh->minor, adm_ctx.volume); + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } @@ -3383,35 +3478,40 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device) int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) goto out; + mutex_lock(&adm_ctx.resource->adm_mutex); retcode = adm_del_minor(adm_ctx.device); + mutex_unlock(&adm_ctx.resource->adm_mutex); out: - drbd_adm_finish(info, retcode); + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ unsigned i; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); /* demote */ for_each_connection(connection, resource) { struct drbd_peer_device *peer_device; @@ -3419,14 +3519,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&connection->peer_devices, peer_device, i) { retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to demote"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); goto out; } } retcode = conn_try_disconnect(connection, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info("failed to disconnect"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); goto out; } } @@ -3435,7 +3535,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&resource->devices, device, i) { retcode = adm_detach(device, 0); if (retcode < SS_SUCCESS || retcode > NO_ERROR) { - drbd_msg_put_info("failed to detach"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); goto out; } } @@ -3453,7 +3553,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) retcode = adm_del_minor(device); if (retcode != NO_ERROR) { /* "can not happen" */ - drbd_msg_put_info("failed to delete volume"); + drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); goto out; } } @@ -3462,25 +3562,28 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) synchronize_rcu(); drbd_free_resource(resource); retcode = NO_ERROR; - out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context adm_ctx; struct drbd_resource *resource; struct drbd_connection *connection; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); if (!adm_ctx.reply_skb) return retcode; if (retcode != NO_ERROR) - goto out; + goto finish; resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); for_each_connection(connection, resource) { if (connection->cstate > C_STANDALONE) { retcode = ERR_NET_CONFIGURED; @@ -3499,7 +3602,9 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) drbd_free_resource(resource); retcode = NO_ERROR; out: - drbd_adm_finish(info, retcode); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); return 0; } diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c index fa672b6df8d..b2d4791498a 100644 --- a/drivers/block/drbd/drbd_nla.c +++ b/drivers/block/drbd/drbd_nla.c @@ -1,4 +1,3 @@ -#include "drbd_wrappers.h" #include <linux/kernel.h> #include <net/netlink.h> #include <linux/drbd_genl_api.h> diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2f26e8ffa45..89736bdbbc7 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -116,7 +116,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se /* ------------------------ ~18s average ------------------------ */ i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; dt = (jiffies - device->rs_mark_time[i]) / HZ; - if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) + if (dt > 180) stalled = 1; if (!dt) diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 3c04ec0ea33..2da9104a385 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -54,6 +54,11 @@ enum drbd_packet { P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + /* 0x2e to 0x30 reserved, used in drbd 9 */ + + /* REQ_DISCARD. We used "discard" in different contexts before, + * which is why I chose TRIM here, to disambiguate. */ + P_TRIM = 0x31, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -119,6 +124,11 @@ struct p_data { u32 dp_flags; } __packed; +struct p_trim { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + /* * commands which share a struct: * p_block_ack: @@ -150,6 +160,8 @@ struct p_block_req { * ReportParams */ +#define FF_TRIM 1 + struct p_connection_features { u32 protocol_min; u32 feature_flags; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 68e3992e883..b6c8aaf4931 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -46,9 +46,10 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" - #include "drbd_vli.h" +#define PRO_FEATURES (FF_TRIM) + struct packet_info { enum drbd_packet cmd; unsigned int size; @@ -65,7 +66,7 @@ enum finish_epoch { static int drbd_do_features(struct drbd_connection *connection); static int drbd_do_auth(struct drbd_connection *connection); static int drbd_disconnected(struct drbd_peer_device *); - +static void conn_wait_active_ee_empty(struct drbd_connection *connection); static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); static int e_end_block(struct drbd_work *, int); @@ -234,9 +235,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) * @retry: whether to retry, if not enough pages are available right now * * Tries to allocate number pages, first from our own page pool, then from - * the kernel, unless this allocation would exceed the max_buffers setting. + * the kernel. * Possibly retry until DRBD frees sufficient pages somewhere else. * + * If this allocation would exceed the max_buffers setting, we throttle + * allocation (schedule_timeout) to give the system some room to breathe. + * + * We do not use max-buffers as hard limit, because it could lead to + * congestion and further to a distributed deadlock during online-verify or + * (checksum based) resync, if the max-buffers, socket buffer sizes and + * resync-rate settings are mis-configured. + * * Returns a page chain linked via page->private. */ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, @@ -246,10 +255,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int struct page *page = NULL; struct net_conf *nc; DEFINE_WAIT(wait); - int mxb; + unsigned int mxb; - /* Yes, we may run up to @number over max_buffers. If we - * follow it strictly, the admin will get it wrong anyways. */ rcu_read_lock(); nc = rcu_dereference(peer_device->connection->net_conf); mxb = nc ? nc->max_buffers : 1000000; @@ -277,7 +284,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int break; } - schedule(); + if (schedule_timeout(HZ/10) == 0) + mxb = UINT_MAX; } finish_wait(&drbd_pp_wait, &wait); @@ -331,7 +339,7 @@ You must not have the req_lock: struct drbd_peer_request * drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - unsigned int data_size, gfp_t gfp_mask) __must_hold(local) + unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; @@ -348,7 +356,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } - if (data_size) { + if (has_payload && data_size) { page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); if (!page) goto fail; @@ -1026,24 +1034,27 @@ randomize: if (drbd_send_protocol(connection) == -EOPNOTSUPP) return -1; + /* Prevent a race between resync-handshake and + * being promoted to Primary. + * + * Grab and release the state mutex, so we know that any current + * drbd_set_role() is finished, and any incoming drbd_set_role + * will see the STATE_SENT flag, and wait for it to be cleared. + */ + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_lock(peer_device->device->state_mutex); + set_bit(STATE_SENT, &connection->flags); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_unlock(peer_device->device->state_mutex); + rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; kref_get(&device->kref); rcu_read_unlock(); - /* Prevent a race between resync-handshake and - * being promoted to Primary. - * - * Grab and release the state mutex, so we know that any current - * drbd_set_role() is finished, and any incoming drbd_set_role - * will see the STATE_SENT flag, and wait for it to be cleared. - */ - mutex_lock(device->state_mutex); - mutex_unlock(device->state_mutex); - if (discard_my_data) set_bit(DISCARD_MY_DATA, &device->flags); else @@ -1315,6 +1326,20 @@ int drbd_submit_peer_request(struct drbd_device *device, unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; + if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* wait for all pending IO completions, before we start + * zeroing things out. */ + conn_wait_active_ee_empty(first_peer_device(device)->connection); + if (blkdev_issue_zeroout(device->ldev->backing_bdev, + sector, ds >> 9, GFP_NOIO)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); + return 0; + } + + if (peer_req->flags & EE_IS_TRIM) + nr_pages = 0; /* discards don't have any payload. */ + /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the @@ -1326,7 +1351,7 @@ int drbd_submit_peer_request(struct drbd_device *device, next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { - drbd_err(device, "submit_ee: Allocation of a bio failed\n"); + drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); goto fail; } /* > peer_req->i.sector, unless this is the first bio */ @@ -1340,6 +1365,11 @@ next_bio: bios = bio; ++n_bios; + if (rw & REQ_DISCARD) { + bio->bi_iter.bi_size = ds; + goto submit; + } + page_chain_for_each(page) { unsigned len = min_t(unsigned, ds, PAGE_SIZE); if (!bio_add_page(bio, page, len, 0)) { @@ -1360,8 +1390,9 @@ next_bio: sector += len >> 9; --nr_pages; } - D_ASSERT(device, page == NULL); D_ASSERT(device, ds == 0); +submit: + D_ASSERT(device, page == NULL); atomic_set(&peer_req->pending_bios, n_bios); do { @@ -1490,19 +1521,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * and from receive_Data */ static struct drbd_peer_request * read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - int data_size) __must_hold(local) + struct packet_info *pi) __must_hold(local) { struct drbd_device *device = peer_device->device; const sector_t capacity = drbd_get_capacity(device->this_bdev); struct drbd_peer_request *peer_req; struct page *page; int dgs, ds, err; + int data_size = pi->size; void *dig_in = peer_device->connection->int_dig_in; void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; + struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; dgs = 0; - if (peer_device->connection->peer_integrity_tfm) { + if (!trim && peer_device->connection->peer_integrity_tfm) { dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); /* * FIXME: Receive the incoming digest into the receive buffer @@ -1514,9 +1547,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= dgs; } + if (trim) { + D_ASSERT(peer_device, data_size == 0); + data_size = be32_to_cpu(trim->size); + } + if (!expect(IS_ALIGNED(data_size, 512))) return NULL; - if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) + /* prepare for larger trim requests. */ + if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, @@ -1532,11 +1571,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); if (!peer_req) return NULL; - if (!data_size) + if (trim) return peer_req; ds = data_size; @@ -1676,12 +1715,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused) } static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, - int data_size) __releases(local) + struct packet_info *pi) __releases(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; - peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size); + peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); if (!peer_req) goto fail; @@ -1697,7 +1736,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto list_add(&peer_req->w.list, &device->sync_ee); spin_unlock_irq(&device->resource->req_lock); - atomic_add(data_size >> 9, &device->rs_sect_ev); + atomic_add(pi->size >> 9, &device->rs_sect_ev); if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) return 0; @@ -1785,7 +1824,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet /* data is submitted to disk within recv_resync_read. * corresponding put_ldev done below on error, * or in drbd_peer_request_endio. */ - err = recv_resync_read(peer_device, sector, pi->size); + err = recv_resync_read(peer_device, sector, pi); } else { if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "Can not write resync data to local disk.\n"); @@ -2196,7 +2235,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * */ sector = be64_to_cpu(p->sector); - peer_req = read_in_block(peer_device, p->block_id, sector, pi->size); + peer_req = read_in_block(peer_device, p->block_id, sector, pi); if (!peer_req) { put_ldev(device); return -EIO; @@ -2206,7 +2245,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(dp_flags); - if (peer_req->pages == NULL) { + if (pi->cmd == P_TRIM) { + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + peer_req->flags |= EE_IS_TRIM; + if (!blk_queue_discard(q)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); } @@ -2242,7 +2289,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - list_add(&peer_req->w.list, &device->active_ee); + /* if we use the zeroout fallback code, we process synchronously + * and we wait for all pending requests, respectively wait for + * active_ee to become empty in drbd_submit_peer_request(); + * better not add ourselves here. */ + if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + list_add(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (device->state.conn == C_SYNC_TARGET) @@ -2313,39 +2365,45 @@ out_interrupted: * The current sync rate used here uses only the most recent two step marks, * to have a short time average so we can react faster. */ -int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) { - struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; - unsigned long db, dt, dbdt; struct lc_element *tmp; - int curr_events; - int throttle = 0; - unsigned int c_min_rate; - - rcu_read_lock(); - c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; - rcu_read_unlock(); + bool throttle = true; - /* feature disabled? */ - if (c_min_rate == 0) - return 0; + if (!drbd_rs_c_min_rate_throttle(device)) + return false; spin_lock_irq(&device->al_lock); tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); if (tmp) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); - if (test_bit(BME_PRIORITY, &bm_ext->flags)) { - spin_unlock_irq(&device->al_lock); - return 0; - } + if (test_bit(BME_PRIORITY, &bm_ext->flags)) + throttle = false; /* Do not slow down if app IO is already waiting for this extent */ } spin_unlock_irq(&device->al_lock); + return throttle; +} + +bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + unsigned int c_min_rate; + int curr_events; + + rcu_read_lock(); + c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? */ + if (c_min_rate == 0) + return false; + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + (int)part_stat_read(&disk->part0, sectors[1]) - atomic_read(&device->rs_sect_ev); - if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; @@ -2368,12 +2426,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) dbdt = Bit2KB(db/dt); if (dbdt > c_min_rate) - throttle = 1; + return true; } - return throttle; + return false; } - static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) { struct drbd_peer_device *peer_device; @@ -2436,7 +2493,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, + true /* has real payload */, GFP_NOIO); if (!peer_req) { put_ldev(device); return -ENOMEM; @@ -3648,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info put_ldev(device); } + device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + drbd_reconsider_max_bio_size(device); + /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + In case we cleared the QUEUE_FLAG_DISCARD from our queue in + drbd_reconsider_max_bio_size(), we can be sure that after + drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ + ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(device)) { dd = drbd_determine_dev_size(device, ddsf, NULL); @@ -3660,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info drbd_set_my_capacity(device, p_size); } - device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); - drbd_reconsider_max_bio_size(device); - if (get_ldev(device)) { if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); @@ -4423,6 +4485,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, }; static void drbdd(struct drbd_connection *connection) @@ -4630,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection) memset(p, 0, sizeof(*p)); p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + p->feature_flags = cpu_to_be32(PRO_FEATURES); return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); } @@ -4683,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection) goto incompat; connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); + drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", + connection->agreed_features & FF_TRIM ? " " : " not "); + return 1; incompat: @@ -4778,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (pi.size < CHALLENGE_LEN) { + drbd_err(connection, "AuthChallenge payload too small.\n"); + rv = -1; + goto fail; + } + peers_ch = kmalloc(pi.size, GFP_NOIO); if (peers_ch == NULL) { drbd_err(connection, "kmalloc of peers_ch failed\n"); @@ -4791,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection) goto fail; } + if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { + drbd_err(connection, "Peer presented the same challenge!\n"); + rv = -1; + goto fail; + } + resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); response = kmalloc(resp_size, GFP_NOIO); if (response == NULL) { diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3779c8d2875..09803d0d520 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -522,6 +522,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); break; + case DISCARD_COMPLETED_NOTSUPP: + case DISCARD_COMPLETED_WITH_ERROR: + /* I'd rather not detach from local disk just because it + * failed a REQ_DISCARD. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + case QUEUE_FOR_NET_READ: /* READ or READA, and * no local disk, @@ -1235,6 +1242,7 @@ void do_submit(struct work_struct *ws) if (list_empty(&incoming)) break; +skip_fast_path: wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); /* Maybe more was queued, while we prepared the transaction? * Try to stuff them into this transaction as well. @@ -1273,6 +1281,25 @@ void do_submit(struct work_struct *ws) list_del_init(&req->tl_requests); drbd_send_and_submit(device, req); } + + /* If all currently hot activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to cold extents. In that case, prepare one request + * in blocking mode. */ + list_for_each_entry_safe(req, tmp, &incoming, tl_requests) { + list_del_init(&req->tl_requests); + req->rq_state |= RQ_IN_ACT_LOG; + if (!drbd_al_begin_io_prepare(device, &req->i)) { + /* Corresponding extent was hot after all? */ + drbd_send_and_submit(device, req); + } else { + /* Found a request to a cold extent. + * Put on "pending" list, + * and try to cumulate with more. */ + list_add(&req->tl_requests, &pending); + goto skip_fast_path; + } + } } } @@ -1326,23 +1353,35 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct return limit; } -static struct drbd_request *find_oldest_request(struct drbd_connection *connection) +static void find_oldest_requests( + struct drbd_connection *connection, + struct drbd_device *device, + struct drbd_request **oldest_req_waiting_for_peer, + struct drbd_request **oldest_req_waiting_for_disk) { - /* Walk the transfer log, - * and find the oldest not yet completed request */ struct drbd_request *r; + *oldest_req_waiting_for_peer = NULL; + *oldest_req_waiting_for_disk = NULL; list_for_each_entry(r, &connection->transfer_log, tl_requests) { - if (atomic_read(&r->completion_ref)) - return r; + const unsigned s = r->rq_state; + if (!*oldest_req_waiting_for_peer + && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) + *oldest_req_waiting_for_peer = r; + + if (!*oldest_req_waiting_for_disk + && (s & RQ_LOCAL_PENDING) && r->device == device) + *oldest_req_waiting_for_disk = r; + + if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk) + break; } - return NULL; } void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; struct drbd_connection *connection = first_peer_device(device)->connection; - struct drbd_request *req; /* oldest request */ + struct drbd_request *req_disk, *req_peer; /* oldest request */ struct net_conf *nc; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; @@ -1366,8 +1405,8 @@ void request_timer_fn(unsigned long data) now = jiffies; spin_lock_irq(&device->resource->req_lock); - req = find_oldest_request(connection); - if (!req) { + find_oldest_requests(connection, device, &req_peer, &req_disk); + if (req_peer == NULL && req_disk == NULL) { spin_unlock_irq(&device->resource->req_lock); mod_timer(&device->request_timer, now + et); return; @@ -1389,19 +1428,26 @@ void request_timer_fn(unsigned long data) * ~198 days with 250 HZ, we have a window where the timeout would need * to expire twice (worst case) to become effective. Good enough. */ - if (ent && req->rq_state & RQ_NET_PENDING && - time_after(now, req->start_time + ent) && + if (ent && req_peer && + time_after(now, req_peer->start_time + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - if (dt && req->rq_state & RQ_LOCAL_PENDING && req->device == device && - time_after(now, req->start_time + dt) && + if (dt && req_disk && + time_after(now, req_disk->start_time + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); __drbd_chk_io_error(device, DRBD_FORCE_DETACH); } - nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; + + /* Reschedule timer for the nearest not already expired timeout. + * Fallback to now + min(effective network timeout, disk timeout). */ + ent = (ent && req_peer && time_before(now, req_peer->start_time + ent)) + ? req_peer->start_time + ent : now + et; + dt = (dt && req_disk && time_before(now, req_disk->start_time + dt)) + ? req_disk->start_time + dt : now + et; + nt = time_before(ent, dt) ? ent : dt; spin_unlock_irq(&connection->resource->req_lock); mod_timer(&device->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c684c963538..8566cd5866b 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -30,7 +30,6 @@ #include <linux/slab.h> #include <linux/drbd.h> #include "drbd_int.h" -#include "drbd_wrappers.h" /* The request callbacks will be called in irq context by the IDE drivers, and in Softirqs/Tasklets/BH context by the SCSI drivers, @@ -111,11 +110,14 @@ enum drbd_req_event { BARRIER_ACKED, /* in protocol A and B */ DATA_RECEIVED, /* (remote read) */ + COMPLETED_OK, READ_COMPLETED_WITH_ERROR, READ_AHEAD_COMPLETED_WITH_ERROR, WRITE_COMPLETED_WITH_ERROR, + DISCARD_COMPLETED_NOTSUPP, + DISCARD_COMPLETED_WITH_ERROR, + ABORT_DISK_IO, - COMPLETED_OK, RESEND, FAIL_FROZEN_DISK_IO, RESTART_FROZEN_DISK_IO, diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 1a84345a386..a5d8aae00e0 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -54,8 +54,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn); +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); static inline bool is_susp(union drbd_state s) { @@ -287,7 +287,7 @@ _req_st_cond(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv >= SS_SUCCESS) rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ @@ -333,7 +333,7 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask, spin_lock_irqsave(&device->resource->req_lock, flags); os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); rv = is_valid_transition(os, ns); if (rv < SS_SUCCESS) { spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -740,8 +740,8 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st * When we loose connection, we have to set the state of the peers disk (pdsk) * to D_UNKNOWN. This rule and many more along those lines are in this function. */ -static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, - enum sanitize_state_warnings *warn) +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; @@ -882,11 +882,13 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st } if (fp == FP_STONITH && - (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && - (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { @@ -958,7 +960,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, os = drbd_read_state(device); - ns = sanitize_state(device, ns, &ssw); + ns = sanitize_state(device, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1656,7 +1658,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; os = drbd_read_state(device); - ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1718,7 +1720,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union number_of_volumes++; os = drbd_read_state(device); ns = apply_mask_val(os, mask, val); - ns = sanitize_state(device, ns, NULL); + ns = sanitize_state(device, os, ns, NULL); if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; @@ -1763,19 +1765,19 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union static enum drbd_state_rv _conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) { - enum drbd_state_rv rv; + enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) - return SS_CW_SUCCESS; + rv = SS_CW_SUCCESS; if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) - return SS_CW_FAILED_BY_PEER; + rv = SS_CW_FAILED_BY_PEER; - rv = conn_is_valid_transition(connection, mask, val, 0); - if (rv == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) - rv = SS_UNKNOWN_ERROR; /* continue waiting */ + err = conn_is_valid_transition(connection, mask, val, 0); + if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) + return rv; - return rv; + return err; } enum drbd_state_rv diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 2c4ce42c365..d8f57b6305c 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver, final stage. */ -static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) { unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; @@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); - if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + /* FIXME do we want to detach for failed REQ_DISCARD? + * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); spin_unlock_irqrestore(&device->resource->req_lock, flags); @@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error) struct drbd_device *device = peer_req->peer_device->device; int uptodate = bio_flagged(bio, BIO_UPTODATE); int is_write = bio_data_dir(bio) == WRITE; + int is_discard = !!(bio->bi_rw & REQ_DISCARD); if (error && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", - is_write ? "write" : "read", error, + is_write ? (is_discard ? "discard" : "write") + : "read", error, (unsigned long long)peer_req->i.sector); if (!error && !uptodate) { if (__ratelimit(&drbd_ratelimit_state)) @@ -263,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error) /* to avoid recursion in __req_mod */ if (unlikely(error)) { - what = (bio_data_dir(bio) == WRITE) + if (bio->bi_rw & REQ_DISCARD) + what = (error == -EOPNOTSUPP) + ? DISCARD_COMPLETED_NOTSUPP + : DISCARD_COMPLETED_WITH_ERROR; + else + what = (bio_data_dir(bio) == WRITE) ? WRITE_COMPLETED_WITH_ERROR : (bio_rw(bio) == READ) ? READ_COMPLETED_WITH_ERROR @@ -395,7 +404,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, - size, GFP_TRY); + size, true /* has real payload */, GFP_TRY); if (!peer_req) goto defer; @@ -492,10 +501,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size) return fb; } -static int drbd_rs_controller(struct drbd_device *device) +static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) { struct disk_conf *dc; - unsigned int sect_in; /* Number of sectors that came in since the last turn */ unsigned int want; /* The number of sectors we want in the proxy */ int req_sect; /* Number of sectors to request in this turn */ int correction; /* Number of sectors more we need in the proxy*/ @@ -505,9 +513,6 @@ static int drbd_rs_controller(struct drbd_device *device) int max_sect; struct fifo_buffer *plan; - sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */ - device->rs_in_flight -= sect_in; - dc = rcu_dereference(device->ldev->disk_conf); plan = rcu_dereference(device->rs_plan_s); @@ -550,11 +555,16 @@ static int drbd_rs_controller(struct drbd_device *device) static int drbd_rs_number_requests(struct drbd_device *device) { - int number; + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + int number, mxb; + + sect_in = atomic_xchg(&device->rs_sect_in, 0); + device->rs_in_flight -= sect_in; rcu_read_lock(); + mxb = drbd_get_max_buffers(device) / 2; if (rcu_dereference(device->rs_plan_s)->size) { - number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9); + number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; } else { device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; @@ -562,8 +572,14 @@ static int drbd_rs_number_requests(struct drbd_device *device) } rcu_read_unlock(); - /* ignore the amount of pending requests, the resync controller should - * throttle down to incoming reply rate soon enough anyways. */ + /* Don't have more than "max-buffers"/2 in-flight. + * Otherwise we may cause the remote site to stall on drbd_alloc_pages(), + * potentially causing a distributed deadlock on congestion during + * online-verify or (checksum-based) resync, if max-buffers, + * socket buffer sizes and resync rate settings are mis-configured. */ + if (mxb - device->rs_in_flight < number) + number = mxb - device->rs_in_flight; + return number; } @@ -597,7 +613,7 @@ static int make_resync_request(struct drbd_device *device, int cancel) max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; number = drbd_rs_number_requests(device); - if (number == 0) + if (number <= 0) goto requeue; for (i = 0; i < number; i++) { @@ -647,7 +663,7 @@ next_sector: */ align = 1; rollback_i = i; - for (;;) { + while (i < number) { if (size + BM_BLOCK_SIZE > max_bio_size) break; @@ -1670,11 +1686,15 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } clear_bit(B_RS_H_DONE, &device->flags); - write_lock_irq(&global_state_lock); + /* req_lock: serialize with drbd_send_and_submit() and others + * global_state_lock: for stable sync-after dependencies */ + spin_lock_irq(&device->resource->req_lock); + write_lock(&global_state_lock); /* Did some connection breakage or IO error race with us? */ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); mutex_unlock(device->state_mutex); return; } @@ -1714,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } _drbd_pause_after(device); } - write_unlock_irq(&global_state_lock); + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); if (r == SS_SUCCESS) { /* reset rs_last_bcast when a resync or verify is started, @@ -1778,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) mutex_unlock(device->state_mutex); } -/* If the resource already closed the current epoch, but we did not - * (because we have not yet seen new requests), we should send the - * corresponding barrier now. Must be checked within the same spinlock - * that is used to check for new requests. */ -static bool need_to_send_barrier(struct drbd_connection *connection) -{ - if (!connection->send.seen_any_write_yet) - return false; - - /* Skip barriers that do not contain any writes. - * This may happen during AHEAD mode. */ - if (!connection->send.current_epoch_writes) - return false; - - /* ->req_lock is held when requests are queued on - * connection->sender_work, and put into ->transfer_log. - * It is also held when ->current_tle_nr is increased. - * So either there are already new requests queued, - * and corresponding barriers will be send there. - * Or nothing new is queued yet, so the difference will be 1. - */ - if (atomic_read(&connection->current_tle_nr) != - connection->send.current_epoch_nr + 1) - return false; - - return true; -} - static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) { spin_lock_irq(&queue->q_lock); @@ -1864,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * spin_unlock_irq(&connection->resource->req_lock); break; } - send_barrier = need_to_send_barrier(connection); + + /* We found nothing new to do, no to-be-communicated request, + * no other work item. We may still need to close the last + * epoch. Next incoming request epoch will be connection -> + * current transfer log epoch number. If that is different + * from the epoch of the last request we communicated, it is + * safe to send the epoch separating barrier now. + */ + send_barrier = + atomic_read(&connection->current_tle_nr) != + connection->send.current_epoch_nr; spin_unlock_irq(&connection->resource->req_lock); - if (send_barrier) { - drbd_send_barrier(connection); - connection->send.current_epoch_nr++; - } + + if (send_barrier) + maybe_send_barrier(connection, + connection->send.current_epoch_nr + 1); schedule(); /* may be woken up for other things but new work, too, * e.g. if the current epoch got closed. diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h deleted file mode 100644 index 3db9ebaf64f..00000000000 --- a/drivers/block/drbd/drbd_wrappers.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _DRBD_WRAPPERS_H -#define _DRBD_WRAPPERS_H - -#include <linux/ctype.h> -#include <linux/mm.h> -#include "drbd_int.h" - -/* see get_sb_bdev and bd_claim */ -extern char *drbd_sec_holder; - -/* sets the number of 512 byte sectors of our virtual device */ -static inline void drbd_set_my_capacity(struct drbd_device *device, - sector_t size) -{ - /* set_capacity(device->this_bdev->bd_disk, size); */ - set_capacity(device->vdisk, size); - device->this_bdev->bd_inode->i_size = (loff_t)size << 9; -} - -#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) - -/* bi_end_io handlers */ -extern void drbd_md_io_complete(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); - -/* - * used to submit our private bio - */ -static inline void drbd_generic_make_request(struct drbd_device *device, - int fault_type, struct bio *bio) -{ - __release(local); - if (!bio->bi_bdev) { - printk(KERN_ERR "drbd%d: drbd_generic_make_request: " - "bio->bi_bdev == NULL\n", - device_to_minor(device)); - dump_stack(); - bio_endio(bio, -ENODEV); - return; - } - - if (drbd_insert_fault(device, fault_type)) - bio_endio(bio, -EIO); - else - generic_make_request(bio); -} - -#ifndef __CHECKER__ -# undef __cond_lock -# define __cond_lock(x,c) (c) -#endif - -#endif diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 8f5565bf34c..677db049f55 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2351,7 +2351,7 @@ static void rw_interrupt(void) } if (CT(COMMAND) != FD_READ || - raw_cmd->kernel_data == current_req->buffer) { + raw_cmd->kernel_data == bio_data(current_req->bio)) { /* transfer directly from buffer */ cont->done(1); } else if (CT(COMMAND) == FD_READ) { @@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void) raw_cmd->flags &= ~FD_RAW_WRITE; raw_cmd->flags |= FD_RAW_READ; COMMAND = FM_MODE(_floppy, FD_READ); - } else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) { + } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) { unsigned long dma_limit; int direct, indirect; @@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void) */ max_size = buffer_chain_size(); dma_limit = (MAX_DMA_ADDRESS - - ((unsigned long)current_req->buffer)) >> 9; + ((unsigned long)bio_data(current_req->bio))) >> 9; if ((unsigned long)max_size > dma_limit) max_size = dma_limit; /* 64 kb boundaries */ - if (CROSS_64KB(current_req->buffer, max_size << 9)) + if (CROSS_64KB(bio_data(current_req->bio), max_size << 9)) max_size = (K_64 - - ((unsigned long)current_req->buffer) % + ((unsigned long)bio_data(current_req->bio)) % K_64) >> 9; direct = transfer_size(ssize, max_sector, max_size) - fsector_t; /* @@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void) (DP->read_track & (1 << DRS->probed_format)))))) { max_size = blk_rq_sectors(current_req); } else { - raw_cmd->kernel_data = current_req->buffer; + raw_cmd->kernel_data = bio_data(current_req->bio); raw_cmd->length = current_count_sectors << 9; if (raw_cmd->length == 0) { DPRINT("%s: zero dma transfer attempted\n", __func__); @@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void) raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; raw_cmd->length <<= 9; if ((raw_cmd->length < current_count_sectors << 9) || - (raw_cmd->kernel_data != current_req->buffer && + (raw_cmd->kernel_data != bio_data(current_req->bio) && CT(COMMAND) == FD_WRITE && (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || aligned_sector_t < buffer_min)) || @@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void) raw_cmd->length <= 0 || current_count_sectors <= 0) { DPRINT("fractionary current count b=%lx s=%lx\n", raw_cmd->length, current_count_sectors); - if (raw_cmd->kernel_data != current_req->buffer) + if (raw_cmd->kernel_data != bio_data(current_req->bio)) pr_info("addr=%d, length=%ld\n", (int)((raw_cmd->kernel_data - floppy_track_buffer) >> 9), @@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void) return 0; } - if (raw_cmd->kernel_data != current_req->buffer) { + if (raw_cmd->kernel_data != bio_data(current_req->bio)) { if (raw_cmd->kernel_data < floppy_track_buffer || current_count_sectors < 0 || raw_cmd->length < 0 || @@ -3067,7 +3067,10 @@ static int raw_cmd_copyout(int cmd, void __user *param, int ret; while (ptr) { - ret = copy_to_user(param, ptr, sizeof(*ptr)); + struct floppy_raw_cmd cmd = *ptr; + cmd.next = NULL; + cmd.kernel_data = NULL; + ret = copy_to_user(param, &cmd, sizeof(cmd)); if (ret) return -EFAULT; param += sizeof(struct floppy_raw_cmd); @@ -3121,10 +3124,11 @@ loop: return -ENOMEM; *rcmd = ptr; ret = copy_from_user(ptr, param, sizeof(*ptr)); - if (ret) - return -EFAULT; ptr->next = NULL; ptr->buffer_length = 0; + ptr->kernel_data = NULL; + if (ret) + return -EFAULT; param += sizeof(struct floppy_raw_cmd); if (ptr->cmd_count > 33) /* the command may now also take up the space @@ -3140,7 +3144,6 @@ loop: for (i = 0; i < 16; i++) ptr->reply[i] = 0; ptr->resultcode = 0; - ptr->kernel_data = NULL; if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) { if (ptr->length <= 0) @@ -3809,7 +3812,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) bio.bi_iter.bi_size = size; bio.bi_bdev = bdev; bio.bi_iter.bi_sector = 0; - bio.bi_flags = (1 << BIO_QUIET); + bio.bi_flags |= (1 << BIO_QUIET); bio.bi_private = &cbdata; bio.bi_end_io = floppy_rb0_cb; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index bf397bf108b..8a290c08262 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -464,11 +464,11 @@ static void read_intr(void) ok_to_read: req = hd_req; - insw(HD_DATA, req->buffer, 256); + insw(HD_DATA, bio_data(req->bio), 256); #ifdef DEBUG printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", req->rq_disk->disk_name, blk_rq_pos(req) + 1, - blk_rq_sectors(req) - 1, req->buffer+512); + blk_rq_sectors(req) - 1, bio_data(req->bio)+512); #endif if (hd_end_request(0, 512)) { SET_HANDLER(&read_intr); @@ -505,7 +505,7 @@ static void write_intr(void) ok_to_write: if (hd_end_request(0, 512)) { SET_HANDLER(&write_intr); - outsw(HD_DATA, req->buffer, 256); + outsw(HD_DATA, bio_data(req->bio), 256); return; } @@ -624,7 +624,7 @@ repeat: printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", req->rq_disk->disk_name, req_data_dir(req) == READ ? "read" : "writ", - cyl, head, sec, nsect, req->buffer); + cyl, head, sec, nsect, bio_data(req->bio)); #endif if (req->cmd_type == REQ_TYPE_FS) { switch (rq_data_dir(req)) { @@ -643,7 +643,7 @@ repeat: bad_rw_intr(); goto repeat; } - outsw(HD_DATA, req->buffer, 256); + outsw(HD_DATA, bio_data(req->bio), 256); break; default: printk("unknown hd-command\n"); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f70a230a294..6cb1beb47c2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -548,7 +548,7 @@ static int loop_thread(void *data) struct loop_device *lo = data; struct bio *bio; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index eb59b124136..e352cac707e 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host, static void mg_read_one(struct mg_host *host, struct request *req) { - u16 *buff = (u16 *)req->buffer; + u16 *buff = (u16 *)bio_data(req->bio); u32 i; for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) @@ -496,7 +496,7 @@ static void mg_read(struct request *req) mg_bad_rw_intr(host); MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - blk_rq_sectors(req), blk_rq_pos(req), req->buffer); + blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio)); do { if (mg_wait(host, ATA_DRQ, @@ -514,7 +514,7 @@ static void mg_read(struct request *req) static void mg_write_one(struct mg_host *host, struct request *req) { - u16 *buff = (u16 *)req->buffer; + u16 *buff = (u16 *)bio_data(req->bio); u32 i; for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) @@ -534,7 +534,7 @@ static void mg_write(struct request *req) } MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - rem, blk_rq_pos(req), req->buffer); + rem, blk_rq_pos(req), bio_data(req->bio)); if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { @@ -585,7 +585,7 @@ ok_to_read: mg_read_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); + blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio)); /* send read confirm */ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); @@ -624,7 +624,7 @@ ok_to_write: /* write 1 sector and set handler if remains */ mg_write_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req), req->buffer); + blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio)); host->mg_do_intr = mg_write_intr; mod_timer(&host->timer, jiffies + 3 * HZ); } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 59c5abe32f0..295f3afbbef 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/genhd.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/bio.h> #include <linux/dma-mapping.h> #include <linux/idr.h> @@ -38,6 +39,7 @@ #include <../drivers/ata/ahci.h> #include <linux/export.h> #include <linux/debugfs.h> +#include <linux/prefetch.h> #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) @@ -173,60 +175,36 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) return false; /* device present */ } -/* - * Obtain an empty command slot. - * - * This function needs to be reentrant since it could be called - * at the same time on multiple CPUs. The allocation of the - * command slot must be atomic. - * - * @port Pointer to the port data structure. - * - * return value - * >= 0 Index of command slot obtained. - * -1 No command slots available. - */ -static int get_slot(struct mtip_port *port) +static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { - int slot, i; - unsigned int num_command_slots = port->dd->slot_groups * 32; + struct request *rq; - /* - * Try 10 times, because there is a small race here. - * that's ok, because it's still cheaper than a lock. - * - * Race: Since this section is not protected by lock, same bit - * could be chosen by different process contexts running in - * different processor. So instead of costly lock, we are going - * with loop. - */ - for (i = 0; i < 10; i++) { - slot = find_next_zero_bit(port->allocated, - num_command_slots, 1); - if ((slot < num_command_slots) && - (!test_and_set_bit(slot, port->allocated))) - return slot; - } - dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + return blk_mq_rq_to_pdu(rq); +} - mtip_check_surprise_removal(port->dd->pdev); - return -1; +static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd) +{ + blk_put_request(blk_mq_rq_from_pdu(cmd)); } /* - * Release a command slot. - * - * @port Pointer to the port data structure. - * @tag Tag of command to release - * - * return value - * None + * Once we add support for one hctx per mtip group, this will change a bit */ -static inline void release_slot(struct mtip_port *port, int tag) +static struct request *mtip_rq_from_tag(struct driver_data *dd, + unsigned int tag) +{ + struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; + + return blk_mq_tag_to_rq(hctx->tags, tag); +} + +static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, + unsigned int tag) { - smp_mb__before_clear_bit(); - clear_bit(tag, port->allocated); - smp_mb__after_clear_bit(); + struct request *rq = mtip_rq_from_tag(dd, tag); + + return blk_mq_rq_to_pdu(rq); } /* @@ -248,93 +226,28 @@ static inline void release_slot(struct mtip_port *port, int tag) * None */ static void mtip_async_complete(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *cmd, int status) { - struct mtip_cmd *cmd; - struct driver_data *dd = data; - int unaligned, cb_status = status ? -EIO : 0; - void (*func)(void *, int); + struct driver_data *dd = port->dd; + struct request *rq; if (unlikely(!dd) || unlikely(!port)) return; - cmd = &port->commands[tag]; - if (unlikely(status == PORT_IRQ_TF_ERR)) { dev_warn(&port->dd->pdev->dev, "Command tag %d failed due to TFE\n", tag); } - /* Clear the active flag */ - atomic_set(&port->commands[tag].active, 0); - - /* Upper layer callback */ - func = cmd->async_callback; - if (likely(func && cmpxchg(&cmd->async_callback, func, 0) == func)) { + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction); - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&dd->pdev->dev, - cmd->sg, - cmd->scatter_ents, - cmd->direction); + rq = mtip_rq_from_tag(dd, tag); - func(cmd->async_data, cb_status); - unaligned = cmd->unaligned; + if (unlikely(cmd->unaligned)) + up(&port->cmd_slot_unal); - /* Clear the allocated bit for the command */ - release_slot(port, tag); - - if (unlikely(unaligned)) - up(&port->cmd_slot_unal); - else - up(&port->cmd_slot); - } -} - -/* - * This function is called for clean the pending command in the - * command slot during the surprise removal of device and return - * error to the upper layer. - * - * @dd Pointer to the DRIVER_DATA structure. - * - * return value - * None - */ -static void mtip_command_cleanup(struct driver_data *dd) -{ - int tag = 0; - struct mtip_cmd *cmd; - struct mtip_port *port = dd->port; - unsigned int num_cmd_slots = dd->slot_groups * 32; - - if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) - return; - - if (!port) - return; - - cmd = &port->commands[MTIP_TAG_INTERNAL]; - if (atomic_read(&cmd->active)) - if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & - (1 << MTIP_TAG_INTERNAL)) - if (cmd->comp_func) - cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, -ENODEV); - - while (1) { - tag = find_next_bit(port->allocated, num_cmd_slots, tag); - if (tag >= num_cmd_slots) - break; - - cmd = &port->commands[tag]; - if (atomic_read(&cmd->active)) - mtip_async_complete(port, tag, dd, -ENODEV); - } - - set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); + blk_mq_end_io(rq, status ? -EIO : 0); } /* @@ -388,8 +301,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) { int group = tag >> 5; - atomic_set(&port->commands[tag].active, 1); - /* guard SACT and CI registers */ spin_lock(&port->cmd_issue_lock[group]); writel((1 << MTIP_TAG_BIT(tag)), @@ -397,10 +308,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) writel((1 << MTIP_TAG_BIT(tag)), port->cmd_issue[MTIP_TAG_INDEX(tag)]); spin_unlock(&port->cmd_issue_lock[group]); - - /* Set the command's timeout value.*/ - port->commands[tag].comp_time = jiffies + msecs_to_jiffies( - MTIP_NCQ_COMMAND_TIMEOUT_MS); } /* @@ -648,132 +555,13 @@ static void print_tags(struct driver_data *dd, memset(tagmap, 0, sizeof(tagmap)); for (group = SLOTBITS_IN_LONGS; group > 0; group--) - tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ", + tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ", tagbits[group-1]); dev_warn(&dd->pdev->dev, "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); } /* - * Called periodically to see if any read/write commands are - * taking too long to complete. - * - * @data Pointer to the PORT data structure. - * - * return value - * None - */ -static void mtip_timeout_function(unsigned long int data) -{ - struct mtip_port *port = (struct mtip_port *) data; - struct host_to_dev_fis *fis; - struct mtip_cmd *cmd; - int unaligned, tag, cmdto_cnt = 0; - unsigned int bit, group; - unsigned int num_command_slots; - unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; - void (*func)(void *, int); - - if (unlikely(!port)) - return; - - if (unlikely(port->dd->sr)) - return; - - if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(30000)); - return; - } - /* clear the tag accumulator */ - memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); - num_command_slots = port->dd->slot_groups * 32; - - for (tag = 0; tag < num_command_slots; tag++) { - /* - * Skip internal command slot as it has - * its own timeout mechanism - */ - if (tag == MTIP_TAG_INTERNAL) - continue; - - if (atomic_read(&port->commands[tag].active) && - (time_after(jiffies, port->commands[tag].comp_time))) { - group = tag >> 5; - bit = tag & 0x1F; - - cmd = &port->commands[tag]; - fis = (struct host_to_dev_fis *) cmd->command; - - set_bit(tag, tagaccum); - cmdto_cnt++; - if (cmdto_cnt == 1) - set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - - /* - * Clear the completed bit. This should prevent - * any interrupt handlers from trying to retire - * the command. - */ - writel(1 << bit, port->completed[group]); - - /* Clear the active flag for the command */ - atomic_set(&port->commands[tag].active, 0); - - func = cmd->async_callback; - if (func && - cmpxchg(&cmd->async_callback, func, 0) == func) { - - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&port->dd->pdev->dev, - cmd->sg, - cmd->scatter_ents, - cmd->direction); - - func(cmd->async_data, -EIO); - unaligned = cmd->unaligned; - - /* Clear the allocated bit for the command. */ - release_slot(port, tag); - - if (unaligned) - up(&port->cmd_slot_unal); - else - up(&port->cmd_slot); - } - } - } - - if (cmdto_cnt) { - print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); - if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { - mtip_device_reset(port->dd); - wake_up_interruptible(&port->svc_wait); - } - clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - } - - if (port->ic_pause_timer) { - to = port->ic_pause_timer + msecs_to_jiffies(1000); - if (time_after(jiffies, to)) { - if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { - port->ic_pause_timer = 0; - clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); - clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); - clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); - wake_up_interruptible(&port->svc_wait); - } - - - } - } - - /* Restart the timer */ - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); -} - -/* * Internal command completion callback function. * * This function is normally called by the driver ISR when an internal @@ -789,28 +577,19 @@ static void mtip_timeout_function(unsigned long int data) * None */ static void mtip_completion(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *command, int status) { - struct mtip_cmd *command = &port->commands[tag]; - struct completion *waiting = data; + struct completion *waiting = command->comp_data; if (unlikely(status == PORT_IRQ_TF_ERR)) dev_warn(&port->dd->pdev->dev, "Internal command %d completed with TFE\n", tag); - command->async_callback = NULL; - command->comp_func = NULL; - complete(waiting); } static void mtip_null_completion(struct mtip_port *port, - int tag, - void *data, - int status) + int tag, struct mtip_cmd *command, int status) { - return; } static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, @@ -842,19 +621,16 @@ static void mtip_handle_tfe(struct driver_data *dd) port = dd->port; - /* Stop the timer to prevent command timeouts. */ - del_timer(&port->cmd_timer); set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && test_bit(MTIP_TAG_INTERNAL, port->allocated)) { - cmd = &port->commands[MTIP_TAG_INTERNAL]; + cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); - atomic_inc(&cmd->active); /* active > 1 indicates error */ if (cmd->comp_data && cmd->comp_func) { cmd->comp_func(port, MTIP_TAG_INTERNAL, - cmd->comp_data, PORT_IRQ_TF_ERR); + cmd, PORT_IRQ_TF_ERR); } goto handle_tfe_exit; } @@ -866,6 +642,8 @@ static void mtip_handle_tfe(struct driver_data *dd) for (group = 0; group < dd->slot_groups; group++) { completed = readl(port->completed[group]); + dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed); + /* clear completed status register in the hardware.*/ writel(completed, port->completed[group]); @@ -879,15 +657,11 @@ static void mtip_handle_tfe(struct driver_data *dd) if (tag == MTIP_TAG_INTERNAL) continue; - cmd = &port->commands[tag]; + cmd = mtip_cmd_from_tag(dd, tag); if (likely(cmd->comp_func)) { set_bit(tag, tagaccum); cmd_cnt++; - atomic_set(&cmd->active, 0); - cmd->comp_func(port, - tag, - cmd->comp_data, - 0); + cmd->comp_func(port, tag, cmd, 0); } else { dev_err(&port->dd->pdev->dev, "Missing completion func for tag %d", @@ -947,11 +721,7 @@ static void mtip_handle_tfe(struct driver_data *dd) for (bit = 0; bit < 32; bit++) { reissue = 1; tag = (group << 5) + bit; - cmd = &port->commands[tag]; - - /* If the active bit is set re-issue the command */ - if (atomic_read(&cmd->active) == 0) - continue; + cmd = mtip_cmd_from_tag(dd, tag); fis = (struct host_to_dev_fis *)cmd->command; @@ -970,11 +740,9 @@ static void mtip_handle_tfe(struct driver_data *dd) tag, fail_reason != NULL ? fail_reason : "unknown"); - atomic_set(&cmd->active, 0); if (cmd->comp_func) { cmd->comp_func(port, tag, - cmd->comp_data, - -ENODATA); + cmd, -ENODATA); } continue; } @@ -997,14 +765,9 @@ static void mtip_handle_tfe(struct driver_data *dd) /* Retire a command that will not be reissued */ dev_warn(&port->dd->pdev->dev, "retiring tag %d\n", tag); - atomic_set(&cmd->active, 0); if (cmd->comp_func) - cmd->comp_func( - port, - tag, - cmd->comp_data, - PORT_IRQ_TF_ERR); + cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR); else dev_warn(&port->dd->pdev->dev, "Bad completion for tag %d\n", @@ -1017,9 +780,6 @@ handle_tfe_exit: /* clear eh_active */ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); - - mod_timer(&port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); } /* @@ -1048,15 +808,10 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, if (unlikely(tag == MTIP_TAG_INTERNAL)) continue; - command = &port->commands[tag]; - /* make internal callback */ - if (likely(command->comp_func)) { - command->comp_func( - port, - tag, - command->comp_data, - 0); - } else { + command = mtip_cmd_from_tag(dd, tag); + if (likely(command->comp_func)) + command->comp_func(port, tag, command, 0); + else { dev_dbg(&dd->pdev->dev, "Null completion for tag %d", tag); @@ -1081,16 +836,13 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) { struct mtip_port *port = dd->port; - struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; + struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL))) { if (cmd->comp_func) { - cmd->comp_func(port, - MTIP_TAG_INTERNAL, - cmd->comp_data, - 0); + cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0); return; } } @@ -1103,8 +855,6 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) */ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) { - if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) - mtip_handle_tfe(dd); if (unlikely(port_stat & PORT_IRQ_CONNECT)) { dev_warn(&dd->pdev->dev, @@ -1122,6 +872,12 @@ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) dev_warn(&dd->pdev->dev, "Port stat errors %x unhandled\n", (port_stat & ~PORT_IRQ_HANDLED)); + if (mtip_check_surprise_removal(dd->pdev)) + return; + } + if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) { + set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); } } @@ -1222,7 +978,6 @@ static irqreturn_t mtip_irq_handler(int irq, void *instance) static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) { - atomic_set(&port->commands[tag].active, 1); writel(1 << MTIP_TAG_BIT(tag), port->cmd_issue[MTIP_TAG_INDEX(tag)]); } @@ -1280,6 +1035,8 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) unsigned int n; unsigned int active = 1; + blk_mq_stop_hw_queues(port->dd->queue); + to = jiffies + msecs_to_jiffies(timeout); do { if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && @@ -1287,8 +1044,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) msleep(20); continue; /* svc thd is actively issuing commands */ } + + msleep(100); + if (mtip_check_surprise_removal(port->dd->pdev)) + goto err_fault; if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) - return -EFAULT; + goto err_fault; + /* * Ignore s_active bit 0 of array element 0. * This bit will always be set @@ -1299,11 +1061,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) if (!active) break; - - msleep(20); } while (time_before(jiffies, to)); + blk_mq_start_stopped_hw_queues(port->dd->queue, true); return active ? -EBUSY : 0; +err_fault: + blk_mq_start_stopped_hw_queues(port->dd->queue, true); + return -EFAULT; } /* @@ -1335,10 +1099,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, { struct mtip_cmd_sg *command_sg; DECLARE_COMPLETION_ONSTACK(wait); - int rv = 0, ready2go = 1; - struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL]; - unsigned long to; + struct mtip_cmd *int_cmd; struct driver_data *dd = port->dd; + int rv = 0; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { @@ -1346,19 +1109,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - to = jiffies + msecs_to_jiffies(timeout); - do { - ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL, - port->allocated); - if (ready2go) - break; - mdelay(100); - } while (time_before(jiffies, to)); - if (!ready2go) { - dev_warn(&dd->pdev->dev, - "Internal cmd active. new cmd [%02X]\n", fis->command); - return -EBUSY; - } + int_cmd = mtip_get_int_command(dd); + set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); port->ic_pause_timer = 0; @@ -1368,10 +1120,11 @@ static int mtip_exec_internal_command(struct mtip_port *port, if (atomic == GFP_KERNEL) { if (fis->command != ATA_CMD_STANDBYNOW1) { /* wait for io to complete if non atomic */ - if (mtip_quiesce_io(port, 5000) < 0) { + if (mtip_quiesce_io(port, + MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) { dev_warn(&dd->pdev->dev, "Failed to quiesce IO\n"); - release_slot(port, MTIP_TAG_INTERNAL); + mtip_put_int_command(dd, int_cmd); clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); return -EBUSY; @@ -1416,9 +1169,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, if (atomic == GFP_KERNEL) { /* Wait for the command to complete or timeout. */ - if (wait_for_completion_interruptible_timeout( + if ((rv = wait_for_completion_interruptible_timeout( &wait, - msecs_to_jiffies(timeout)) <= 0) { + msecs_to_jiffies(timeout))) <= 0) { if (rv == -ERESTARTSYS) { /* interrupted */ dev_err(&dd->pdev->dev, "Internal command [%02X] was interrupted after %lu ms\n", @@ -1497,8 +1250,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, } exec_ic_exit: /* Clear the allocated and active bits for the internal command. */ - atomic_set(&int_cmd->active, 0); - release_slot(port, MTIP_TAG_INTERNAL); + mtip_put_int_command(dd, int_cmd); if (rv >= 0 && mtip_pause_ncq(port, fis)) { /* NCQ paused */ return rv; @@ -1529,6 +1281,37 @@ static inline void ata_swap_string(u16 *buf, unsigned int len) be16_to_cpus(&buf[i]); } +static void mtip_set_timeout(struct driver_data *dd, + struct host_to_dev_fis *fis, + unsigned int *timeout, u8 erasemode) +{ + switch (fis->command) { + case ATA_CMD_DOWNLOAD_MICRO: + *timeout = 120000; /* 2 minutes */ + break; + case ATA_CMD_SEC_ERASE_UNIT: + case 0xFC: + if (erasemode) + *timeout = ((*(dd->port->identify + 90) * 2) * 60000); + else + *timeout = ((*(dd->port->identify + 89) * 2) * 60000); + break; + case ATA_CMD_STANDBYNOW1: + *timeout = 120000; /* 2 minutes */ + break; + case 0xF7: + case 0xFA: + *timeout = 60000; /* 60 seconds */ + break; + case ATA_CMD_SMART: + *timeout = 15000; /* 15 seconds */ + break; + default: + *timeout = MTIP_IOCTL_CMD_TIMEOUT_MS; + break; + } +} + /* * Request the device identity information. * @@ -1576,7 +1359,7 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) sizeof(u16) * ATA_ID_WORDS, 0, GFP_KERNEL, - MTIP_INTERNAL_COMMAND_TIMEOUT_MS) + MTIP_INT_CMD_TIMEOUT_MS) < 0) { rv = -1; goto out; @@ -1644,6 +1427,7 @@ static int mtip_standby_immediate(struct mtip_port *port) int rv; struct host_to_dev_fis fis; unsigned long start; + unsigned int timeout; /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -1651,6 +1435,8 @@ static int mtip_standby_immediate(struct mtip_port *port) fis.opts = 1 << 7; fis.command = ATA_CMD_STANDBYNOW1; + mtip_set_timeout(port->dd, &fis, &timeout, 0); + start = jiffies; rv = mtip_exec_internal_command(port, &fis, @@ -1659,7 +1445,7 @@ static int mtip_standby_immediate(struct mtip_port *port) 0, 0, GFP_ATOMIC, - 15000); + timeout); dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", jiffies_to_msecs(jiffies - start)); if (rv) @@ -1705,7 +1491,7 @@ static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, sectors * ATA_SECT_SIZE, 0, GFP_ATOMIC, - MTIP_INTERNAL_COMMAND_TIMEOUT_MS); + MTIP_INT_CMD_TIMEOUT_MS); } /* @@ -1998,6 +1784,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) { struct host_to_dev_fis fis; struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); + unsigned int to; /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -2011,6 +1798,8 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) fis.cyl_hi = command[5]; fis.device = command[6] & ~0x10; /* Clear the dev bit*/ + mtip_set_timeout(port->dd, &fis, &to, 0); + dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n", __func__, command[0], @@ -2029,7 +1818,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) 0, 0, GFP_KERNEL, - MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { + to) < 0) { return -1; } @@ -2069,6 +1858,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, u8 *buf = NULL; dma_addr_t dma_addr = 0; int rv = 0, xfer_sz = command[3]; + unsigned int to; if (xfer_sz) { if (!user_buffer) @@ -2100,6 +1890,8 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, fis.cyl_hi = 0xC2; } + mtip_set_timeout(port->dd, &fis, &to, 0); + if (xfer_sz) reply = (port->rxfis + RX_FIS_PIO_SETUP); else @@ -2122,7 +1914,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), 0, GFP_KERNEL, - MTIP_IOCTL_COMMAND_TIMEOUT_MS) + to) < 0) { rv = -EFAULT; goto exit_drive_command; @@ -2202,36 +1994,6 @@ static unsigned int implicit_sector(unsigned char command, } return rv; } -static void mtip_set_timeout(struct driver_data *dd, - struct host_to_dev_fis *fis, - unsigned int *timeout, u8 erasemode) -{ - switch (fis->command) { - case ATA_CMD_DOWNLOAD_MICRO: - *timeout = 120000; /* 2 minutes */ - break; - case ATA_CMD_SEC_ERASE_UNIT: - case 0xFC: - if (erasemode) - *timeout = ((*(dd->port->identify + 90) * 2) * 60000); - else - *timeout = ((*(dd->port->identify + 89) * 2) * 60000); - break; - case ATA_CMD_STANDBYNOW1: - *timeout = 120000; /* 2 minutes */ - break; - case 0xF7: - case 0xFA: - *timeout = 60000; /* 60 seconds */ - break; - case ATA_CMD_SMART: - *timeout = 15000; /* 15 seconds */ - break; - default: - *timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; - break; - } -} /* * Executes a taskfile @@ -2606,22 +2368,23 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * return value * None */ -static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, - int nsect, int nents, int tag, void *callback, - void *data, int dir, int unaligned) +static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, + struct mtip_cmd *command, int nents, + struct blk_mq_hw_ctx *hctx) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; - struct mtip_cmd *command = &port->commands[tag]; - int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - u64 start = sector; + int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + u64 start = blk_rq_pos(rq); + unsigned int nsect = blk_rq_sectors(rq); /* Map the scatter list for DMA access */ nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); + prefetch(&port->flags); + command->scatter_ents = nents; - command->unaligned = unaligned; /* * The number of retries for this command before it is * reported as a failure to the upper layers. @@ -2632,8 +2395,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis = command->command; fis->type = 0x27; fis->opts = 1 << 7; - fis->command = - (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); + if (dma_dir == DMA_FROM_DEVICE) + fis->command = ATA_CMD_FPDMA_READ; + else + fis->command = ATA_CMD_FPDMA_WRITE; fis->lba_low = start & 0xFF; fis->lba_mid = (start >> 8) & 0xFF; fis->lba_hi = (start >> 16) & 0xFF; @@ -2643,14 +2408,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis->device = 1 << 6; fis->features = nsect & 0xFF; fis->features_ex = (nsect >> 8) & 0xFF; - fis->sect_count = ((tag << 3) | (tag >> 5)); + fis->sect_count = ((rq->tag << 3) | (rq->tag >> 5)); fis->sect_cnt_ex = 0; fis->control = 0; fis->res2 = 0; fis->res3 = 0; fill_command_sg(dd, command, nents); - if (unaligned) + if (unlikely(command->unaligned)) fis->device |= 1 << 7; /* Populate the command header */ @@ -2668,81 +2433,17 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->direction = dma_dir; /* - * Set the completion function and data for the command passed - * from the upper layer. - */ - command->async_data = data; - command->async_callback = callback; - - /* * To prevent this command from being issued * if an internal command is in progress or error handling is active. */ - if (port->flags & MTIP_PF_PAUSE_IO) { - set_bit(tag, port->cmds_to_issue); + if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) { + set_bit(rq->tag, port->cmds_to_issue); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); return; } /* Issue the command to the hardware */ - mtip_issue_ncq_command(port, tag); - - return; -} - -/* - * Release a command slot. - * - * @dd Pointer to the driver data structure. - * @tag Slot tag - * - * return value - * None - */ -static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, - int unaligned) -{ - struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : - &dd->port->cmd_slot; - release_slot(dd->port, tag); - up(sem); -} - -/* - * Obtain a command slot and return its associated scatter list. - * - * @dd Pointer to the driver data structure. - * @tag Pointer to an int that will receive the allocated command - * slot tag. - * - * return value - * Pointer to the scatter list for the allocated command slot - * or NULL if no command slots are available. - */ -static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, - int *tag, int unaligned) -{ - struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : - &dd->port->cmd_slot; - - /* - * It is possible that, even with this semaphore, a thread - * may think that no command slots are available. Therefore, we - * need to make an attempt to get_slot(). - */ - down(sem); - *tag = get_slot(dd->port); - - if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - up(sem); - return NULL; - } - if (unlikely(*tag < 0)) { - up(sem); - return NULL; - } - - return dd->port->commands[*tag].sg; + mtip_issue_ncq_command(port, rq->tag); } /* @@ -3113,6 +2814,7 @@ static int mtip_free_orphan(struct driver_data *dd) if (dd->queue) { dd->queue->queuedata = NULL; blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); dd->queue = NULL; } } @@ -3270,6 +2972,11 @@ static int mtip_service_thread(void *data) int ret; while (1) { + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + /* * the condition is to check neither an internal command is * is in progress nor error handling is active @@ -3277,11 +2984,12 @@ static int mtip_service_thread(void *data) wait_event_interruptible(port->svc_wait, (port->flags) && !(port->flags & MTIP_PF_PAUSE_IO)); - if (kthread_should_stop()) - goto st_out; - set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + /* If I am an orphan, start self cleanup */ if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) break; @@ -3290,6 +2998,16 @@ static int mtip_service_thread(void *data) &dd->dd_flag))) goto st_out; +restart_eh: + /* Demux bits: start with error handling */ + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) { + mtip_handle_tfe(dd); + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + } + + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) + goto restart_eh; + if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { slot = 1; /* used to restrict the loop to one iteration */ @@ -3319,16 +3037,14 @@ static int mtip_service_thread(void *data) } clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); - } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { + } + + if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { if (mtip_ftl_rebuild_poll(dd) < 0) set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag); clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); } - clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); - - if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) - goto st_out; } /* wait for pci remove to exit */ @@ -3365,7 +3081,6 @@ st_out: */ static void mtip_dma_free(struct driver_data *dd) { - int i; struct mtip_port *port = dd->port; if (port->block1) @@ -3376,13 +3091,6 @@ static void mtip_dma_free(struct driver_data *dd) dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, port->command_list, port->command_list_dma); } - - for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { - if (port->commands[i].command) - dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, - port->commands[i].command, - port->commands[i].command_dma); - } } /* @@ -3396,8 +3104,6 @@ static void mtip_dma_free(struct driver_data *dd) static int mtip_dma_alloc(struct driver_data *dd) { struct mtip_port *port = dd->port; - int i, rv = 0; - u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ port->block1 = @@ -3430,41 +3136,63 @@ static int mtip_dma_alloc(struct driver_data *dd) port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET; port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; - /* Setup per command SGL DMA region */ - - /* Point the command headers at the command tables */ - for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { - port->commands[i].command = - dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, - &port->commands[i].command_dma, GFP_KERNEL); - if (!port->commands[i].command) { - rv = -ENOMEM; - mtip_dma_free(dd); - return rv; - } - memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ); - - port->commands[i].command_header = port->command_list + - (sizeof(struct mtip_cmd_hdr) * i); - port->commands[i].command_header_dma = - dd->port->command_list_dma + - (sizeof(struct mtip_cmd_hdr) * i); + return 0; +} - if (host_cap_64) - port->commands[i].command_header->ctbau = - __force_bit2int cpu_to_le32( - (port->commands[i].command_dma >> 16) >> 16); +static int mtip_hw_get_identify(struct driver_data *dd) +{ + struct smart_attr attr242; + unsigned char *buf; + int rv; - port->commands[i].command_header->ctba = - __force_bit2int cpu_to_le32( - port->commands[i].command_dma & 0xFFFFFFFF); + if (mtip_get_identify(dd->port, NULL) < 0) + return -EFAULT; - sg_init_table(port->commands[i].sg, MTIP_MAX_SG); + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); + return MTIP_FTL_REBUILD_MAGIC; + } + mtip_dump_identify(dd->port); - /* Mark command as currently inactive */ - atomic_set(&dd->port->commands[i].active, 0); + /* check write protect, over temp and rebuild statuses */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + /* TODO */ + } } - return 0; + + /* get write protect progess */ + memset(&attr242, 0, sizeof(struct smart_attr)); + if (mtip_get_smart_attr(dd->port, 242, &attr242)) + dev_warn(&dd->pdev->dev, + "Unable to check write protect progress\n"); + else + dev_info(&dd->pdev->dev, + "Write protect progress: %u%% (%u blocks)\n", + attr242.cur, le32_to_cpu(attr242.data)); + + return rv; } /* @@ -3481,8 +3209,6 @@ static int mtip_hw_init(struct driver_data *dd) int rv; unsigned int num_command_slots; unsigned long timeout, timetaken; - unsigned char *buf; - struct smart_attr attr242; dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; @@ -3513,8 +3239,6 @@ static int mtip_hw_init(struct driver_data *dd) else dd->unal_qdepth = 0; - /* Counting semaphore to track command slot usage */ - sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth); sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); /* Spinlock to prevent concurrent issue */ @@ -3599,73 +3323,16 @@ static int mtip_hw_init(struct driver_data *dd) writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, dd->mmio + HOST_CTL); - init_timer(&dd->port->cmd_timer); init_waitqueue_head(&dd->port->svc_wait); - dd->port->cmd_timer.data = (unsigned long int) dd->port; - dd->port->cmd_timer.function = mtip_timeout_function; - mod_timer(&dd->port->cmd_timer, - jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); - - if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { rv = -EFAULT; goto out3; } - if (mtip_get_identify(dd->port, NULL) < 0) { - rv = -EFAULT; - goto out3; - } - mtip_dump_identify(dd->port); - - if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == - MTIP_FTL_REBUILD_MAGIC) { - set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); - return MTIP_FTL_REBUILD_MAGIC; - } - - /* check write protect, over temp and rebuild statuses */ - rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, - dd->port->log_buf, - dd->port->log_buf_dma, 1); - if (rv) { - dev_warn(&dd->pdev->dev, - "Error in READ LOG EXT (10h) command\n"); - /* non-critical error, don't fail the load */ - } else { - buf = (unsigned char *)dd->port->log_buf; - if (buf[259] & 0x1) { - dev_info(&dd->pdev->dev, - "Write protect bit is set.\n"); - set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); - } - if (buf[288] == 0xF7) { - dev_info(&dd->pdev->dev, - "Exceeded Tmax, drive in thermal shutdown.\n"); - set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); - } - if (buf[288] == 0xBF) { - dev_info(&dd->pdev->dev, - "Drive is in security locked state.\n"); - set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); - } - } - - /* get write protect progess */ - memset(&attr242, 0, sizeof(struct smart_attr)); - if (mtip_get_smart_attr(dd->port, 242, &attr242)) - dev_warn(&dd->pdev->dev, - "Unable to check write protect progress\n"); - else - dev_info(&dd->pdev->dev, - "Write protect progress: %u%% (%u blocks)\n", - attr242.cur, le32_to_cpu(attr242.data)); return rv; out3: - del_timer_sync(&dd->port->cmd_timer); - /* Disable interrupts on the HBA. */ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, dd->mmio + HOST_CTL); @@ -3685,6 +3352,22 @@ out1: return rv; } +static void mtip_standby_drive(struct driver_data *dd) +{ + if (dd->sr) + return; + + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && + !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) + if (mtip_standby_immediate(dd->port)) + dev_warn(&dd->pdev->dev, + "STANDBY IMMEDIATE failed\n"); +} + /* * Called to deinitialize an interface. * @@ -3700,12 +3383,6 @@ static int mtip_hw_exit(struct driver_data *dd) * saves its state. */ if (!dd->sr) { - if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && - !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) - if (mtip_standby_immediate(dd->port)) - dev_warn(&dd->pdev->dev, - "STANDBY IMMEDIATE failed\n"); - /* de-initialize the port. */ mtip_deinit_port(dd->port); @@ -3714,8 +3391,6 @@ static int mtip_hw_exit(struct driver_data *dd) dd->mmio + HOST_CTL); } - del_timer_sync(&dd->port->cmd_timer); - /* Release the IRQ. */ irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); @@ -4032,100 +3707,138 @@ static const struct block_device_operations mtip_block_ops = { * * @queue Pointer to the request queue. Unused other than to obtain * the driver data structure. - * @bio Pointer to the BIO. + * @rq Pointer to the request. * */ -static void mtip_make_request(struct request_queue *queue, struct bio *bio) +static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct driver_data *dd = queue->queuedata; - struct scatterlist *sg; - struct bio_vec bvec; - struct bvec_iter iter; - int nents = 0; - int tag = 0, unaligned = 0; + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + unsigned int nents; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENXIO); - return; + return -ENXIO; } if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENODATA); - return; + return -ENODATA; } if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) && - bio_data_dir(bio))) { - bio_endio(bio, -ENODATA); - return; - } - if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) { - bio_endio(bio, -ENODATA); - return; - } - if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { - bio_endio(bio, -ENXIO); - return; + rq_data_dir(rq))) { + return -ENODATA; } + if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) + return -ENODATA; + if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) + return -ENXIO; } - if (unlikely(bio->bi_rw & REQ_DISCARD)) { - bio_endio(bio, mtip_send_trim(dd, bio->bi_iter.bi_sector, - bio_sectors(bio))); - return; - } + if (rq->cmd_flags & REQ_DISCARD) { + int err; - if (unlikely(!bio_has_data(bio))) { - blk_queue_flush(queue, 0); - bio_endio(bio, 0); - return; + err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); + blk_mq_end_io(rq, err); + return 0; } - if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && - dd->unal_qdepth) { - if (bio->bi_iter.bi_sector % 8 != 0) - /* Unaligned on 4k boundaries */ - unaligned = 1; - else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ - unaligned = 1; + /* Create the scatter list for this request. */ + nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg); + + /* Issue the read/write. */ + mtip_hw_submit_io(dd, rq, cmd, nents, hctx); + return 0; +} + +static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (rq_data_dir(rq) == READ || !dd->unal_qdepth) + return false; + + /* + * If unaligned depth must be limited on this controller, mark it + * as unaligned if the IO isn't on a 4k boundary (start of length). + */ + if (blk_rq_sectors(rq) <= 64) { + if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7)) + cmd->unaligned = 1; } - sg = mtip_hw_get_scatterlist(dd, &tag, unaligned); - if (likely(sg != NULL)) { - blk_queue_bounce(queue, &bio); + if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal)) + return true; - if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) { - dev_warn(&dd->pdev->dev, - "Maximum number of SGL entries exceeded\n"); - bio_io_error(bio); - mtip_hw_release_scatterlist(dd, tag, unaligned); - return; - } + return false; +} - /* Create the scatter list for this bio. */ - bio_for_each_segment(bvec, bio, iter) { - sg_set_page(&sg[nents], - bvec.bv_page, - bvec.bv_len, - bvec.bv_offset); - nents++; - } +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + int ret; - /* Issue the read/write. */ - mtip_hw_submit_io(dd, - bio->bi_iter.bi_sector, - bio_sectors(bio), - nents, - tag, - bio_endio, - bio, - bio_data_dir(bio), - unaligned); - } else - bio_io_error(bio); + if (unlikely(mtip_check_unal_depth(hctx, rq))) + return BLK_MQ_RQ_QUEUE_BUSY; + + ret = mtip_submit_request(hctx, rq); + if (likely(!ret)) + return BLK_MQ_RQ_QUEUE_OK; + + rq->errors = ret; + return BLK_MQ_RQ_QUEUE_ERROR; +} + +static void mtip_free_cmd(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (!cmd->command) + return; + + dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + cmd->command, cmd->command_dma); +} + +static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, + unsigned int request_idx, unsigned int numa_node) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; + + cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + &cmd->command_dma, GFP_KERNEL); + if (!cmd->command) + return -ENOMEM; + + memset(cmd->command, 0, CMD_DMA_ALLOC_SZ); + + /* Point the command headers at the command tables. */ + cmd->command_header = dd->port->command_list + + (sizeof(struct mtip_cmd_hdr) * request_idx); + cmd->command_header_dma = dd->port->command_list_dma + + (sizeof(struct mtip_cmd_hdr) * request_idx); + + if (host_cap_64) + cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16); + + cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); + + sg_init_table(cmd->sg, MTIP_MAX_SG); + return 0; } +static struct blk_mq_ops mtip_mq_ops = { + .queue_rq = mtip_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = mtip_init_cmd, + .exit_request = mtip_free_cmd, +}; + /* * Block layer initialization function. * @@ -4148,11 +3861,7 @@ static int mtip_block_initialize(struct driver_data *dd) if (dd->disk) goto skip_create_disk; /* hw init done, before rebuild */ - /* Initialize the protocol layer. */ - wait_for_rebuild = mtip_hw_init(dd); - if (wait_for_rebuild < 0) { - dev_err(&dd->pdev->dev, - "Protocol layer initialization failed\n"); + if (mtip_hw_init(dd)) { rv = -EINVAL; goto protocol_init_error; } @@ -4194,29 +3903,53 @@ static int mtip_block_initialize(struct driver_data *dd) mtip_hw_debugfs_init(dd); - /* - * if rebuild pending, start the service thread, and delay the block - * queue creation and add_disk() - */ - if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) - goto start_service_thread; - skip_create_disk: - /* Allocate the request queue. */ - dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node); - if (dd->queue == NULL) { + memset(&dd->tags, 0, sizeof(dd->tags)); + dd->tags.ops = &mtip_mq_ops; + dd->tags.nr_hw_queues = 1; + dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS; + dd->tags.reserved_tags = 1; + dd->tags.cmd_size = sizeof(struct mtip_cmd); + dd->tags.numa_node = dd->numa_node; + dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; + dd->tags.driver_data = dd; + + rv = blk_mq_alloc_tag_set(&dd->tags); + if (rv) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); rv = -ENOMEM; goto block_queue_alloc_init_error; } - /* Attach our request function to the request queue. */ - blk_queue_make_request(dd->queue, mtip_make_request); + /* Allocate the request queue. */ + dd->queue = blk_mq_init_queue(&dd->tags); + if (IS_ERR(dd->queue)) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + rv = -ENOMEM; + goto block_queue_alloc_init_error; + } dd->disk->queue = dd->queue; dd->queue->queuedata = dd; + /* Initialize the protocol layer. */ + wait_for_rebuild = mtip_hw_get_identify(dd); + if (wait_for_rebuild < 0) { + dev_err(&dd->pdev->dev, + "Protocol layer initialization failed\n"); + rv = -EINVAL; + goto init_hw_cmds_error; + } + + /* + * if rebuild pending, start the service thread, and delay the block + * queue creation and add_disk() + */ + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + goto start_service_thread; + /* Set device limits. */ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); blk_queue_max_segments(dd->queue, MTIP_MAX_SG); @@ -4295,8 +4028,9 @@ kthread_run_error: del_gendisk(dd->disk); read_capacity_error: +init_hw_cmds_error: blk_cleanup_queue(dd->queue); - + blk_mq_free_tag_set(&dd->tags); block_queue_alloc_init_error: mtip_hw_debugfs_exit(dd); disk_index_error: @@ -4345,6 +4079,9 @@ static int mtip_block_remove(struct driver_data *dd) kobject_put(kobj); } } + + mtip_standby_drive(dd); + /* * Delete our gendisk structure. This also removes the device * from /dev @@ -4357,6 +4094,7 @@ static int mtip_block_remove(struct driver_data *dd) if (dd->disk->queue) { del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); dd->queue = NULL; } else put_disk(dd->disk); @@ -4391,6 +4129,8 @@ static int mtip_block_remove(struct driver_data *dd) */ static int mtip_block_shutdown(struct driver_data *dd) { + mtip_hw_shutdown(dd); + /* Delete our gendisk structure, and cleanup the blk queue. */ if (dd->disk) { dev_info(&dd->pdev->dev, @@ -4399,6 +4139,7 @@ static int mtip_block_shutdown(struct driver_data *dd) if (dd->disk->queue) { del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); } else put_disk(dd->disk); dd->disk = NULL; @@ -4408,8 +4149,6 @@ static int mtip_block_shutdown(struct driver_data *dd) spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, dd->index); spin_unlock(&rssd_index_lock); - - mtip_hw_shutdown(dd); return 0; } @@ -4479,6 +4218,57 @@ static DEFINE_HANDLER(5); static DEFINE_HANDLER(6); static DEFINE_HANDLER(7); +static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev) +{ + int pos; + unsigned short pcie_dev_ctrl; + + pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (pos) { + pci_read_config_word(pdev, + pos + PCI_EXP_DEVCTL, + &pcie_dev_ctrl); + if (pcie_dev_ctrl & (1 << 11) || + pcie_dev_ctrl & (1 << 4)) { + dev_info(&dd->pdev->dev, + "Disabling ERO/No-Snoop on bridge device %04x:%04x\n", + pdev->vendor, pdev->device); + pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN | + PCI_EXP_DEVCTL_RELAX_EN); + pci_write_config_word(pdev, + pos + PCI_EXP_DEVCTL, + pcie_dev_ctrl); + } + } +} + +static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev) +{ + /* + * This workaround is specific to AMD/ATI chipset with a PCI upstream + * device with device id 0x5aXX + */ + if (pdev->bus && pdev->bus->self) { + if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI && + ((pdev->bus->self->device & 0xff00) == 0x5a00)) { + mtip_disable_link_opts(dd, pdev->bus->self); + } else { + /* Check further up the topology */ + struct pci_dev *parent_dev = pdev->bus->self; + if (parent_dev->bus && + parent_dev->bus->parent && + parent_dev->bus->parent->self && + parent_dev->bus->parent->self->vendor == + PCI_VENDOR_ID_ATI && + (parent_dev->bus->parent->self->device & + 0xff00) == 0x5a00) { + mtip_disable_link_opts(dd, + parent_dev->bus->parent->self); + } + } + } +} + /* * Called for each supported PCI device detected. * @@ -4630,6 +4420,8 @@ static int mtip_pci_probe(struct pci_dev *pdev, goto msi_initialize_err; } + mtip_fix_ero_nosnoop(dd, pdev); + /* Initialize the block layer. */ rv = mtip_block_initialize(dd); if (rv < 0) { @@ -4710,8 +4502,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) dev_warn(&dd->pdev->dev, "Completion workers still active!\n"); } - /* Cleanup the outstanding commands */ - mtip_command_cleanup(dd); /* Clean up the block layer. */ mtip_block_remove(dd); @@ -4737,8 +4527,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); - pci_dev_put(pdev); - } /* @@ -4935,13 +4723,13 @@ static int __init mtip_init(void) */ static void __exit mtip_exit(void) { - debugfs_remove_recursive(dfs_parent); - /* Release the allocated major block device number. */ unregister_blkdev(mtip_major, MTIP_DRV_NAME); /* Unregister the PCI driver. */ pci_unregister_driver(&mtip_pci_driver); + + debugfs_remove_recursive(dfs_parent); } MODULE_AUTHOR("Micron Technology, Inc"); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index ffb955e7ccb..ba1b31ee22e 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -40,9 +40,11 @@ #define MTIP_MAX_RETRIES 2 /* Various timeout values in ms */ -#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000 -#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000 -#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000 +#define MTIP_NCQ_CMD_TIMEOUT_MS 15000 +#define MTIP_IOCTL_CMD_TIMEOUT_MS 5000 +#define MTIP_INT_CMD_TIMEOUT_MS 5000 +#define MTIP_QUIESCE_IO_TIMEOUT_MS (MTIP_NCQ_CMD_TIMEOUT_MS * \ + (MTIP_MAX_RETRIES + 1)) /* check for timeouts every 500ms */ #define MTIP_TIMEOUT_CHECK_PERIOD 500 @@ -331,12 +333,8 @@ struct mtip_cmd { */ void (*comp_func)(struct mtip_port *port, int tag, - void *data, + struct mtip_cmd *cmd, int status); - /* Additional callback function that may be called by comp_func() */ - void (*async_callback)(void *data, int status); - - void *async_data; /* Addl. data passed to async_callback() */ int scatter_ents; /* Number of scatter list entries used */ @@ -347,10 +345,6 @@ struct mtip_cmd { int retries; /* The number of retries left for this command. */ int direction; /* Data transfer direction */ - - unsigned long comp_time; /* command completion time, in jiffies */ - - atomic_t active; /* declares if this command sent to the drive. */ }; /* Structure used to describe a port. */ @@ -436,12 +430,6 @@ struct mtip_port { * or error handling is active */ unsigned long cmds_to_issue[SLOTBITS_IN_LONGS]; - /* - * Array of command slots. Structure includes pointers to the - * command header and command table, and completion function and data - * pointers. - */ - struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS]; /* Used by mtip_service_thread to wait for an event */ wait_queue_head_t svc_wait; /* @@ -452,13 +440,7 @@ struct mtip_port { /* * Timer used to complete commands that have been active for too long. */ - struct timer_list cmd_timer; unsigned long ic_pause_timer; - /* - * Semaphore used to block threads if there are no - * command slots available. - */ - struct semaphore cmd_slot; /* Semaphore to control queue depth of unaligned IOs */ struct semaphore cmd_slot_unal; @@ -485,6 +467,8 @@ struct driver_data { struct request_queue *queue; /* Our request queue. */ + struct blk_mq_tag_set tags; /* blk_mq tags */ + struct mtip_port *port; /* Pointer to the port data structure. */ unsigned product_type; /* magic value declaring the product type */ @@ -509,19 +493,19 @@ struct driver_data { struct workqueue_struct *isr_workq; - struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; - atomic_t irq_workers_active; + struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; + int isr_binding; struct block_device *bdev; - int unal_qdepth; /* qdepth of unaligned IO queue */ - struct list_head online_list; /* linkage for online list */ struct list_head remove_list; /* linkage for removing list */ + + int unal_qdepth; /* qdepth of unaligned IO queue */ }; #endif diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3a70ea2f7cd..fb31b8ee437 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -243,14 +243,11 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) struct nbd_request request; unsigned long size = blk_rq_bytes(req); + memset(&request, 0, sizeof(request)); request.magic = htonl(NBD_REQUEST_MAGIC); request.type = htonl(nbd_cmd(req)); - if (nbd_cmd(req) == NBD_CMD_FLUSH) { - /* Other values are reserved for FLUSH requests. */ - request.from = 0; - request.len = 0; - } else { + if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } @@ -533,7 +530,7 @@ static int nbd_thread(void *data) struct nbd_device *nbd = data; struct request *req; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ wait_event_interruptible(nbd->waiting_wq, diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 091b9ea14fe..77087a29b12 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -32,6 +32,7 @@ struct nullb { unsigned int index; struct request_queue *q; struct gendisk *disk; + struct blk_mq_tag_set tag_set; struct hrtimer timer; unsigned int queue_depth; spinlock_t lock; @@ -202,8 +203,8 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) entry = llist_reverse_order(entry); do { cmd = container_of(entry, struct nullb_cmd, ll_list); - end_cmd(cmd); entry = entry->next; + end_cmd(cmd); } while (entry); } @@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) static void null_softirq_done_fn(struct request *rq) { - end_cmd(rq->special); + end_cmd(blk_mq_rq_to_pdu(rq)); } static inline void null_handle_cmd(struct nullb_cmd *cmd) @@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q) static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct nullb_cmd *cmd = rq->special; + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->rq = rq; cmd->nq = hctx->driver_data; @@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) return BLK_MQ_RQ_QUEUE_OK; } -static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index) -{ - int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes); - int tip = (reg->nr_hw_queues % nr_online_nodes); - int node = 0, i, n; - - /* - * Split submit queues evenly wrt to the number of nodes. If uneven, - * fill the first buckets with one extra, until the rest is filled with - * no extra. - */ - for (i = 0, n = 1; i < hctx_index; i++, n++) { - if (n % b_size == 0) { - n = 0; - node++; - - tip--; - if (!tip) - b_size = reg->nr_hw_queues / nr_online_nodes; - } - } - - /* - * A node might not be online, therefore map the relative node id to the - * real node id. - */ - for_each_online_node(n) { - if (!node) - break; - node--; - } - - return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n); -} - -static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index) -{ - kfree(hctx); -} - static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { BUG_ON(!nullb); @@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = { .complete = null_softirq_done_fn, }; -static struct blk_mq_reg null_mq_reg = { - .ops = &null_mq_ops, - .queue_depth = 64, - .cmd_size = sizeof(struct nullb_cmd), - .flags = BLK_MQ_F_SHOULD_MERGE, -}; - static void null_del_dev(struct nullb *nullb) { list_del_init(&nullb->list); del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); put_disk(nullb->disk); kfree(nullb); } @@ -506,7 +462,7 @@ static int null_add_dev(void) nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); if (!nullb) - return -ENOMEM; + goto out; spin_lock_init(&nullb->lock); @@ -514,49 +470,44 @@ static int null_add_dev(void) submit_queues = nr_online_nodes; if (setup_queues(nullb)) - goto err; + goto out_free_nullb; if (queue_mode == NULL_Q_MQ) { - null_mq_reg.numa_node = home_node; - null_mq_reg.queue_depth = hw_queue_depth; - null_mq_reg.nr_hw_queues = submit_queues; - - if (use_per_node_hctx) { - null_mq_reg.ops->alloc_hctx = null_alloc_hctx; - null_mq_reg.ops->free_hctx = null_free_hctx; - } else { - null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; - null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; - } - - nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); + nullb->tag_set.ops = &null_mq_ops; + nullb->tag_set.nr_hw_queues = submit_queues; + nullb->tag_set.queue_depth = hw_queue_depth; + nullb->tag_set.numa_node = home_node; + nullb->tag_set.cmd_size = sizeof(struct nullb_cmd); + nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + nullb->tag_set.driver_data = nullb; + + if (blk_mq_alloc_tag_set(&nullb->tag_set)) + goto out_cleanup_queues; + + nullb->q = blk_mq_init_queue(&nullb->tag_set); + if (!nullb->q) + goto out_cleanup_tags; } else if (queue_mode == NULL_Q_BIO) { nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); + if (!nullb->q) + goto out_cleanup_queues; blk_queue_make_request(nullb->q, null_queue_bio); init_driver_queues(nullb); } else { nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); + if (!nullb->q) + goto out_cleanup_queues; blk_queue_prep_rq(nullb->q, null_rq_prep_fn); - if (nullb->q) - blk_queue_softirq_done(nullb->q, null_softirq_done_fn); + blk_queue_softirq_done(nullb->q, null_softirq_done_fn); init_driver_queues(nullb); } - if (!nullb->q) - goto queue_fail; - nullb->q->queuedata = nullb; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { -queue_fail: - blk_cleanup_queue(nullb->q); - cleanup_queues(nullb); -err: - kfree(nullb); - return -ENOMEM; - } + if (!disk) + goto out_cleanup_blk_queue; mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); @@ -579,6 +530,18 @@ err: sprintf(disk->disk_name, "nullb%d", nullb->index); add_disk(disk); return 0; + +out_cleanup_blk_queue: + blk_cleanup_queue(nullb->q); +out_cleanup_tags: + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); +out_cleanup_queues: + cleanup_queues(nullb); +out_free_nullb: + kfree(nullb); +out: + return -ENOMEM; } static int __init null_init(void) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e0ac1210fe3..02351e21716 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2826,6 +2826,16 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) return result; } +static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + if (prepare) + nvme_dev_shutdown(dev); + else + nvme_dev_resume(dev); +} + static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); @@ -2891,6 +2901,7 @@ static const struct pci_error_handlers nvme_err_handler = { .link_reset = nvme_link_reset, .slot_reset = nvme_slot_reset, .resume = nvme_error_resume, + .reset_notify = nvme_reset_notify, }; /* Move to pci_ids.h later */ diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index e76bdc074db..719cb1bc164 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * q) pcd_current = cd; pcd_sector = blk_rq_pos(pcd_req); pcd_count = blk_rq_cur_sectors(pcd_req); - pcd_buf = pcd_req->buffer; + pcd_buf = bio_data(pcd_req->bio); pcd_busy = 1; ps_set_intr(do_pcd_read, NULL, 0, nice); return; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 19ad8f0c83e..fea7e76a00d 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -454,7 +454,7 @@ static enum action do_pd_io_start(void) if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) return Fail; pd_run = blk_rq_sectors(pd_req); - pd_buf = pd_req->buffer; + pd_buf = bio_data(pd_req->bio); pd_retries = 0; if (pd_cmd == READ) return do_pd_read_start(); @@ -485,7 +485,7 @@ static int pd_next_buf(void) spin_lock_irqsave(&pd_lock, saved_flags); __blk_end_request_cur(pd_req, 0); pd_count = blk_rq_cur_sectors(pd_req); - pd_buf = pd_req->buffer; + pd_buf = bio_data(pd_req->bio); spin_unlock_irqrestore(&pd_lock, saved_flags); return 0; } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index f5c86d523ba..9a15fd3c934 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -795,7 +795,7 @@ repeat: } pf_cmd = rq_data_dir(pf_req); - pf_buf = pf_req->buffer; + pf_buf = bio_data(pf_req->bio); pf_retries = 0; pf_busy = 1; @@ -827,7 +827,7 @@ static int pf_next_buf(void) if (!pf_req) return 1; pf_count = blk_rq_cur_sectors(pf_req); - pf_buf = pf_req->buffer; + pf_buf = bio_data(pf_req->bio); } return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index a2af73db187..758ac442c5b 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -704,6 +704,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? WRITE : READ, __GFP_WAIT); + blk_rq_set_block_pc(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, @@ -716,7 +717,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); rq->timeout = 60*HZ; - rq->cmd_type = REQ_TYPE_BLOCK_PC; if (cgc->quiet) rq->cmd_flags |= REQ_QUIET; @@ -1463,7 +1463,7 @@ static int kcdrwd(void *foobar) struct packet_data *pkt; long min_sleep_time, residue; - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); set_freezable(); for (;;) { diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4c95b503b09..bbeb404b3a0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -541,7 +541,6 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) return -ENOENT; (void) get_device(&rbd_dev->dev); - set_device_ro(bdev, rbd_dev->mapping.read_only); return 0; } @@ -559,10 +558,76 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) put_device(&rbd_dev->dev); } +static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) +{ + int ret = 0; + int val; + bool ro; + bool ro_changed = false; + + /* get_user() may sleep, so call it before taking rbd_dev->lock */ + if (get_user(val, (int __user *)(arg))) + return -EFAULT; + + ro = val ? true : false; + /* Snapshot doesn't allow to write*/ + if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) + return -EROFS; + + spin_lock_irq(&rbd_dev->lock); + /* prevent others open this device */ + if (rbd_dev->open_count > 1) { + ret = -EBUSY; + goto out; + } + + if (rbd_dev->mapping.read_only != ro) { + rbd_dev->mapping.read_only = ro; + ro_changed = true; + } + +out: + spin_unlock_irq(&rbd_dev->lock); + /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ + if (ret == 0 && ro_changed) + set_disk_ro(rbd_dev->disk, ro ? 1 : 0); + + return ret; +} + +static int rbd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + int ret = 0; + + switch (cmd) { + case BLKROSET: + ret = rbd_ioctl_set_ro(rbd_dev, arg); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return rbd_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + static const struct block_device_operations rbd_bd_ops = { .owner = THIS_MODULE, .open = rbd_open, .release = rbd_release, + .ioctl = rbd_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = rbd_compat_ioctl, +#endif }; /* @@ -1382,6 +1447,13 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } +static void rbd_img_request_get(struct rbd_img_request *img_request) +{ + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_get(&img_request->kref); +} + static bool img_request_child_test(struct rbd_img_request *img_request); static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); @@ -2142,6 +2214,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) img_request->next_completion = which; out: spin_unlock_irq(&img_request->completion_lock); + rbd_img_request_put(img_request); if (!more) rbd_img_request_complete(img_request); @@ -2242,6 +2315,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; + rbd_img_request_get(img_request); if (write_request) { osd_req_op_alloc_hint_init(osd_req, which, @@ -2872,56 +2946,55 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) } /* - * Request sync osd watch/unwatch. The value of "start" determines - * whether a watch request is being initiated or torn down. + * Initiate a watch request, synchronously. */ -static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; int ret; - rbd_assert(start ^ !!rbd_dev->watch_event); - rbd_assert(start ^ !!rbd_dev->watch_request); + rbd_assert(!rbd_dev->watch_event); + rbd_assert(!rbd_dev->watch_request); - if (start) { - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; - rbd_assert(rbd_dev->watch_event != NULL); - } + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + return ret; + + rbd_assert(rbd_dev->watch_event); - ret = -ENOMEM; obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, - OBJ_REQUEST_NODATA); - if (!obj_request) + OBJ_REQUEST_NODATA); + if (!obj_request) { + ret = -ENOMEM; goto out_cancel; + } obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, obj_request); - if (!obj_request->osd_req) - goto out_cancel; + if (!obj_request->osd_req) { + ret = -ENOMEM; + goto out_put; + } - if (start) - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - else - ceph_osdc_unregister_linger_request(osdc, - rbd_dev->watch_request->osd_req); + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, 0, start ? 1 : 0); + rbd_dev->watch_event->cookie, 0, 1); rbd_osd_req_format_write(obj_request); ret = rbd_obj_request_submit(osdc, obj_request); if (ret) - goto out_cancel; + goto out_linger; + ret = rbd_obj_request_wait(obj_request); if (ret) - goto out_cancel; + goto out_linger; + ret = obj_request->result; if (ret) - goto out_cancel; + goto out_linger; /* * A watch request is set to linger, so the underlying osd @@ -2931,36 +3004,84 @@ static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) * it. We'll drop that reference (below) after we've * unregistered it. */ - if (start) { - rbd_dev->watch_request = obj_request; + rbd_dev->watch_request = obj_request; - return 0; + return 0; + +out_linger: + ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req); +out_put: + rbd_obj_request_put(obj_request); +out_cancel: + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; + + return ret; +} + +/* + * Tear down a watch request, synchronously. + */ +static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + int ret; + + rbd_assert(rbd_dev->watch_event); + rbd_assert(rbd_dev->watch_request); + + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) { + ret = -ENOMEM; + goto out_cancel; + } + + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request); + if (!obj_request->osd_req) { + ret = -ENOMEM; + goto out_put; } + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, 0, 0); + rbd_osd_req_format_write(obj_request); + + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out_put; + + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out_put; + + ret = obj_request->result; + if (ret) + goto out_put; + /* We have successfully torn down the watch request */ + ceph_osdc_unregister_linger_request(osdc, + rbd_dev->watch_request->osd_req); rbd_obj_request_put(rbd_dev->watch_request); rbd_dev->watch_request = NULL; + +out_put: + rbd_obj_request_put(obj_request); out_cancel: - /* Cancel the event if we're tearing down, or on error */ ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; - if (obj_request) - rbd_obj_request_put(obj_request); return ret; } -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) -{ - return __rbd_dev_header_watch_sync(rbd_dev, true); -} - static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) { int ret; - ret = __rbd_dev_header_watch_sync(rbd_dev, false); + ret = __rbd_dev_header_unwatch_sync(rbd_dev); if (ret) { rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", ret); @@ -3058,7 +3179,6 @@ static void rbd_request_fn(struct request_queue *q) __releases(q->queue_lock) __acquires(q->queue_lock) { struct rbd_device *rbd_dev = q->queuedata; - bool read_only = rbd_dev->mapping.read_only; struct request *rq; int result; @@ -3094,7 +3214,7 @@ static void rbd_request_fn(struct request_queue *q) if (write_request) { result = -EROFS; - if (read_only) + if (rbd_dev->mapping.read_only) goto end_request; rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); } @@ -4683,6 +4803,38 @@ out_err: } /* + * Return pool id (>= 0) or a negative error code. + */ +static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) +{ + u64 newest_epoch; + unsigned long timeout = rbdc->client->options->mount_timeout * HZ; + int tries = 0; + int ret; + +again: + ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); + if (ret == -ENOENT && tries++ < 1) { + ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); + if (ret < 0) + return ret; + + if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { + ceph_monc_request_next_osdmap(&rbdc->client->monc); + (void) ceph_monc_wait_osdmap(&rbdc->client->monc, + newest_epoch, timeout); + goto again; + } else { + /* the osdmap we have is new enough */ + return -ENOENT; + } + } + + return ret; +} + +/* * An rbd format 2 image has a unique identifier, distinct from the * name given to it by the user. Internally, that identifier is * what's used to specify the names of objects related to the image. @@ -4752,7 +4904,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) image_id = ceph_extract_encoded_string(&p, p + ret, NULL, GFP_NOIO); - ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0; + ret = PTR_ERR_OR_ZERO(image_id); if (!ret) rbd_dev->image_format = 2; } else { @@ -4907,6 +5059,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) if (ret) goto err_out_disk; set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); ret = rbd_bus_add_dev(rbd_dev); if (ret) @@ -5053,7 +5206,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, struct rbd_options *rbd_opts = NULL; struct rbd_spec *spec = NULL; struct rbd_client *rbdc; - struct ceph_osd_client *osdc; bool read_only; int rc = -ENOMEM; @@ -5075,8 +5227,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, } /* pick the pool */ - osdc = &rbdc->client->osdc; - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); + rc = rbd_add_get_pool_id(rbdc, spec->pool_name); if (rc < 0) goto err_out_client; spec->pool_id = (u64)rc; @@ -5387,6 +5538,7 @@ err_out_slab: static void __exit rbd_exit(void) { + ida_destroy(&rbd_dev_id_ida); rbd_sysfs_cleanup(); if (single_major) unregister_blkdev(rbd_major, RBD_DRV_NAME); diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a69dd93d1bd..608532d3f8c 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, req = skreq->req; blk_add_request_payload(req, page, len); - req->buffer = buf; } static void skd_request_fn_not_online(struct request_queue *q); @@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q) break; } skreq->discard_page = 1; + req->completion_data = page; skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { @@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev, (skreq->discard_page == 1)) { pr_debug("%s:%s:%d, free the page!", skdev->name, __func__, __LINE__); - free_page((unsigned long)req->buffer); - req->buffer = NULL; + __free_page(req->completion_data); } if (unlikely(error)) { @@ -3945,15 +3944,14 @@ static int skd_acquire_msix(struct skd_device *skdev) for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) entries[i].entry = i; - rc = pci_enable_msix_range(pdev, entries, - SKD_MIN_MSIX_COUNT, SKD_MAX_MSIX_COUNT); - if (rc < 0) { + rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT); + if (rc) { pr_err("(%s): failed to enable MSI-X %d\n", skd_name(skdev), rc); goto msix_out; } - skdev->msix_count = rc; + skdev->msix_count = SKD_MAX_MSIX_COUNT; skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) * skdev->msix_count, GFP_KERNEL); if (!skdev->msix_entries) { diff --git a/drivers/block/swim.c b/drivers/block/swim.c index b02d53a399f..6b44bbe528b 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q) case READ: err = floppy_read_sectors(fs, blk_rq_pos(req), blk_rq_cur_sectors(req), - req->buffer); + bio_data(req->bio)); break; } done: diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c74f7b56e7c..523ee8fd4c1 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs) swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", req->rq_disk->disk_name, req->cmd, (long)blk_rq_pos(req), blk_rq_sectors(req), - req->buffer); + bio_data(req->bio)); swim3_dbg(" errors=%d current_nr_sectors=%u\n", req->errors, blk_rq_cur_sectors(req)); #endif @@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs) /* Set up 3 dma commands: write preamble, data, postamble */ init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); ++cp; - init_dma(cp, OUTPUT_MORE, req->buffer, 512); + init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512); ++cp; init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); } else { - init_dma(cp, INPUT_LAST, req->buffer, n * 512); + init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512); } ++cp; out_le16(&cp->command, DBDMA_STOP); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6d8a87f252d..f63d358f3d9 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -30,6 +30,9 @@ struct virtio_blk /* The disk structure for the kernel. */ struct gendisk *disk; + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + /* Process context for config space updates */ struct work_struct config_work; @@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq, static inline void virtblk_request_done(struct request *req) { - struct virtblk_req *vbr = req->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); int error = virtblk_result(vbr); if (req->cmd_type == REQ_TYPE_BLOCK_PC) { @@ -144,21 +147,22 @@ static void virtblk_done(struct virtqueue *vq) if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); - spin_unlock_irqrestore(&vblk->vq_lock, flags); /* In case queue is stopped waiting for more buffers. */ if (req_done) - blk_mq_start_stopped_hw_queues(vblk->disk->queue); + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + spin_unlock_irqrestore(&vblk->vq_lock, flags); } static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) { struct virtio_blk *vblk = hctx->queue->queuedata; - struct virtblk_req *vbr = req->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; const bool last = (req->cmd_flags & REQ_END) != 0; int err; + bool notify = false; BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); @@ -202,8 +206,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); if (err) { virtqueue_kick(vblk->vq); - spin_unlock_irqrestore(&vblk->vq_lock, flags); blk_mq_stop_hw_queue(hctx); + spin_unlock_irqrestore(&vblk->vq_lock, flags); /* Out of mem doesn't actually happen, since we fall back * to direct descriptors */ if (err == -ENOMEM || err == -ENOSPC) @@ -211,10 +215,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) return BLK_MQ_RQ_QUEUE_ERROR; } - if (last) - virtqueue_kick(vblk->vq); - + if (last && virtqueue_kick_prepare(vblk->vq)) + notify = true; spin_unlock_irqrestore(&vblk->vq_lock, flags); + + if (notify) + virtqueue_notify(vblk->vq); return BLK_MQ_RQ_QUEUE_OK; } @@ -480,33 +486,27 @@ static const struct device_attribute dev_attr_cache_type_rw = __ATTR(cache_type, S_IRUGO|S_IWUSR, virtblk_cache_type_show, virtblk_cache_type_store); -static struct blk_mq_ops virtio_mq_ops = { - .queue_rq = virtio_queue_rq, - .map_queue = blk_mq_map_queue, - .alloc_hctx = blk_mq_alloc_single_hw_queue, - .free_hctx = blk_mq_free_single_hw_queue, - .complete = virtblk_request_done, -}; - -static struct blk_mq_reg virtio_mq_reg = { - .ops = &virtio_mq_ops, - .nr_hw_queues = 1, - .queue_depth = 0, /* Set in virtblk_probe */ - .numa_node = NUMA_NO_NODE, - .flags = BLK_MQ_F_SHOULD_MERGE, -}; -module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444); - -static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx, - struct request *rq, unsigned int nr) +static int virtblk_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) { struct virtio_blk *vblk = data; - struct virtblk_req *vbr = rq->special; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); sg_init_table(vbr->sg, vblk->sg_elems); return 0; } +static struct blk_mq_ops virtio_mq_ops = { + .queue_rq = virtio_queue_rq, + .map_queue = blk_mq_map_queue, + .complete = virtblk_request_done, + .init_request = virtblk_init_request, +}; + +static unsigned int virtblk_queue_depth; +module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -561,24 +561,34 @@ static int virtblk_probe(struct virtio_device *vdev) } /* Default queue sizing is to fill the ring. */ - if (!virtio_mq_reg.queue_depth) { - virtio_mq_reg.queue_depth = vblk->vq->num_free; + if (!virtblk_queue_depth) { + virtblk_queue_depth = vblk->vq->num_free; /* ... but without indirect descs, we use 2 descs per req */ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) - virtio_mq_reg.queue_depth /= 2; + virtblk_queue_depth /= 2; } - virtio_mq_reg.cmd_size = + + memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); + vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.nr_hw_queues = 1; + vblk->tag_set.queue_depth = virtblk_queue_depth; + vblk->tag_set.numa_node = NUMA_NO_NODE; + vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems; + vblk->tag_set.driver_data = vblk; + + err = blk_mq_alloc_tag_set(&vblk->tag_set); + if (err) + goto out_put_disk; - q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); + q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); if (!q) { err = -ENOMEM; - goto out_put_disk; + goto out_free_tags; } - blk_mq_init_commands(q, virtblk_init_vbr, vblk); - q->queuedata = vblk; virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); @@ -679,6 +689,8 @@ static int virtblk_probe(struct virtio_device *vdev) out_del_disk: del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); +out_free_tags: + blk_mq_free_tag_set(&vblk->tag_set); out_put_disk: put_disk(vblk->disk); out_free_vq: @@ -705,6 +717,8 @@ static void virtblk_remove(struct virtio_device *vdev) del_gendisk(vblk->disk); blk_cleanup_queue(vblk->disk->queue); + blk_mq_free_tag_set(&vblk->tag_set); + /* Stop all the virtqueues. */ vdev->config->reset(vdev); @@ -749,7 +763,7 @@ static int virtblk_restore(struct virtio_device *vdev) vblk->config_enable = true; ret = init_vq(vdev->priv); if (!ret) - blk_mq_start_stopped_hw_queues(vblk->disk->queue); + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); return ret; } diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index be052773ad0..f65b807e323 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -314,7 +314,7 @@ struct xen_blkif { unsigned long long st_rd_sect; unsigned long long st_wr_sect; - wait_queue_head_t waiting_to_free; + struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; }; @@ -361,7 +361,7 @@ struct pending_req { #define xen_blkif_put(_b) \ do { \ if (atomic_dec_and_test(&(_b)->refcnt)) \ - wake_up(&(_b)->waiting_to_free);\ + schedule_work(&(_b)->free_work);\ } while (0) struct phys_req { diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 9a547e6b6eb..3a8b810b498 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -35,12 +35,26 @@ static void connect(struct backend_info *); static int connect_ring(struct backend_info *); static void backend_changed(struct xenbus_watch *, const char **, unsigned int); +static void xen_blkif_free(struct xen_blkif *blkif); +static void xen_vbd_free(struct xen_vbd *vbd); struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be) { return be->dev; } +/* + * The last request could free the device from softirq context and + * xen_blkif_free() can sleep. + */ +static void xen_blkif_deferred_free(struct work_struct *work) +{ + struct xen_blkif *blkif; + + blkif = container_of(work, struct xen_blkif, free_work); + xen_blkif_free(blkif); +} + static int blkback_name(struct xen_blkif *blkif, char *buf) { char *devpath, *devname; @@ -121,7 +135,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) init_completion(&blkif->drain_complete); atomic_set(&blkif->drain, 0); blkif->st_print = jiffies; - init_waitqueue_head(&blkif->waiting_to_free); blkif->persistent_gnts.rb_node = NULL; spin_lock_init(&blkif->free_pages_lock); INIT_LIST_HEAD(&blkif->free_pages); @@ -132,6 +145,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); INIT_LIST_HEAD(&blkif->pending_free); + INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); for (i = 0; i < XEN_BLKIF_REQS; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); @@ -231,7 +245,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, return 0; } -static void xen_blkif_disconnect(struct xen_blkif *blkif) +static int xen_blkif_disconnect(struct xen_blkif *blkif) { if (blkif->xenblkd) { kthread_stop(blkif->xenblkd); @@ -239,9 +253,12 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) blkif->xenblkd = NULL; } - atomic_dec(&blkif->refcnt); - wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); - atomic_inc(&blkif->refcnt); + /* The above kthread_stop() guarantees that at this point we + * don't have any discard_io or other_io requests. So, checking + * for inflight IO is enough. + */ + if (atomic_read(&blkif->inflight) > 0) + return -EBUSY; if (blkif->irq) { unbind_from_irqhandler(blkif->irq, blkif); @@ -252,6 +269,8 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); blkif->blk_rings.common.sring = NULL; } + + return 0; } static void xen_blkif_free(struct xen_blkif *blkif) @@ -259,8 +278,8 @@ static void xen_blkif_free(struct xen_blkif *blkif) struct pending_req *req, *n; int i = 0, j; - if (!atomic_dec_and_test(&blkif->refcnt)) - BUG(); + xen_blkif_disconnect(blkif); + xen_vbd_free(&blkif->vbd); /* Remove all persistent grants and the cache of ballooned pages. */ xen_blkbk_free_caches(blkif); @@ -449,16 +468,15 @@ static int xen_blkbk_remove(struct xenbus_device *dev) be->backend_watch.node = NULL; } + dev_set_drvdata(&dev->dev, NULL); + if (be->blkif) { xen_blkif_disconnect(be->blkif); - xen_vbd_free(&be->blkif->vbd); - xen_blkif_free(be->blkif); - be->blkif = NULL; + xen_blkif_put(be->blkif); } kfree(be->mode); kfree(be); - dev_set_drvdata(&dev->dev, NULL); return 0; } @@ -481,10 +499,15 @@ static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info struct xenbus_device *dev = be->dev; struct xen_blkif *blkif = be->blkif; int err; - int state = 0; + int state = 0, discard_enable; struct block_device *bdev = be->blkif->vbd.bdev; struct request_queue *q = bdev_get_queue(bdev); + err = xenbus_scanf(XBT_NIL, dev->nodename, "discard-enable", "%d", + &discard_enable); + if (err == 1 && !discard_enable) + return; + if (blk_queue_discard(q)) { err = xenbus_printf(xbt, dev->nodename, "discard-granularity", "%u", @@ -700,7 +723,11 @@ static void frontend_changed(struct xenbus_device *dev, * Enforce precondition before potential leak point. * xen_blkif_disconnect() is idempotent. */ - xen_blkif_disconnect(be->blkif); + err = xen_blkif_disconnect(be->blkif); + if (err) { + xenbus_dev_fatal(dev, err, "pending I/O"); + break; + } err = connect_ring(be); if (err) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index efe1b476173..5deb235bd18 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq) } pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%u) buffer:%p [%s]\n", + "(%u/%u) [%s]\n", req, req->cmd, (unsigned long)blk_rq_pos(req), blk_rq_cur_sectors(req), blk_rq_sectors(req), - req->buffer, rq_data_dir(req) ? "write" : "read"); + rq_data_dir(req) ? "write" : "read"); if (blkif_queue_request(req)) { blk_requeue_request(rq, req); @@ -1635,36 +1635,24 @@ blkfront_closing(struct blkfront_info *info) static void blkfront_setup_discard(struct blkfront_info *info) { int err; - char *type; unsigned int discard_granularity; unsigned int discard_alignment; unsigned int discard_secure; - type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL); - if (IS_ERR(type)) - return; - - info->feature_secdiscard = 0; - if (strncmp(type, "phy", 3) == 0) { - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "discard-granularity", "%u", &discard_granularity, - "discard-alignment", "%u", &discard_alignment, - NULL); - if (!err) { - info->feature_discard = 1; - info->discard_granularity = discard_granularity; - info->discard_alignment = discard_alignment; - } - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "discard-secure", "%d", &discard_secure, - NULL); - if (!err) - info->feature_secdiscard = discard_secure; - - } else if (strncmp(type, "file", 4) == 0) - info->feature_discard = 1; - - kfree(type); + info->feature_discard = 1; + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-granularity", "%u", &discard_granularity, + "discard-alignment", "%u", &discard_alignment, + NULL); + if (!err) { + info->discard_granularity = discard_granularity; + info->discard_alignment = discard_alignment; + } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-secure", "%d", &discard_secure, + NULL); + if (!err) + info->feature_secdiscard = !!discard_secure; } static int blkfront_setup_indirect(struct blkfront_info *info) diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 1393b8871a2..ab3ea62e5df 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace) rq_data_dir(req)); ace->req = req; - ace->data_ptr = req->buffer; + ace->data_ptr = bio_data(req->bio); ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); @@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace) * blk_rq_sectors(ace->req), * blk_rq_cur_sectors(ace->req)); */ - ace->data_ptr = ace->req->buffer; + ace->data_ptr = bio_data(ace->req->bio); ace->data_count = blk_rq_cur_sectors(ace->req) * 16; ace_fsm_yieldirq(ace); break; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 27de5046708..968f9e52eff 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q) while (len) { unsigned long addr = start & Z2RAM_CHUNKMASK; unsigned long size = Z2RAM_CHUNKSIZE - addr; + void *buffer = bio_data(req->bio); + if (len < size) size = len; addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; if (rq_data_dir(req) == READ) - memcpy(req->buffer, (char *)addr, size); + memcpy(buffer, (char *)addr, size); else - memcpy((char *)addr, req->buffer, size); + memcpy((char *)addr, buffer, size); start += size; len -= size; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9849b5233bf..48eccb35018 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -572,10 +572,10 @@ static void zram_bio_discard(struct zram *zram, u32 index, * skipping this logical block is appropriate here. */ if (offset) { - if (n < offset) + if (n <= (PAGE_SIZE - offset)) return; - n -= offset; + n -= (PAGE_SIZE - offset); index++; } |