Diffstat (limited to 'drivers')
188 files changed, 7606 insertions, 3037 deletions
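A recurring pattern in the drbd_actlog.c changes below is the consolidation of drbd_set_in_sync(), drbd_set_out_of_sync() and drbd_rs_failed_io() into one mode-driven helper, update_sync_bits(), which walks a bitmap bit range in chunks that never cross a resync-extent boundary (so per-extent counters like rs_left can be updated per chunk). The following standalone sketch illustrates only that chunked walk; the mask value and the chunk handler are illustrative stand-ins, not DRBD's symbols.

	/*
	 * Standalone sketch of the bit-range walk done by update_sync_bits()
	 * in the diff below: process [sbnr, ebnr] in chunks that never cross
	 * a resync-extent boundary.  BLOCKS_PER_EXT_MASK and process_chunk()
	 * are assumptions for illustration, not the kernel's names.
	 */
	#include <stdio.h>

	#define BLOCKS_PER_EXT_MASK 0xfffUL	/* assumed: bits per extent - 1 */

	static unsigned long process_chunk(unsigned long s, unsigned long e)
	{
		printf("chunk: %lu..%lu (extent base %lu)\n",
		       s, e, s & ~BLOCKS_PER_EXT_MASK);
		return e - s + 1;	/* bits handled in this extent */
	}

	int main(void)
	{
		unsigned long sbnr = 4090, ebnr = 8200, count = 0;

		while (sbnr <= ebnr) {
			/* last bit within the current extent, capped at ebnr */
			unsigned long tbnr = ebnr < (sbnr | BLOCKS_PER_EXT_MASK)
				? ebnr : (sbnr | BLOCKS_PER_EXT_MASK);
			count += process_chunk(sbnr, tbnr);
			sbnr = tbnr + 1;
		}
		printf("total bits: %lu\n", count);	/* 4111 for this range */
		return 0;
	}

Typically the loop executes exactly once, since most requests fall inside a single 16MB resync extent; the chunking only matters for ranges that straddle an extent border.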
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index 8b450338075..4464e353c1e 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile @@ -3,5 +3,6 @@ drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o drbd-y += drbd_main.o drbd_strings.o drbd_nl.o drbd-y += drbd_interval.o drbd_state.o drbd-y += drbd_nla.o +drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 05a1780ffa8..d26a3fa6368 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -92,34 +92,26 @@ struct __packed al_transaction_on_disk { __be32 context[AL_CONTEXT_PER_TRANSACTION]; }; -struct update_odbm_work { - struct drbd_work w; - struct drbd_device *device; - unsigned int enr; -}; - -struct update_al_work { - struct drbd_work w; - struct drbd_device *device; - struct completion event; - int err; -}; - - -void *drbd_md_get_buffer(struct drbd_device *device) +void *drbd_md_get_buffer(struct drbd_device *device, const char *intent) { int r; wait_event(device->misc_wait, - (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 || + (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || device->state.disk <= D_FAILED); - return r ? NULL : page_address(device->md_io_page); + if (r) + return NULL; + + device->md_io.current_use = intent; + device->md_io.start_jif = jiffies; + device->md_io.submit_jif = device->md_io.start_jif - 1; + return page_address(device->md_io.page); } void drbd_md_put_buffer(struct drbd_device *device) { - if (atomic_dec_and_test(&device->md_io_in_use)) + if (atomic_dec_and_test(&device->md_io.in_use)) wake_up(&device->misc_wait); } @@ -145,10 +137,11 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b static int _drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, - struct page *page, sector_t sector, - int rw, int size) + sector_t sector, int rw) { struct bio *bio; + /* we do all our meta data IO in aligned 4k blocks. */ + const int size = 4096; int err; device->md_io.done = 0; @@ -156,15 +149,15 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags)) rw |= REQ_FUA | REQ_FLUSH; - rw |= REQ_SYNC; + rw |= REQ_SYNC | REQ_NOIDLE; bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_iter.bi_sector = sector; err = -EIO; - if (bio_add_page(bio, page, size, 0) != size) + if (bio_add_page(bio, device->md_io.page, size, 0) != size) goto out; - bio->bi_private = &device->md_io; + bio->bi_private = device; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; @@ -179,7 +172,8 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, } bio_get(bio); /* one bio_put() is in the completion handler */ - atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ + atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ + device->md_io.submit_jif = jiffies; if (drbd_insert_fault(device, (rw & WRITE) ? 
DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else @@ -197,9 +191,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd sector_t sector, int rw) { int err; - struct page *iop = device->md_io_page; - - D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1); + D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); BUG_ON(!bdev->md_bdev); @@ -214,8 +206,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); - /* we do all our meta data IO in aligned 4k blocks. */ - err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096); + err = _drbd_md_sync_page_io(device, bdev, sector, rw); if (err) { drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); @@ -297,26 +288,12 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval * return need_transaction; } -static int al_write_transaction(struct drbd_device *device, bool delegate); - -/* When called through generic_make_request(), we must delegate - * activity log I/O to the worker thread: a further request - * submitted via generic_make_request() within the same task - * would be queued on current->bio_list, and would only start - * after this function returns (see generic_make_request()). - * - * However, if we *are* the worker, we must not delegate to ourselves. - */ +static int al_write_transaction(struct drbd_device *device); -/* - * @delegate: delegate activity log I/O to the worker thread - */ -void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate) +void drbd_al_begin_io_commit(struct drbd_device *device) { bool locked = false; - BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task); - /* Serialize multiple transactions. * This uses test_and_set_bit, memory barrier is implicit. */ @@ -335,7 +312,7 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate) rcu_read_unlock(); if (write_al_updates) - al_write_transaction(device, delegate); + al_write_transaction(device); spin_lock_irq(&device->al_lock); /* FIXME if (err) @@ -352,12 +329,10 @@ void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate) /* * @delegate: delegate activity log I/O to the worker thread */ -void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate) +void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) { - BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task); - if (drbd_al_begin_io_prepare(device, i)) - drbd_al_begin_io_commit(device, delegate); + drbd_al_begin_io_commit(device); } int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) @@ -380,8 +355,19 @@ int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval * /* We want all necessary updates for a given request within the same transaction * We could first check how many updates are *actually* needed, * and use that instead of the worst-case nr_al_extents */ - if (available_update_slots < nr_al_extents) - return -EWOULDBLOCK; + if (available_update_slots < nr_al_extents) { + /* Too many activity log extents are currently "hot". + * + * If we have accumulated pending changes already, + * we made progress. 
+ * + * If we cannot get even a single pending change through, + * stop the fast path until we made some progress, + * or requests to "cold" extents could be starved. */ + if (!al->pending_changes) + __set_bit(__LC_STARVING, &device->act_log->flags); + return -ENOBUFS; + } /* Is resync active in this area? */ for (enr = first; enr <= last; enr++) { @@ -452,15 +438,6 @@ static unsigned int al_extent_to_bm_page(unsigned int al_enr) (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); } -static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) -{ - return rs_enr >> - /* bit to page */ - ((PAGE_SHIFT + 3) - - /* resync extent number to bit */ - (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); -} - static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) { const unsigned int stripes = device->ldev->md.al_stripes; @@ -479,8 +456,7 @@ static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) return device->ldev->md.md_offset + device->ldev->md.al_offset + t; } -static int -_al_write_transaction(struct drbd_device *device) +int al_write_transaction(struct drbd_device *device) { struct al_transaction_on_disk *buffer; struct lc_element *e; @@ -505,7 +481,8 @@ _al_write_transaction(struct drbd_device *device) return -EIO; } - buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */ + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); if (!buffer) { drbd_err(device, "disk failed while waiting for md_io buffer\n"); put_ldev(device); @@ -590,38 +567,6 @@ _al_write_transaction(struct drbd_device *device) return err; } - -static int w_al_write_transaction(struct drbd_work *w, int unused) -{ - struct update_al_work *aw = container_of(w, struct update_al_work, w); - struct drbd_device *device = aw->device; - int err; - - err = _al_write_transaction(device); - aw->err = err; - complete(&aw->event); - - return err != -EIO ? err : 0; -} - -/* Calls from worker context (see w_restart_disk_io()) need to write the - transaction directly. Others came through generic_make_request(), - those need to delegate it to the worker. 
*/ -static int al_write_transaction(struct drbd_device *device, bool delegate) -{ - if (delegate) { - struct update_al_work al_work; - init_completion(&al_work.event); - al_work.w.cb = w_al_write_transaction; - al_work.device = device; - drbd_queue_work_front(&first_peer_device(device)->connection->sender_work, - &al_work.w); - wait_for_completion(&al_work.event); - return al_work.err; - } else - return _al_write_transaction(device); -} - static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) { int rv; @@ -682,72 +627,56 @@ int drbd_initialize_al(struct drbd_device *device, void *buffer) return 0; } -static int w_update_odbm(struct drbd_work *w, int unused) -{ - struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); - struct drbd_device *device = udw->device; - struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; - - if (!get_ldev(device)) { - if (__ratelimit(&drbd_ratelimit_state)) - drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n"); - kfree(udw); - return 0; - } - - drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr)); - put_ldev(device); - - kfree(udw); - - if (drbd_bm_total_weight(device) <= device->rs_failed) { - switch (device->state.conn) { - case C_SYNC_SOURCE: case C_SYNC_TARGET: - case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: - drbd_resync_finished(device); - default: - /* nothing to do */ - break; - } - } - drbd_bcast_event(device, &sib); - - return 0; -} - +static const char *drbd_change_sync_fname[] = { + [RECORD_RS_FAILED] = "drbd_rs_failed_io", + [SET_IN_SYNC] = "drbd_set_in_sync", + [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" +}; /* ATTENTION. The AL's extents are 4MB each, while the extents in the * resync LRU-cache are 16MB each. * The caller of this function has to hold an get_ldev() reference. * + * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), + * potentially pulling in (and recounting the corresponding bits) + * this resync extent into the resync extent lru cache. + * + * Returns whether all bits have been cleared for this resync extent, + * precisely: (rs_left <= rs_failed) + * * TODO will be obsoleted once we have a caching lru of the on disk bitmap */ -static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector, - int count, int success) +static bool update_rs_extent(struct drbd_device *device, + unsigned int enr, int count, + enum update_sync_bits_mode mode) { struct lc_element *e; - struct update_odbm_work *udw; - - unsigned int enr; D_ASSERT(device, atomic_read(&device->local_cnt)); - /* I simply assume that a sector/size pair never crosses - * a 16 MB extent border. (Currently this is true...) */ - enr = BM_SECT_TO_EXT(sector); - - e = lc_get(device->resync, enr); + /* When setting out-of-sync bits, + * we don't need it cached (lc_find). + * But if it is present in the cache, + * we should update the cached bit count. + * Otherwise, that extent should be in the resync extent lru cache + * already -- or we want to pull it in if necessary -- (lc_get), + * then update and check rs_left and rs_failed. */ + if (mode == SET_OUT_OF_SYNC) + e = lc_find(device->resync, enr); + else + e = lc_get(device->resync, enr); if (e) { struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); if (ext->lce.lc_number == enr) { - if (success) + if (mode == SET_IN_SYNC) ext->rs_left -= count; + else if (mode == SET_OUT_OF_SYNC) + ext->rs_left += count; else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - drbd_warn(device, "BAD! 
sector=%llus enr=%u rs_left=%d " + drbd_warn(device, "BAD! enr=%u rs_left=%d " "rs_failed=%d count=%d cstate=%s\n", - (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, ext->rs_failed, count, drbd_conn_str(device->state.conn)); @@ -781,34 +710,27 @@ static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t secto ext->lce.lc_number, ext->rs_failed); } ext->rs_left = rs_left; - ext->rs_failed = success ? 0 : count; + ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; /* we don't keep a persistent log of the resync lru, * we can commit any change right away. */ lc_committed(device->resync); } - lc_put(device->resync, &ext->lce); + if (mode != SET_OUT_OF_SYNC) + lc_put(device->resync, &ext->lce); /* no race, we are within the al_lock! */ - if (ext->rs_left == ext->rs_failed) { + if (ext->rs_left <= ext->rs_failed) { ext->rs_failed = 0; - - udw = kmalloc(sizeof(*udw), GFP_ATOMIC); - if (udw) { - udw->enr = ext->lce.lc_number; - udw->w.cb = w_update_odbm; - udw->device = device; - drbd_queue_work_front(&first_peer_device(device)->connection->sender_work, - &udw->w); - } else { - drbd_warn(device, "Could not kmalloc an udw\n"); - } + return true; } - } else { + } else if (mode != SET_OUT_OF_SYNC) { + /* be quiet if lc_find() did not find it. */ drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", device->resync_locked, device->resync->nr_elements, device->resync->flags); } + return false; } void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) @@ -827,105 +749,105 @@ void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go } } -/* clear the bit corresponding to the piece of storage in question: - * size byte of data starting from sector. Only clear a bits of the affected - * one ore more _aligned_ BM_BLOCK_SIZE blocks. - * - * called by worker on C_SYNC_TARGET and receiver on SyncSource. - * - */ -void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size, - const char *file, const unsigned int line) +/* It is called lazy update, so don't do write-out too often. */ +static bool lazy_bitmap_update_due(struct drbd_device *device) { - /* Is called from worker and receiver context _only_ */ - unsigned long sbnr, ebnr, lbnr; - unsigned long count = 0; - sector_t esector, nr_sectors; - int wake_up = 0; - unsigned long flags; + return time_after(jiffies, device->rs_last_bcast + 2*HZ); +} - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { - drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", - (unsigned long long)sector, size); +static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) +{ + if (rs_done) + set_bit(RS_DONE, &device->flags); + /* and also set RS_PROGRESS below */ + else if (!lazy_bitmap_update_due(device)) return; - } - - if (!get_ldev(device)) - return; /* no disk, no metadata, no bitmap to clear bits in */ - - nr_sectors = drbd_get_capacity(device->this_bdev); - esector = sector + (size >> 9) - 1; - - if (!expect(sector < nr_sectors)) - goto out; - if (!expect(esector < nr_sectors)) - esector = nr_sectors - 1; - - lbnr = BM_SECT_TO_BIT(nr_sectors-1); - - /* we clear it (in sync). - * round up start sector, round down end sector. 
we make sure we only - * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ - if (unlikely(esector < BM_SECT_PER_BIT-1)) - goto out; - if (unlikely(esector == (nr_sectors-1))) - ebnr = lbnr; - else - ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); - sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); - if (sbnr > ebnr) - goto out; + drbd_device_post_work(device, RS_PROGRESS); +} +static int update_sync_bits(struct drbd_device *device, + unsigned long sbnr, unsigned long ebnr, + enum update_sync_bits_mode mode) +{ /* - * ok, (capacity & 7) != 0 sometimes, but who cares... - * we count rs_{total,left} in bits, not sectors. + * We keep a count of set bits per resync-extent in the ->rs_left + * caching member, so we need to loop and work within the resync extent + * alignment. Typically this loop will execute exactly once. */ - count = drbd_bm_clear_bits(device, sbnr, ebnr); - if (count) { - drbd_advance_rs_marks(device, drbd_bm_total_weight(device)); - spin_lock_irqsave(&device->al_lock, flags); - drbd_try_clear_on_disk_bm(device, sector, count, true); - spin_unlock_irqrestore(&device->al_lock, flags); - - /* just wake_up unconditional now, various lc_chaged(), - * lc_put() in drbd_try_clear_on_disk_bm(). */ - wake_up = 1; + unsigned long flags; + unsigned long count = 0; + unsigned int cleared = 0; + while (sbnr <= ebnr) { + /* set temporary boundary bit number to last bit number within + * the resync extent of the current start bit number, + * but cap at provided end bit number */ + unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); + unsigned long c; + + if (mode == RECORD_RS_FAILED) + /* Only called from drbd_rs_failed_io(), bits + * supposedly still set. Recount, maybe some + * of the bits have been successfully cleared + * by application IO meanwhile. + */ + c = drbd_bm_count_bits(device, sbnr, tbnr); + else if (mode == SET_IN_SYNC) + c = drbd_bm_clear_bits(device, sbnr, tbnr); + else /* if (mode == SET_OUT_OF_SYNC) */ + c = drbd_bm_set_bits(device, sbnr, tbnr); + + if (c) { + spin_lock_irqsave(&device->al_lock, flags); + cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); + spin_unlock_irqrestore(&device->al_lock, flags); + count += c; + } + sbnr = tbnr + 1; } -out: - put_ldev(device); - if (wake_up) + if (count) { + if (mode == SET_IN_SYNC) { + unsigned long still_to_go = drbd_bm_total_weight(device); + bool rs_is_done = (still_to_go <= device->rs_failed); + drbd_advance_rs_marks(device, still_to_go); + if (cleared || rs_is_done) + maybe_schedule_on_disk_bitmap_update(device, rs_is_done); + } else if (mode == RECORD_RS_FAILED) + device->rs_failed += count; wake_up(&device->al_wait); + } + return count; } -/* - * this is intended to set one request worth of data out of sync. - * affects at least 1 bit, - * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on C_SYNC_TARGET and receiver on SyncSource. * - * called by tl_clear and drbd_send_dblock (==drbd_make_request). - * so this can be _any_ process. 
*/ -int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size, - const char *file, const unsigned int line) +int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode, + const char *file, const unsigned int line) { - unsigned long sbnr, ebnr, flags; + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count = 0; sector_t esector, nr_sectors; - unsigned int enr, count = 0; - struct lc_element *e; - /* this should be an empty REQ_FLUSH */ - if (size == 0) + /* This would be an empty REQ_FLUSH, be silent. */ + if ((mode == SET_OUT_OF_SYNC) && size == 0) return 0; - if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { - drbd_err(device, "sector: %llus, size: %d\n", - (unsigned long long)sector, size); + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { + drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", + drbd_change_sync_fname[mode], + (unsigned long long)sector, size); return 0; } if (!get_ldev(device)) - return 0; /* no disk, no metadata, no bitmap to set bits in */ + return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ nr_sectors = drbd_get_capacity(device->this_bdev); esector = sector + (size >> 9) - 1; @@ -935,25 +857,28 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size if (!expect(esector < nr_sectors)) esector = nr_sectors - 1; - /* we set it out of sync, - * we do not need to round anything here */ - sbnr = BM_SECT_TO_BIT(sector); - ebnr = BM_SECT_TO_BIT(esector); - - /* ok, (capacity & 7) != 0 sometimes, but who cares... - * we count rs_{total,left} in bits, not sectors. */ - spin_lock_irqsave(&device->al_lock, flags); - count = drbd_bm_set_bits(device, sbnr, ebnr); + lbnr = BM_SECT_TO_BIT(nr_sectors-1); - enr = BM_SECT_TO_EXT(sector); - e = lc_find(device->resync, enr); - if (e) - lc_entry(e, struct bm_extent, lce)->rs_left += count; - spin_unlock_irqrestore(&device->al_lock, flags); + if (mode == SET_IN_SYNC) { + /* Round up start sector, round down end sector. We make sure + * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + goto out; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + } else { + /* We set it out of sync, or record resync failure. + * Should not round anything here. */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + } + count = update_sync_bits(device, sbnr, ebnr, mode); out: put_ldev(device); - return count; } @@ -1075,6 +1000,15 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) struct lc_element *e; struct bm_extent *bm_ext; int i; + bool throttle = drbd_rs_should_slow_down(device, sector, true); + + /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, + * not yet BME_LOCKED) extent needs to be kicked out explicitly if we + * need to throttle. There is at most one such half-locked extent, + * which is remembered in resync_wenr. 
*/ + + if (throttle && device->resync_wenr != enr) + return -EAGAIN; spin_lock_irq(&device->al_lock); if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { @@ -1098,8 +1032,10 @@ int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); clear_bit(BME_NO_WRITES, &bm_ext->flags); device->resync_wenr = LC_FREE; - if (lc_put(device->resync, &bm_ext->lce) == 0) + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; device->resync_locked--; + } wake_up(&device->al_wait); } else { drbd_alert(device, "LOGIC BUG\n"); @@ -1161,8 +1097,20 @@ proceed: return 0; try_again: - if (bm_ext) - device->resync_wenr = enr; + if (bm_ext) { + if (throttle) { + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; + device->resync_locked--; + } + wake_up(&device->al_wait); + } else + device->resync_wenr = enr; + } spin_unlock_irq(&device->al_lock); return -EAGAIN; } @@ -1270,69 +1218,3 @@ int drbd_rs_del_all(struct drbd_device *device) return 0; } - -/** - * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks - * @device: DRBD device. - * @sector: The sector number. - * @size: Size of failed IO operation, in byte. - */ -void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) -{ - /* Is called from worker and receiver context _only_ */ - unsigned long sbnr, ebnr, lbnr; - unsigned long count; - sector_t esector, nr_sectors; - int wake_up = 0; - - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { - drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", - (unsigned long long)sector, size); - return; - } - nr_sectors = drbd_get_capacity(device->this_bdev); - esector = sector + (size >> 9) - 1; - - if (!expect(sector < nr_sectors)) - return; - if (!expect(esector < nr_sectors)) - esector = nr_sectors - 1; - - lbnr = BM_SECT_TO_BIT(nr_sectors-1); - - /* - * round up start sector, round down end sector. we make sure we only - * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ - if (unlikely(esector < BM_SECT_PER_BIT-1)) - return; - if (unlikely(esector == (nr_sectors-1))) - ebnr = lbnr; - else - ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); - sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); - - if (sbnr > ebnr) - return; - - /* - * ok, (capacity & 7) != 0 sometimes, but who cares... - * we count rs_{total,left} in bits, not sectors. - */ - spin_lock_irq(&device->al_lock); - count = drbd_bm_count_bits(device, sbnr, ebnr); - if (count) { - device->rs_failed += count; - - if (get_ldev(device)) { - drbd_try_clear_on_disk_bm(device, sector, count, false); - put_ldev(device); - } - - /* just wake_up unconditional now, various lc_chaged(), - * lc_put() in drbd_try_clear_on_disk_bm(). */ - wake_up = 1; - } - spin_unlock_irq(&device->al_lock); - if (wake_up) - wake_up(&device->al_wait); -} diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 1aa29f8fdfe..426c97aef90 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -22,6 +22,8 @@ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bitops.h> #include <linux/vmalloc.h> #include <linux/string.h> @@ -353,9 +355,8 @@ static void bm_free_pages(struct page **pages, unsigned long number) for (i = 0; i < number; i++) { if (!pages[i]) { - printk(KERN_ALERT "drbd: bm_free_pages tried to free " - "a NULL pointer; i=%lu n=%lu\n", - i, number); + pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n", + i, number); continue; } __free_page(pages[i]); @@ -592,7 +593,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) end = offset + len; if (end > b->bm_words) { - printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); + pr_alert("bm_memset end > bm_words\n"); return; } @@ -602,7 +603,7 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) p_addr = bm_map_pidx(b, idx); bm = p_addr + MLPP(offset); if (bm+do_now > p_addr + LWPP) { - printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", + pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", p_addr, bm, (int)do_now); } else memset(bm, c, do_now * sizeof(long)); @@ -927,22 +928,14 @@ void drbd_bm_clear_all(struct drbd_device *device) spin_unlock_irq(&b->bm_lock); } -struct bm_aio_ctx { - struct drbd_device *device; - atomic_t in_flight; - unsigned int done; - unsigned flags; -#define BM_AIO_COPY_PAGES 1 -#define BM_AIO_WRITE_HINTED 2 -#define BM_WRITE_ALL_PAGES 4 - int error; - struct kref kref; -}; - -static void bm_aio_ctx_destroy(struct kref *kref) +static void drbd_bm_aio_ctx_destroy(struct kref *kref) { - struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref); + unsigned long flags; + spin_lock_irqsave(&ctx->device->resource->req_lock, flags); + list_del(&ctx->list); + spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags); put_ldev(ctx->device); kfree(ctx); } @@ -950,7 +943,7 @@ static void bm_aio_ctx_destroy(struct kref *kref) /* bv_page may be a copy, or may be the original */ static void bm_async_io_complete(struct bio *bio, int error) { - struct bm_aio_ctx *ctx = bio->bi_private; + struct drbd_bm_aio_ctx *ctx = bio->bi_private; struct drbd_device *device = ctx->device; struct drbd_bitmap *b = device->bitmap; unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); @@ -993,17 +986,18 @@ static void bm_async_io_complete(struct bio *bio, int error) if (atomic_dec_and_test(&ctx->in_flight)) { ctx->done = 1; wake_up(&device->misc_wait); - kref_put(&ctx->kref, &bm_aio_ctx_destroy); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); } } -static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) +static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { struct bio *bio = bio_alloc_drbd(GFP_NOIO); struct drbd_device *device = ctx->device; struct drbd_bitmap *b = device->bitmap; struct page *page; unsigned int len; + unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE; sector_t on_disk_sector = device->ldev->md.md_offset + device->ldev->md.bm_offset; @@ -1049,9 +1043,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must /* * bm_rw: read/write the whole bitmap from/to its on disk location. 
*/ -static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) +static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { - struct bm_aio_ctx *ctx; + struct drbd_bm_aio_ctx *ctx; struct drbd_bitmap *b = device->bitmap; int num_pages, i, count = 0; unsigned long now; @@ -1067,12 +1061,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la * as we submit copies of pages anyways. */ - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO); if (!ctx) return -ENOMEM; - *ctx = (struct bm_aio_ctx) { + *ctx = (struct drbd_bm_aio_ctx) { .device = device, + .start_jif = jiffies, .in_flight = ATOMIC_INIT(1), .done = 0, .flags = flags, @@ -1080,15 +1075,21 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la .kref = { ATOMIC_INIT(2) }, }; - if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); kfree(ctx); return -ENODEV; } + /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from + drbd_adm_attach(), after device->ldev was assigned. */ - if (!ctx->flags) + if (0 == (ctx->flags & ~BM_AIO_READ)) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&ctx->list, &device->pending_bitmap_io); + spin_unlock_irq(&device->resource->req_lock); + num_pages = b->bm_number_of_pages; now = jiffies; @@ -1098,13 +1099,13 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la /* ignore completely unchanged pages */ if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) break; - if (rw & WRITE) { + if (!(flags & BM_AIO_READ)) { if ((flags & BM_AIO_WRITE_HINTED) && !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, &page_private(b->bm_pages[i]))) continue; - if (!(flags & BM_WRITE_ALL_PAGES) && + if (!(flags & BM_AIO_WRITE_ALL_PAGES) && bm_test_page_unchanged(b->bm_pages[i])) { dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); continue; @@ -1118,7 +1119,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la } } atomic_inc(&ctx->in_flight); - bm_page_io_async(ctx, i, rw); + bm_page_io_async(ctx, i); ++count; cond_resched(); } @@ -1134,12 +1135,12 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la if (!atomic_dec_and_test(&ctx->in_flight)) wait_until_done_or_force_detached(device, device->ldev, &ctx->done); else - kref_put(&ctx->kref, &bm_aio_ctx_destroy); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); /* summary for global bitmap IO */ if (flags == 0) drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", - rw == WRITE ? "WRITE" : "READ", + (flags & BM_AIO_READ) ? "READ" : "WRITE", count, jiffies - now); if (ctx->error) { @@ -1152,20 +1153,18 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la err = -EIO; /* Disk timeout/force-detach during IO... 
*/ now = jiffies; - if (rw == WRITE) { - drbd_md_flush(device); - } else /* rw == READ */ { + if (flags & BM_AIO_READ) { b->bm_set = bm_count_bits(b); drbd_info(device, "recounting of set bits took additional %lu jiffies\n", jiffies - now); } now = b->bm_set; - if (flags == 0) + if ((flags & ~BM_AIO_READ) == 0) drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); - kref_put(&ctx->kref, &bm_aio_ctx_destroy); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); return err; } @@ -1175,7 +1174,7 @@ static int bm_rw(struct drbd_device *device, int rw, unsigned flags, unsigned la */ int drbd_bm_read(struct drbd_device *device) __must_hold(local) { - return bm_rw(device, READ, 0, 0); + return bm_rw(device, BM_AIO_READ, 0); } /** @@ -1186,7 +1185,7 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local) */ int drbd_bm_write(struct drbd_device *device) __must_hold(local) { - return bm_rw(device, WRITE, 0, 0); + return bm_rw(device, 0, 0); } /** @@ -1197,7 +1196,17 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local) */ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) { - return bm_rw(device, WRITE, BM_WRITE_ALL_PAGES, 0); + return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0); +} + +/** + * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed. + * @device: DRBD device. + * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages + */ +int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local) +{ + return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx); } /** @@ -1213,7 +1222,7 @@ int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) */ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) { - return bm_rw(device, WRITE, BM_AIO_COPY_PAGES, 0); + return bm_rw(device, BM_AIO_COPY_PAGES, 0); } /** @@ -1222,62 +1231,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) */ int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local) { - return bm_rw(device, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); -} - -/** - * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap - * @device: DRBD device. - * @idx: bitmap page index - * - * We don't want to special case on logical_block_size of the backend device, - * so we submit PAGE_SIZE aligned pieces. - * Note that on "most" systems, PAGE_SIZE is 4k. - * - * In case this becomes an issue on systems with larger PAGE_SIZE, - * we may want to change this again to write 4k aligned 4k pieces. 
- */ -int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local) -{ - struct bm_aio_ctx *ctx; - int err; - - if (bm_test_page_unchanged(device->bitmap->bm_pages[idx])) { - dynamic_drbd_dbg(device, "skipped bm page write for idx %u\n", idx); - return 0; - } - - ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); - if (!ctx) - return -ENOMEM; - - *ctx = (struct bm_aio_ctx) { - .device = device, - .in_flight = ATOMIC_INIT(1), - .done = 0, - .flags = BM_AIO_COPY_PAGES, - .error = 0, - .kref = { ATOMIC_INIT(2) }, - }; - - if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ - drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); - kfree(ctx); - return -ENODEV; - } - - bm_page_io_async(ctx, idx, WRITE_SYNC); - wait_until_done_or_force_detached(device, device->ldev, &ctx->done); - - if (ctx->error) - drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); - /* that causes us to detach, so the in memory bitmap will be - * gone in a moment as well. */ - - device->bm_writ_cnt++; - err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error; - kref_put(&ctx->kref, &bm_aio_ctx_destroy); - return err; + return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); } /* NOTE diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c new file mode 100644 index 00000000000..5c20b18540b --- /dev/null +++ b/drivers/block/drbd/drbd_debugfs.c @@ -0,0 +1,958 @@ +#define pr_fmt(fmt) "drbd debugfs: " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/jiffies.h> +#include <linux/list.h> + +#include "drbd_int.h" +#include "drbd_req.h" +#include "drbd_debugfs.h" + + +/********************************************************************** + * Whenever you change the file format, remember to bump the version. * + **********************************************************************/ + +static struct dentry *drbd_debugfs_root; +static struct dentry *drbd_debugfs_version; +static struct dentry *drbd_debugfs_resources; +static struct dentry *drbd_debugfs_minors; + +static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt) +{ + if (valid) + seq_printf(m, "\t%d", jiffies_to_msecs(dt)); + else + seq_printf(m, "\t-"); +} + +static void __seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name, const char *unset_name) +{ + if (is_set && set_name) { + seq_putc(m, *sep); + seq_puts(m, set_name); + *sep = '|'; + } else if (!is_set && unset_name) { + seq_putc(m, *sep); + seq_puts(m, unset_name); + *sep = '|'; + } +} + +static void seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name) +{ + __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL); +} + +/* pretty print enum drbd_req_state_bits req->rq_state */ +static void seq_print_request_state(struct seq_file *m, struct drbd_request *req) +{ + unsigned int s = req->rq_state; + char sep = ' '; + seq_printf(m, "\t0x%08x", s); + seq_printf(m, "\tmaster: %s", req->master_bio ? 
"pending" : "completed"); + + /* RQ_WRITE ignored, already reported */ + seq_puts(m, "\tlocal:"); + seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL"); + seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed"); + seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + /* for_each_connection ... */ + seq_printf(m, "\tnet:"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued"); + seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent"); + seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done"); + seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis"); + seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + seq_printf(m, " :"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B"); + seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr"); + if (sep == ' ') + seq_puts(m, " -"); + seq_printf(m, "\n"); +} + +static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + /* change anything here, fixup header below! */ + unsigned int s = req->rq_state; + +#define RQ_HDR_1 "epoch\tsector\tsize\trw" + seq_printf(m, "0x%x\t%llu\t%u\t%s", + req->epoch, + (unsigned long long)req->i.sector, req->i.size >> 9, + (s & RQ_WRITE) ? "W" : "R"); + +#define RQ_HDR_2 "\tstart\tin AL\tsubmit" + seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif)); + seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif); + seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif); + +#define RQ_HDR_3 "\tsent\tacked\tdone" + seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif); + seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif); + seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif); + +#define RQ_HDR_4 "\tstate\n" + seq_print_request_state(m, req); +} +#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4 + +static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr); + seq_print_one_request(m, req, now); +} + +static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + struct drbd_md_io tmp; + /* In theory this is racy, + * in the sense that there could have been a + * drbd_md_put_buffer(); drbd_md_get_buffer(); + * between accessing these members here. 
*/ + tmp = device->md_io; + if (atomic_read(&tmp.in_use)) { + seq_printf(m, "%u\t%u\t%d\t", + device->minor, device->vnr, + jiffies_to_msecs(now - tmp.start_jif)); + if (time_before(tmp.submit_jif, tmp.start_jif)) + seq_puts(m, "-\t"); + else + seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif)); + seq_printf(m, "%s\n", tmp.current_use); + } + } + rcu_read_unlock(); +} + +static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tage\t#waiting\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + unsigned long jif; + struct drbd_request *req; + int n = atomic_read(&device->ap_actlog_cnt); + if (n) { + spin_lock_irq(&device->resource->req_lock); + req = list_first_entry_or_null(&device->pending_master_completion[1], + struct drbd_request, req_pending_master_completion); + /* if the oldest request does not wait for the activity log + * it is not interesting for us here */ + if (req && !(req->rq_state & RQ_IN_ACT_LOG)) + jif = req->start_jif; + else + req = NULL; + spin_unlock_irq(&device->resource->req_lock); + } + if (n) { + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + if (req) + seq_printf(m, "%u\t", jiffies_to_msecs(now - jif)); + else + seq_puts(m, "-\t"); + seq_printf(m, "%u\n", n); + } + } + rcu_read_unlock(); +} + +static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now) +{ + struct drbd_bm_aio_ctx *ctx; + unsigned long start_jif; + unsigned int in_flight; + unsigned int flags; + spin_lock_irq(&device->resource->req_lock); + ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list); + if (ctx && ctx->done) + ctx = NULL; + if (ctx) { + start_jif = ctx->start_jif; + in_flight = atomic_read(&ctx->in_flight); + flags = ctx->flags; + } + spin_unlock_irq(&device->resource->req_lock); + if (ctx) { + seq_printf(m, "%u\t%u\t%c\t%u\t%u\n", + device->minor, device->vnr, + (flags & BM_AIO_READ) ? 
'R' : 'W', + jiffies_to_msecs(now - start_jif), + in_flight); + } +} + +static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_bitmap_io(m, device, now); + } + rcu_read_unlock(); +} + +/* pretty print enum peer_req->flags */ +static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req) +{ + unsigned long f = peer_req->flags; + char sep = ' '; + + __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing"); + __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal"); + seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL"); + seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); + + if (f & EE_IS_TRIM) { + seq_putc(m, sep); + sep = '|'; + if (f & EE_IS_TRIM_USE_ZEROOUT) + seq_puts(m, "zero-out"); + else + seq_puts(m, "trim"); + } + seq_putc(m, '\n'); +} + +static void seq_print_peer_request(struct seq_file *m, + struct drbd_device *device, struct list_head *lh, + unsigned long now) +{ + bool reported_preparing = false; + struct drbd_peer_request *peer_req; + list_for_each_entry(peer_req, lh, w.list) { + if (reported_preparing && !(peer_req->flags & EE_SUBMITTED)) + continue; + + if (device) + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + + seq_printf(m, "%llu\t%u\t%c\t%u\t", + (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9, + (peer_req->flags & EE_WRITE) ? 'W' : 'R', + jiffies_to_msecs(now - peer_req->submit_jif)); + seq_print_peer_request_flags(m, peer_req); + if (peer_req->flags & EE_SUBMITTED) + break; + else + reported_preparing = true; + } +} + +static void seq_print_device_peer_requests(struct seq_file *m, + struct drbd_device *device, unsigned long now) +{ + seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n"); + spin_lock_irq(&device->resource->req_lock); + seq_print_peer_request(m, device, &device->active_ee, now); + seq_print_peer_request(m, device, &device->read_ee, now); + seq_print_peer_request(m, device, &device->sync_ee, now); + spin_unlock_irq(&device->resource->req_lock); + if (test_bit(FLUSH_PENDING, &device->flags)) { + seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n", + device->minor, device->vnr, + jiffies_to_msecs(now - device->flush_jif)); + } +} + +static void seq_print_resource_pending_peer_requests(struct seq_file *m, + struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_peer_requests(m, device, now); + } + rcu_read_unlock(); +} + +static void seq_print_resource_transfer_log_summary(struct seq_file *m, + struct drbd_resource *resource, + struct drbd_connection *connection, + unsigned long now) +{ + struct drbd_request *req; + unsigned int count = 0; + unsigned int show_state = 0; + + seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR); + spin_lock_irq(&resource->req_lock); + list_for_each_entry(req, &connection->transfer_log, tl_requests) { + unsigned int tmp = 0; + unsigned int s; + ++count; + + /* don't disable irq "forever" */ + if (!(count & 0x1ff)) { + struct drbd_request *req_next; + kref_get(&req->kref); + spin_unlock_irq(&resource->req_lock); + cond_resched(); + 
spin_lock_irq(&resource->req_lock); + req_next = list_next_entry(req, tl_requests); + if (kref_put(&req->kref, drbd_req_destroy)) + req = req_next; + if (&req->tl_requests == &connection->transfer_log) + break; + } + + s = req->rq_state; + + /* This is meant to summarize timing issues, to be able to tell + * local disk problems from network problems. + * Skip requests, if we have shown an even older request with + * similar aspects already. */ + if (req->master_bio == NULL) + tmp |= 1; + if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING)) + tmp |= 2; + if (s & RQ_NET_MASK) { + if (!(s & RQ_NET_SENT)) + tmp |= 4; + if (s & RQ_NET_PENDING) + tmp |= 8; + if (!(s & RQ_NET_DONE)) + tmp |= 16; + } + if ((tmp & show_state) == tmp) + continue; + show_state |= tmp; + seq_printf(m, "%u\t", count); + seq_print_minor_vnr_req(m, req, now); + if (show_state == 0x1f) + break; + } + spin_unlock_irq(&resource->req_lock); +} + +/* TODO: transfer_log and friends should be moved to resource */ +static int in_flight_summary_show(struct seq_file *m, void *pos) +{ + struct drbd_resource *resource = m->private; + struct drbd_connection *connection; + unsigned long jif = jiffies; + + connection = first_connection(resource); + /* This does not happen, actually. + * But be robust and prepare for future code changes. */ + if (!connection || !kref_get_unless_zero(&connection->kref)) + return -ESTALE; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "oldest bitmap IO\n"); + seq_print_resource_pending_bitmap_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "meta data IO\n"); + seq_print_resource_pending_meta_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "socket buffer stats\n"); + /* for each connection ... once we have more than one */ + rcu_read_lock(); + if (connection->data.socket) { + /* open coded SIOCINQ, the "relevant" part */ + struct tcp_sock *tp = tcp_sk(connection->data.socket->sk); + int answ = tp->rcv_nxt - tp->copied_seq; + seq_printf(m, "unread receive buffer: %u Byte\n", answ); + /* open coded SIOCOUTQ, the "relevant" part */ + answ = tp->write_seq - tp->snd_una; + seq_printf(m, "unacked send buffer: %u Byte\n", answ); + } + rcu_read_unlock(); + seq_putc(m, '\n'); + + seq_puts(m, "oldest peer requests\n"); + seq_print_resource_pending_peer_requests(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "application requests waiting for activity log\n"); + seq_print_waiting_for_AL(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "oldest application requests\n"); + seq_print_resource_transfer_log_summary(m, resource, connection, jif); + seq_putc(m, '\n'); + + jif = jiffies - jif; + if (jif) + seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif)); + kref_put(&connection->kref, drbd_destroy_connection); + return 0; +} + +/* simple_positive(file->f_dentry) respectively debugfs_positive(), + * but neither is "reachable" from here. + * So we have our own inline version of it above. :-( */ +static inline int debugfs_positive(struct dentry *dentry) +{ + return dentry->d_inode && !d_unhashed(dentry); +} + +/* make sure at *open* time that the respective object won't go away. */ +static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data, struct kref *kref, + void (*release)(struct kref *)) +{ + struct dentry *parent; + int ret = -ESTALE; + + /* Are we still linked, + * or has debugfs_remove() already been called? 
*/ + parent = file->f_dentry->d_parent; + /* not sure if this can happen: */ + if (!parent || !parent->d_inode) + goto out; + /* serialize with d_delete() */ + mutex_lock(&parent->d_inode->i_mutex); + /* Make sure the object is still alive */ + if (debugfs_positive(file->f_dentry) + && kref_get_unless_zero(kref)) + ret = 0; + mutex_unlock(&parent->d_inode->i_mutex); + if (!ret) { + ret = single_open(file, show, data); + if (ret) + kref_put(kref, release); + } +out: + return ret; +} + +static int in_flight_summary_open(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + return drbd_single_open(file, in_flight_summary_show, resource, + &resource->kref, drbd_destroy_resource); +} + +static int in_flight_summary_release(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + kref_put(&resource->kref, drbd_destroy_resource); + return single_release(inode, file); +} + +static const struct file_operations in_flight_summary_fops = { + .owner = THIS_MODULE, + .open = in_flight_summary_open, + .read = seq_read, + .llseek = seq_lseek, + .release = in_flight_summary_release, +}; + +void drbd_debugfs_resource_add(struct drbd_resource *resource) +{ + struct dentry *dentry; + if (!drbd_debugfs_resources) + return; + + dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res = dentry; + + dentry = debugfs_create_dir("volumes", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_volumes = dentry; + + dentry = debugfs_create_dir("connections", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_connections = dentry; + + dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP, + resource->debugfs_res, resource, + &in_flight_summary_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_in_flight_summary = dentry; + return; + +fail: + drbd_debugfs_resource_cleanup(resource); + drbd_err(resource, "failed to create debugfs dentry\n"); +} + +static void drbd_debugfs_remove(struct dentry **dp) +{ + debugfs_remove(*dp); + *dp = NULL; +} + +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) +{ + /* it is ok to call debugfs_remove(NULL) */ + drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary); + drbd_debugfs_remove(&resource->debugfs_res_connections); + drbd_debugfs_remove(&resource->debugfs_res_volumes); + drbd_debugfs_remove(&resource->debugfs_res); +} + +static void seq_print_one_timing_detail(struct seq_file *m, + const struct drbd_thread_timing_details *tdp, + unsigned long now) +{ + struct drbd_thread_timing_details td; + /* No locking... + * use temporary assignment to get at consistent data. */ + do { + td = *tdp; + } while (td.cb_nr != tdp->cb_nr); + if (!td.cb_addr) + return; + seq_printf(m, "%u\t%d\t%s:%u\t%ps\n", + td.cb_nr, + jiffies_to_msecs(now - td.start_jif), + td.caller_fn, td.line, + td.cb_addr); +} + +static void seq_print_timing_details(struct seq_file *m, + const char *title, + unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now) +{ + unsigned int start_idx; + unsigned int i; + + seq_printf(m, "%s\n", title); + /* If not much is going on, this will result in natural ordering. + * If it is very busy, we will possibly skip events, or even see wrap + * arounds, which could only be avoided with locking. 
+ */ + start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST; + for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++) + seq_print_one_timing_detail(m, tdp+i, now); + for (i = 0; i < start_idx; i++) + seq_print_one_timing_detail(m, tdp+i, now); +} + +static int callback_history_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long jif = jiffies; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "n\tage\tcallsite\tfn\n"); + seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif); + seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif); + return 0; +} + +static int callback_history_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, callback_history_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int callback_history_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_callback_history_fops = { + .owner = THIS_MODULE, + .open = callback_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = callback_history_release, +}; + +static int connection_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + spin_lock_irq(&connection->resource->req_lock); + r1 = connection->req_next; + if (r1) + seq_print_minor_vnr_req(m, r1, now); + r2 = connection->req_ack_pending; + if (r2 && r2 != r1) { + r1 = r2; + seq_print_minor_vnr_req(m, r1, now); + } + r2 = connection->req_not_net_done; + if (r2 && r2 != r1) + seq_print_minor_vnr_req(m, r2, now); + spin_unlock_irq(&connection->resource->req_lock); + return 0; +} + +static int connection_oldest_requests_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, connection_oldest_requests_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int connection_oldest_requests_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_oldest_requests_fops = { + .owner = THIS_MODULE, + .open = connection_oldest_requests_open, + .read = seq_read, + .llseek = seq_lseek, + .release = connection_oldest_requests_release, +}; + +void drbd_debugfs_connection_add(struct drbd_connection *connection) +{ + struct dentry *conns_dir = connection->resource->debugfs_res_connections; + struct dentry *dentry; + if (!conns_dir) + return; + + /* Once we enable mutliple peers, + * these connections will have descriptive names. + * For now, it is just the one connection to the (only) "peer". 
*/ + dentry = debugfs_create_dir("peer", conns_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn = dentry; + + dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_callback_history_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_callback_history = dentry; + + dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_oldest_requests_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_oldest_requests = dentry; + return; + +fail: + drbd_debugfs_connection_cleanup(connection); + drbd_err(connection, "failed to create debugfs dentry\n"); +} + +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) +{ + drbd_debugfs_remove(&connection->debugfs_conn_callback_history); + drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests); + drbd_debugfs_remove(&connection->debugfs_conn); +} + +static void resync_dump_detail(struct seq_file *m, struct lc_element *e) +{ + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); + + seq_printf(m, "%5d %s %s %s\n", bme->rs_left, + test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------", + test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------", + test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------" + ); +} + +static int device_resync_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->resync); + lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail); + put_ldev(device); + } + return 0; +} + +static int device_act_log_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->act_log); + lc_seq_dump_details(m, device->act_log, "", NULL); + put_ldev(device); + } + return 0; +} + +static int device_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_resource *resource = device->resource; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + int i; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, RQ_HDR); + spin_lock_irq(&resource->req_lock); + /* WRITE, then READ */ + for (i = 1; i >= 0; --i) { + r1 = list_first_entry_or_null(&device->pending_master_completion[i], + struct drbd_request, req_pending_master_completion); + r2 = list_first_entry_or_null(&device->pending_completion[i], + struct drbd_request, req_pending_local); + if (r1) + seq_print_one_request(m, r1, now); + if (r2 && r2 != r1) + seq_print_one_request(m, r2, now); + } + spin_unlock_irq(&resource->req_lock); + return 0; +} + +static int device_data_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_md *md; + enum drbd_uuid_index idx; + + if (!get_ldev_if_state(device, D_FAILED)) + return -ENODEV; + + md = &device->ldev->md; + spin_lock_irq(&md->uuid_lock); + for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) { + seq_printf(m, "0x%016llX\n", md->uuid[idx]); + } + spin_unlock_irq(&md->uuid_lock); + 
put_ldev(device); + return 0; +} + +#define drbd_debugfs_device_attr(name) \ +static int device_ ## name ## _open(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + return drbd_single_open(file, device_ ## name ## _show, device, \ + &device->kref, drbd_destroy_device); \ +} \ +static int device_ ## name ## _release(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + kref_put(&device->kref, drbd_destroy_device); \ + return single_release(inode, file); \ +} \ +static const struct file_operations device_ ## name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = device_ ## name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = device_ ## name ## _release, \ +}; + +drbd_debugfs_device_attr(oldest_requests) +drbd_debugfs_device_attr(act_log_extents) +drbd_debugfs_device_attr(resync_extents) +drbd_debugfs_device_attr(data_gen_id) + +void drbd_debugfs_device_add(struct drbd_device *device) +{ + struct dentry *vols_dir = device->resource->debugfs_res_volumes; + char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */ + char vnr_buf[8]; /* volume number vnr is even 16 bit only; */ + char *slink_name = NULL; + + struct dentry *dentry; + if (!vols_dir || !drbd_debugfs_minors) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr); + dentry = debugfs_create_dir(vnr_buf, vols_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_vol = dentry; + + snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor); + slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u", + device->resource->name, device->vnr); + if (!slink_name) + goto fail; + dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name); + kfree(slink_name); + slink_name = NULL; + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_minor = dentry; + +#define DCF(name) do { \ + dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \ + device->debugfs_vol, device, \ + &device_ ## name ## _fops); \ + if (IS_ERR_OR_NULL(dentry)) \ + goto fail; \ + device->debugfs_vol_ ## name = dentry; \ + } while (0) + + DCF(oldest_requests); + DCF(act_log_extents); + DCF(resync_extents); + DCF(data_gen_id); +#undef DCF + return; + +fail: + drbd_debugfs_device_cleanup(device); + drbd_err(device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_device_cleanup(struct drbd_device *device) +{ + drbd_debugfs_remove(&device->debugfs_minor); + drbd_debugfs_remove(&device->debugfs_vol_oldest_requests); + drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); + drbd_debugfs_remove(&device->debugfs_vol_resync_extents); + drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol); +} + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) +{ + struct dentry *conn_dir = peer_device->connection->debugfs_conn; + struct dentry *dentry; + char vnr_buf[8]; + + if (!conn_dir) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr); + dentry = debugfs_create_dir(vnr_buf, conn_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + peer_device->debugfs_peer_dev = dentry; + return; + +fail: + drbd_debugfs_peer_device_cleanup(peer_device); + drbd_err(peer_device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) +{ + drbd_debugfs_remove(&peer_device->debugfs_peer_dev); +} + +static int drbd_version_show(struct seq_file *m, void *ignored) +{ + 
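Spelled out, drbd_debugfs_device_attr(oldest_requests) expands to the same open/read/llseek/release boilerplate used for the connection-level files above, just pinning the device instead of the connection:

/* Equivalent expansion of drbd_debugfs_device_attr(oldest_requests): */
static int device_oldest_requests_open(struct inode *inode, struct file *file)
{
	struct drbd_device *device = inode->i_private;
	return drbd_single_open(file, device_oldest_requests_show, device,
				&device->kref, drbd_destroy_device);
}
static int device_oldest_requests_release(struct inode *inode, struct file *file)
{
	struct drbd_device *device = inode->i_private;
	kref_put(&device->kref, drbd_destroy_device);
	return single_release(inode, file);
}
static const struct file_operations device_oldest_requests_fops = {
	.owner = THIS_MODULE,
	.open = device_oldest_requests_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = device_oldest_requests_release,
};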
seq_printf(m, "# %s\n", drbd_buildtag()); + seq_printf(m, "VERSION=%s\n", REL_VERSION); + seq_printf(m, "API_VERSION=%u\n", API_VERSION); + seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN); + seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX); + return 0; +} + +static int drbd_version_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_version_show, NULL); +} + +static struct file_operations drbd_version_fops = { + .owner = THIS_MODULE, + .open = drbd_version_open, + .llseek = seq_lseek, + .read = seq_read, + .release = single_release, +}; + +/* not __exit, may be indirectly called + * from the module-load-failure path as well. */ +void drbd_debugfs_cleanup(void) +{ + drbd_debugfs_remove(&drbd_debugfs_resources); + drbd_debugfs_remove(&drbd_debugfs_minors); + drbd_debugfs_remove(&drbd_debugfs_version); + drbd_debugfs_remove(&drbd_debugfs_root); +} + +int __init drbd_debugfs_init(void) +{ + struct dentry *dentry; + + dentry = debugfs_create_dir("drbd", NULL); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_root = dentry; + + dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_version = dentry; + + dentry = debugfs_create_dir("resources", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_resources = dentry; + + dentry = debugfs_create_dir("minors", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_minors = dentry; + return 0; + +fail: + drbd_debugfs_cleanup(); + if (dentry) + return PTR_ERR(dentry); + else + return -EINVAL; +} diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h new file mode 100644 index 00000000000..8bee21340dc --- /dev/null +++ b/drivers/block/drbd/drbd_debugfs.h @@ -0,0 +1,39 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/debugfs.h> + +#include "drbd_int.h" + +#ifdef CONFIG_DEBUG_FS +int __init drbd_debugfs_init(void); +void drbd_debugfs_cleanup(void); + +void drbd_debugfs_resource_add(struct drbd_resource *resource); +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource); + +void drbd_debugfs_connection_add(struct drbd_connection *connection); +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection); + +void drbd_debugfs_device_add(struct drbd_device *device); +void drbd_debugfs_device_cleanup(struct drbd_device *device); + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device); +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device); +#else + +static inline int __init drbd_debugfs_init(void) { return -ENODEV; } +static inline void drbd_debugfs_cleanup(void) { } + +static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { } +static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { } + +static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { } +static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { } + +static inline void drbd_debugfs_device_add(struct drbd_device *device) { } +static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { } + +static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { } +static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { } + +#endif diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 
a76ceb344d6..1a000016ccd 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -317,7 +317,63 @@ struct drbd_request { struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ - unsigned long start_time; + + /* see struct drbd_device */ + struct list_head req_pending_master_completion; + struct list_head req_pending_local; + + /* for generic IO accounting */ + unsigned long start_jif; + + /* for DRBD internal statistics */ + + /* Minimal set of time stamps to determine if we wait for activity log + * transactions, local disk or peer. 32 bit "jiffies" are good enough, + * we don't expect a DRBD request to be stalled for several months. + */ + + /* before actual request processing */ + unsigned long in_actlog_jif; + + /* local disk */ + unsigned long pre_submit_jif; + + /* per connection */ + unsigned long pre_send_jif; + unsigned long acked_jif; + unsigned long net_done_jif; + + /* Possibly even more detail to track each phase: + * master_completion_jif + * how long did it take to complete the master bio + * (application visible latency) + * allocated_jif + * how long the master bio was blocked until we finally allocated + * a tracking struct + * in_actlog_jif + * how long did we wait for activity log transactions + * + * net_queued_jif + * when did we finally queue it for sending + * pre_send_jif + * when did we start sending it + * post_send_jif + * how long did we block in the network stack trying to send it + * acked_jif + * when did we receive (or fake, in protocol A) a remote ACK + * net_done_jif + * when did we receive final acknowledgement (P_BARRIER_ACK), + * or decide, e.g. on connection loss, that we no longer expect + * anything from this peer for this request. + * + * pre_submit_jif + * post_sub_jif + * when did we start submitting to the lower level device, + * and how long did we block in that submit function + * local_completion_jif + * how long did it take the lower level device to complete this request + */ + /* once it hits 0, we may complete the master_bio */ atomic_t completion_ref; @@ -366,6 +422,7 @@ struct drbd_peer_request { struct drbd_interval i; /* see comments on ee flag bits below */ unsigned long flags; + unsigned long submit_jif; union { u64 block_id; struct digest_info *digest; @@ -408,6 +465,17 @@ enum { /* Is set when net_conf had two_primaries set while creating this peer_req */ __EE_IN_INTERVAL_TREE, + + /* for debugfs: */ + /* has this been submitted, or does it still wait for something else?
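All of these stamps are raw jiffies values; consumers such as the debugfs seq_file code turn them into ages by unsigned subtraction, which is safe across jiffies wrap-around. A minimal sketch:

/* Sketch: turn a stored jiffies stamp into a millisecond age.
 * Unsigned subtraction handles jiffies wrap-around, which is also why
 * 32 bits are "good enough" here: the wrap period is ~49 days at
 * HZ=1000, far longer than any plausible request stall. */
static unsigned int req_age_ms(unsigned long stamp_jif)
{
	return jiffies_to_msecs(jiffies - stamp_jif);
}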
*/ + __EE_SUBMITTED, + + /* this is/was a write request */ + __EE_WRITE, + + /* this originates from application on peer + * (not some resync or verify or other DRBD internal request) */ + __EE_APPLICATION, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) @@ -419,6 +487,9 @@ enum { #define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) #define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) +#define EE_SUBMITTED (1<<__EE_SUBMITTED) +#define EE_WRITE (1<<__EE_WRITE) +#define EE_APPLICATION (1<<__EE_APPLICATION) /* flag bits per device */ enum { @@ -433,11 +504,11 @@ enum { CONSIDER_RESYNC, MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ + SUSPEND_IO, /* suspend application io */ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ - GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ WAS_IO_ERROR, /* Local disk failed, returned IO error */ WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ @@ -450,6 +521,20 @@ enum { B_RS_H_DONE, /* Before resync handler done (already executed) */ DISCARD_MY_DATA, /* discard_my_data flag per volume */ READ_BALANCE_RR, + + FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush + * from drbd_flush_after_epoch() */ + + /* cleared only after backing device related structures have been destroyed. */ + GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */ + + /* to be used in drbd_device_post_work() */ + GO_DISKLESS, /* tell worker to schedule cleanup before detach */ + DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */ + MD_SYNC, /* tell worker to call drbd_md_sync() */ + RS_START, /* tell worker to start resync/OV */ + RS_PROGRESS, /* tell worker that resync made significant progress */ + RS_DONE, /* tell worker that resync is done */ }; struct drbd_bitmap; /* opaque for drbd_device */ @@ -531,6 +616,11 @@ struct drbd_backing_dev { }; struct drbd_md_io { + struct page *page; + unsigned long start_jif; /* last call to drbd_md_get_buffer */ + unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */ + const char *current_use; + atomic_t in_use; unsigned int done; int error; }; @@ -577,10 +667,18 @@ enum { * and potentially deadlock on, this drbd worker. 
*/ DISCONNECT_SENT, + + DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ }; struct drbd_resource { char *name; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_res; + struct dentry *debugfs_res_volumes; + struct dentry *debugfs_res_connections; + struct dentry *debugfs_res_in_flight_summary; +#endif struct kref kref; struct idr devices; /* volume number to device mapping */ struct list_head connections; @@ -594,12 +692,28 @@ struct drbd_resource { unsigned susp_nod:1; /* IO suspended because no data */ unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ + enum write_ordering_e write_ordering; + cpumask_var_t cpu_mask; }; +struct drbd_thread_timing_details +{ + unsigned long start_jif; + void *cb_addr; + const char *caller_fn; + unsigned int line; + unsigned int cb_nr; +}; + struct drbd_connection { struct list_head connections; struct drbd_resource *resource; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_conn; + struct dentry *debugfs_conn_callback_history; + struct dentry *debugfs_conn_oldest_requests; +#endif struct kref kref; struct idr peer_devices; /* volume number to peer device mapping */ enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ @@ -636,7 +750,6 @@ struct drbd_connection { struct drbd_epoch *current_epoch; spinlock_t epoch_lock; unsigned int epochs; - enum write_ordering_e write_ordering; atomic_t current_tle_nr; /* transfer log epoch number */ unsigned current_tle_writes; /* writes seen within this tl epoch */ @@ -645,9 +758,22 @@ struct drbd_connection { struct drbd_thread worker; struct drbd_thread asender; + /* cached pointers, + * so we can look up the oldest pending requests more quickly. + * protected by resource->req_lock */ + struct drbd_request *req_next; /* DRBD 9: todo.req_next */ + struct drbd_request *req_ack_pending; + struct drbd_request *req_not_net_done; + /* sender side */ struct drbd_work_queue sender_work; +#define DRBD_THREAD_DETAILS_HIST 16 + unsigned int w_cb_nr; /* keeps counting up */ + unsigned int r_cb_nr; /* keeps counting up */ + struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST]; + struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; + struct { /* whether this sender thread * has processed a single write yet. 
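The three cached pointers let the debugfs code shown earlier print the oldest pending requests without walking the whole transfer log under req_lock. A sketch of how such a cache is advanced when the cached request leaves the corresponding state, modeled on the drbd_req.c side of this series:

/* Sketch, modeled on the request-code side of this series: once the
 * request currently cached as "oldest not yet network-done" (here: the
 * req_next variant) makes progress, scan forward from it for the next
 * candidate.  Caller holds resource->req_lock. */
static void advance_conn_req_next(struct drbd_connection *connection,
				  struct drbd_request *req)
{
	if (connection->req_next != req)
		return;
	list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
		if (req->rq_state & RQ_NET_QUEUED)
			break;	/* found the new oldest */
	}
	if (&req->tl_requests == &connection->transfer_log)
		req = NULL;	/* ran off the end: nothing pending */
	connection->req_next = req;
}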
*/ @@ -663,11 +789,22 @@ struct drbd_connection { } send; }; +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line); + +#define update_worker_timing_details(c, cb) \ + __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ ) +#define update_receiver_timing_details(c, cb) \ + __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ ) + struct submit_worker { struct workqueue_struct *wq; struct work_struct worker; - spinlock_t lock; + /* protected by ..->resource->req_lock */ struct list_head writes; }; @@ -675,12 +812,29 @@ struct drbd_peer_device { struct list_head peer_devices; struct drbd_device *device; struct drbd_connection *connection; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_peer_dev; +#endif }; struct drbd_device { struct drbd_resource *resource; struct list_head peer_devices; - int vnr; /* volume number within the connection */ + struct list_head pending_bitmap_io; + + unsigned long flush_jif; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_minor; + struct dentry *debugfs_vol; + struct dentry *debugfs_vol_oldest_requests; + struct dentry *debugfs_vol_act_log_extents; + struct dentry *debugfs_vol_resync_extents; + struct dentry *debugfs_vol_data_gen_id; +#endif + + unsigned int vnr; /* volume number within the connection */ + unsigned int minor; /* device minor number */ + struct kref kref; /* things that are stored as / read from meta data on disk */ @@ -697,19 +851,10 @@ struct drbd_device { unsigned long last_reattach_jif; struct drbd_work resync_work; struct drbd_work unplug_work; - struct drbd_work go_diskless; - struct drbd_work md_sync_work; - struct drbd_work start_resync_work; struct timer_list resync_timer; struct timer_list md_sync_timer; struct timer_list start_resync_timer; struct timer_list request_timer; -#ifdef DRBD_DEBUG_MD_SYNC - struct { - unsigned int line; - const char* func; - } last_md_mark_dirty; -#endif /* Used after attach while negotiating new disk state. 
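The update_worker_timing_details()/update_receiver_timing_details() macros above record one entry per callback invocation into the DRBD_THREAD_DETAILS_HIST-sized rings declared in struct drbd_connection; __update_timing_details() is defined elsewhere in this series, roughly along these lines:

/* Sketch of the recording side: store one entry per callback in a
 * fixed-size ring indexed by an ever-increasing counter.  The seq_file
 * code at the top of drbd_debugfs.c replays the ring oldest-first via
 * "cb_nr % DRBD_THREAD_DETAILS_HIST". */
void __update_timing_details(struct drbd_thread_timing_details *tdp,
			     unsigned int *cb_nr, void *cb,
			     const char *fn, const unsigned int line)
{
	unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
	struct drbd_thread_timing_details *td = tdp + i;

	td->start_jif = jiffies;
	td->cb_addr = cb;
	td->caller_fn = fn;
	td->line = line;
	td->cb_nr = *cb_nr;

	++*cb_nr;
}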
*/ union drbd_state new_state_tmp; @@ -724,6 +869,7 @@ struct drbd_device { unsigned int al_writ_cnt; unsigned int bm_writ_cnt; atomic_t ap_bio_cnt; /* Requests we need to complete */ + atomic_t ap_actlog_cnt; /* Requests waiting for activity log */ atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ atomic_t unacked_cnt; /* Need to send replies for */ @@ -733,6 +879,13 @@ struct drbd_device { struct rb_root read_requests; struct rb_root write_requests; + /* for statistics and timeouts */ + /* [0] read, [1] write */ + struct list_head pending_master_completion[2]; + struct list_head pending_completion[2]; + + /* use checksums for *this* resync */ + bool use_csums; /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ unsigned long rs_total; /* number of resync blocks that failed in this run */ @@ -788,9 +941,7 @@ struct drbd_device { atomic_t pp_in_use; /* allocated from page pool */ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ wait_queue_head_t ee_wait; - struct page *md_io_page; /* one page buffer for md_io */ struct drbd_md_io md_io; - atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ @@ -800,7 +951,6 @@ struct drbd_device { atomic_t packet_seq; unsigned int peer_seq; spinlock_t peer_seq_lock; - unsigned int minor; unsigned long comm_bm_set; /* communicated number of set bits. */ struct bm_io_work bm_io_work; u64 ed_uuid; /* UUID of the exposed data */ @@ -824,6 +974,21 @@ struct drbd_device { struct submit_worker submit; }; +struct drbd_bm_aio_ctx { + struct drbd_device *device; + struct list_head list; /* on device->pending_bitmap_io */; + unsigned long start_jif; + atomic_t in_flight; + unsigned int done; + unsigned flags; +#define BM_AIO_COPY_PAGES 1 +#define BM_AIO_WRITE_HINTED 2 +#define BM_AIO_WRITE_ALL_PAGES 4 +#define BM_AIO_READ 8 + int error; + struct kref kref; +}; + struct drbd_config_context { /* assigned from drbd_genlmsghdr */ unsigned int minor; @@ -949,7 +1114,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); -extern void drbd_free_bc(struct drbd_backing_dev *ldev); +extern void drbd_free_ldev(struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); @@ -966,13 +1131,7 @@ extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local); extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local); extern int drbd_md_test_flag(struct drbd_backing_dev *, int); -#ifndef DRBD_DEBUG_MD_SYNC extern void drbd_md_mark_dirty(struct drbd_device *device); -#else -#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ ) -extern void drbd_md_mark_dirty_(struct drbd_device *device, - unsigned int line, const char *func); -#endif extern void drbd_queue_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), void (*done)(struct drbd_device *, int), @@ -983,9 +1142,8 @@ extern int drbd_bitmap_io(struct drbd_device *device, 
extern int drbd_bitmap_io_from_worker(struct drbd_device *device, int (*io_fn)(struct drbd_device *), char *why, enum bm_flag flags); -extern int drbd_bmio_set_n_write(struct drbd_device *device); -extern int drbd_bmio_clear_n_write(struct drbd_device *device); -extern void drbd_ldev_destroy(struct drbd_device *device); +extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local); +extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local); /* Meta data layout * @@ -1105,17 +1263,21 @@ struct bm_extent { /* in which _bitmap_ extent (resp. sector) the bit for a certain * _storage_ sector is located in */ #define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) +#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) -/* how much _storage_ sectors we have per bitmap sector */ +/* first storage sector a bitmap extent corresponds to */ #define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) +/* how much _storage_ sectors we have per bitmap extent */ #define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) +/* how many bits are covered by one bitmap extent (resync extent) */ +#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) + +#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1) + /* in one sector of the bitmap, we have this many activity_log extents. */ #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) -#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) -#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) - /* the extent in "PER_EXTENT" below is an activity log extent * we need that many (long words/bytes) to store the bitmap * of one AL_EXTENT_SIZE chunk of storage. @@ -1195,11 +1357,11 @@ extern void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e); extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr); extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); -extern int drbd_bm_write_page(struct drbd_device *device, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local); extern size_t drbd_bm_words(struct drbd_device *device); @@ -1213,7 +1375,6 @@ extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned lon extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); extern unsigned long drbd_bm_total_weight(struct drbd_device *device); -extern int drbd_bm_rs_done(struct drbd_device *device); /* for receive_bitmap */ extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number, unsigned long *buffer); @@ -1312,7 +1473,7 @@ enum determine_dev_size { extern enum determine_dev_size drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); extern void resync_after_online_grow(struct drbd_device *); -extern void drbd_reconsider_max_bio_size(struct drbd_device *device); +extern 
void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force); @@ -1333,7 +1494,7 @@ extern void resume_next_sg(struct drbd_device *device); extern void suspend_other_sg(struct drbd_device *device); extern int drbd_resync_finished(struct drbd_device *device); /* maybe rather drbd_main.c ? */ -extern void *drbd_md_get_buffer(struct drbd_device *device); +extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); extern void drbd_md_put_buffer(struct drbd_device *device); extern int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, sector_t sector, int rw); @@ -1380,7 +1541,8 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_asender(struct drbd_thread *thi); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); -extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting); extern int drbd_submit_peer_request(struct drbd_device *, struct drbd_peer_request *, const unsigned, const int); @@ -1464,10 +1626,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device, { __release(local); if (!bio->bi_bdev) { - printk(KERN_ERR "drbd%d: drbd_generic_make_request: " - "bio->bi_bdev == NULL\n", - device_to_minor(device)); - dump_stack(); + drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); bio_endio(bio, -ENODEV); return; } @@ -1478,7 +1637,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device, generic_make_request(bio); } -void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo); /* drbd_proc.c */ extern struct proc_dir_entry *drbd_proc; @@ -1489,9 +1649,9 @@ extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); -extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); +extern void drbd_al_begin_io_commit(struct drbd_device *device); extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); -extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate); +extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i); extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i); extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector); extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector); @@ -1501,14 +1661,17 @@ extern int drbd_rs_del_all(struct drbd_device *device); extern void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size); extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); -extern void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, - int size, const char *file, const unsigned int line); + +enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; +extern int __drbd_change_sync(struct 
drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode, + const char *file, const unsigned int line); #define drbd_set_in_sync(device, sector, size) \ - __drbd_set_in_sync(device, sector, size, __FILE__, __LINE__) -extern int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, - int size, const char *file, const unsigned int line); + __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) #define drbd_set_out_of_sync(device, sector, size) \ - __drbd_set_out_of_sync(device, sector, size, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) +#define drbd_rs_failed_io(device, sector, size) \ + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) extern void drbd_al_shrink(struct drbd_device *device); extern int drbd_initialize_al(struct drbd_device *, void *); @@ -1764,25 +1927,38 @@ static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) } static inline void -drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) { unsigned long flags; spin_lock_irqsave(&q->q_lock, flags); - list_add(&w->list, &q->q); + list_add_tail(&w->list, &q->q); spin_unlock_irqrestore(&q->q_lock, flags); wake_up(&q->q_wait); } static inline void -drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w) { unsigned long flags; spin_lock_irqsave(&q->q_lock, flags); - list_add_tail(&w->list, &q->q); + if (list_empty_careful(&w->list)) + list_add_tail(&w->list, &q->q); spin_unlock_irqrestore(&q->q_lock, flags); wake_up(&q->q_wait); } +static inline void +drbd_device_post_work(struct drbd_device *device, int work_bit) +{ + if (!test_and_set_bit(work_bit, &device->flags)) { + struct drbd_connection *connection = + first_peer_device(device)->connection; + struct drbd_work_queue *q = &connection->sender_work; + if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags)) + wake_up(&q->q_wait); + } +} + extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); static inline void wake_asender(struct drbd_connection *connection) @@ -1859,7 +2035,7 @@ static inline void inc_ap_pending(struct drbd_device *device) func, line, \ atomic_read(&device->which)) -#define dec_ap_pending(device) _dec_ap_pending(device, __FUNCTION__, __LINE__) +#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__) static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line) { if (atomic_dec_and_test(&device->ap_pending_cnt)) @@ -1878,7 +2054,7 @@ static inline void inc_rs_pending(struct drbd_device *device) atomic_inc(&device->rs_pending_cnt); } -#define dec_rs_pending(device) _dec_rs_pending(device, __FUNCTION__, __LINE__) +#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__) static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line) { atomic_dec(&device->rs_pending_cnt); @@ -1899,20 +2075,29 @@ static inline void inc_unacked(struct drbd_device *device) atomic_inc(&device->unacked_cnt); } -#define dec_unacked(device) _dec_unacked(device, __FUNCTION__, __LINE__) +#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__) static inline void _dec_unacked(struct drbd_device *device, const char *func, int line) { atomic_dec(&device->unacked_cnt); ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); } -#define sub_unacked(device, n) 
_sub_unacked(device, n, __FUNCTION__, __LINE__) +#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__) static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line) { atomic_sub(n, &device->unacked_cnt); ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); } +static inline bool is_sync_state(enum drbd_conns connection_state) +{ + return + (connection_state == C_SYNC_SOURCE + || connection_state == C_SYNC_TARGET + || connection_state == C_PAUSED_SYNC_S + || connection_state == C_PAUSED_SYNC_T); +} + /** * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev * @M: DRBD device. @@ -1924,6 +2109,11 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f static inline void put_ldev(struct drbd_device *device) { + enum drbd_disk_state ds = device->state.disk; + /* We must check the state *before* the atomic_dec becomes visible, + * or we have a theoretical race where someone hitting zero, + * while state still D_FAILED, will then see D_DISKLESS in the + * condition below and calling into destroy, where he must not, yet. */ int i = atomic_dec_return(&device->local_cnt); /* This may be called from some endio handler, @@ -1932,15 +2122,13 @@ static inline void put_ldev(struct drbd_device *device) __release(local); D_ASSERT(device, i >= 0); if (i == 0) { - if (device->state.disk == D_DISKLESS) + if (ds == D_DISKLESS) /* even internal references gone, safe to destroy */ - drbd_ldev_destroy(device); - if (device->state.disk == D_FAILED) { + drbd_device_post_work(device, DESTROY_DISK); + if (ds == D_FAILED) /* all application IO references gone. */ - if (!test_and_set_bit(GO_DISKLESS, &device->flags)) - drbd_queue_work(&first_peer_device(device)->connection->sender_work, - &device->go_diskless); - } + if (!test_and_set_bit(GOING_DISKLESS, &device->flags)) + drbd_device_post_work(device, GO_DISKLESS); wake_up(&device->misc_wait); } } @@ -1964,54 +2152,6 @@ static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_ extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins); #endif -/* you must have an "get_ldev" reference */ -static inline void drbd_get_syncer_progress(struct drbd_device *device, - unsigned long *bits_left, unsigned int *per_mil_done) -{ - /* this is to break it at compile time when we change that, in case we - * want to support more than (1<<32) bits on a 32bit arch. */ - typecheck(unsigned long, device->rs_total); - - /* note: both rs_total and rs_left are in bits, i.e. in - * units of BM_BLOCK_SIZE. - * for the percentage, we don't care. */ - - if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) - *bits_left = device->ov_left; - else - *bits_left = drbd_bm_total_weight(device) - device->rs_failed; - /* >> 10 to prevent overflow, - * +1 to prevent division by zero */ - if (*bits_left > device->rs_total) { - /* doh. maybe a logic bug somewhere. - * may also be just a race condition - * between this and a disconnect during sync. - * for now, just prevent in-kernel buffer overflow. - */ - smp_rmb(); - drbd_warn(device, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", - drbd_conn_str(device->state.conn), - *bits_left, device->rs_total, device->rs_failed); - *per_mil_done = 0; - } else { - /* Make sure the division happens in long context. - * We allow up to one petabyte storage right now, - * at a granularity of 4k per bit that is 2**38 bits. 
- * After shift right and multiplication by 1000, - * this should still fit easily into a 32bit long, - * so we don't need a 64bit division on 32bit arch. - * Note: currently we don't support such large bitmaps on 32bit - * arch anyways, but no harm done to be prepared for it here. - */ - unsigned int shift = device->rs_total > UINT_MAX ? 16 : 10; - unsigned long left = *bits_left >> shift; - unsigned long total = 1UL + (device->rs_total >> shift); - unsigned long tmp = 1000UL - left * 1000UL/total; - *per_mil_done = tmp; - } -} - - /* this throttles on-the-fly application requests * according to max_buffers settings; * maybe re-implement using semaphores? */ @@ -2201,25 +2341,6 @@ static inline int drbd_queue_order_type(struct drbd_device *device) return QUEUE_ORDERED_NONE; } -static inline void drbd_md_flush(struct drbd_device *device) -{ - int r; - - if (device->ldev == NULL) { - drbd_warn(device, "device->ldev == NULL in drbd_md_flush\n"); - return; - } - - if (test_bit(MD_NO_FUA, &device->flags)) - return; - - r = blkdev_issue_flush(device->ldev->md_bdev, GFP_NOIO, NULL); - if (r) { - set_bit(MD_NO_FUA, &device->flags); - drbd_err(device, "meta data flush failed with status %d, disabling md-flushes\n", r); - } -} - static inline struct drbd_connection *first_connection(struct drbd_resource *resource) { return list_first_entry_or_null(&resource->connections, diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h index f38fcb00c10..f210543f05f 100644 --- a/drivers/block/drbd/drbd_interval.h +++ b/drivers/block/drbd/drbd_interval.h @@ -10,7 +10,9 @@ struct drbd_interval { unsigned int size; /* size in bytes */ sector_t end; /* highest interval end in subtree */ int local:1 /* local or remote request? */; - int waiting:1; + int waiting:1; /* someone is waiting for this to complete */ + int completed:1; /* this has been completed already; + * ignore for conflict detection */ }; static inline void drbd_clear_interval(struct drbd_interval *i) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 960645c26e6..9b465bb6848 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -26,7 +26,10 @@ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> +#include <linux/jiffies.h> #include <linux/drbd.h> #include <asm/uaccess.h> #include <asm/types.h> @@ -54,16 +57,14 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ - #include "drbd_vli.h" +#include "drbd_debugfs.h" static DEFINE_MUTEX(drbd_main_mutex); static int drbd_open(struct block_device *bdev, fmode_t mode); static void drbd_release(struct gendisk *gd, fmode_t mode); -static int w_md_sync(struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_work *w, int unused); -static int w_go_diskless(struct drbd_work *w, int unused); MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " "Lars Ellenberg <lars@linbit.com>"); @@ -264,7 +265,7 @@ bail: /** * _tl_restart() - Walks the transfer log, and applies an action to all requests - * @device: DRBD device. + * @connection: DRBD connection to operate on. 
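An aside to the drbd_get_syncer_progress() removal above (the helper presumably lives on elsewhere after this series): its shift trick is what keeps the per-mil computation inside 32-bit arithmetic, and is worth a self-contained sketch:

/* Stand-alone version of the per-mil arithmetic from the removed
 * helper: pre-shift both counters so that "left * 1000" cannot
 * overflow an unsigned long on 32-bit architectures. */
static unsigned int per_mil_done(unsigned long rs_left, unsigned long rs_total)
{
	/* >> 16 for huge devices (more than 2^32 bits), else >> 10;
	 * the +1 prevents division by zero */
	unsigned int shift = rs_total > UINT_MAX ? 16 : 10;
	unsigned long left = rs_left >> shift;
	unsigned long total = 1UL + (rs_total >> shift);

	return 1000UL - left * 1000UL / total;
}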
* @what: The action/event to perform with all request objects * * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, @@ -662,6 +663,11 @@ static int __send_command(struct drbd_connection *connection, int vnr, msg_flags); if (data && !err) err = drbd_send_all(connection, sock->socket, data, size, 0); + /* DRBD protocol "pings" are latency critical. + * This is supposed to trigger tcp_push_pending_frames() */ + if (!err && (cmd == P_PING || cmd == P_PING_ACK)) + drbd_tcp_nodelay(sock->socket); + return err; } @@ -1636,7 +1642,10 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * if (peer_device->connection->agreed_pro_version >= 100) { if (req->rq_state & RQ_EXP_RECEIVE_ACK) dp_flags |= DP_SEND_RECEIVE_ACK; - if (req->rq_state & RQ_EXP_WRITE_ACK) + /* During resync, request an explicit write ack, + * even in protocol != C */ + if (req->rq_state & RQ_EXP_WRITE_ACK + || (dp_flags & DP_MAY_SET_IN_SYNC)) dp_flags |= DP_SEND_WRITE_ACK; } p->dp_flags = cpu_to_be32(dp_flags); @@ -1900,6 +1909,7 @@ void drbd_init_set_defaults(struct drbd_device *device) drbd_set_defaults(device); atomic_set(&device->ap_bio_cnt, 0); + atomic_set(&device->ap_actlog_cnt, 0); atomic_set(&device->ap_pending_cnt, 0); atomic_set(&device->rs_pending_cnt, 0); atomic_set(&device->unacked_cnt, 0); @@ -1908,7 +1918,7 @@ void drbd_init_set_defaults(struct drbd_device *device) atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); atomic_set(&device->ap_in_flight, 0); - atomic_set(&device->md_io_in_use, 0); + atomic_set(&device->md_io.in_use, 0); mutex_init(&device->own_state_mutex); device->state_mutex = &device->own_state_mutex; @@ -1924,17 +1934,15 @@ void drbd_init_set_defaults(struct drbd_device *device) INIT_LIST_HEAD(&device->resync_reads); INIT_LIST_HEAD(&device->resync_work.list); INIT_LIST_HEAD(&device->unplug_work.list); - INIT_LIST_HEAD(&device->go_diskless.list); - INIT_LIST_HEAD(&device->md_sync_work.list); - INIT_LIST_HEAD(&device->start_resync_work.list); INIT_LIST_HEAD(&device->bm_io_work.w.list); + INIT_LIST_HEAD(&device->pending_master_completion[0]); + INIT_LIST_HEAD(&device->pending_master_completion[1]); + INIT_LIST_HEAD(&device->pending_completion[0]); + INIT_LIST_HEAD(&device->pending_completion[1]); device->resync_work.cb = w_resync_timer; device->unplug_work.cb = w_send_write_hint; - device->go_diskless.cb = w_go_diskless; - device->md_sync_work.cb = w_md_sync; device->bm_io_work.w.cb = w_bitmap_io; - device->start_resync_work.cb = w_start_resync; init_timer(&device->resync_timer); init_timer(&device->md_sync_timer); @@ -1992,7 +2000,7 @@ void drbd_device_cleanup(struct drbd_device *device) drbd_bm_cleanup(device); } - drbd_free_bc(device->ldev); + drbd_free_ldev(device->ldev); device->ldev = NULL; clear_bit(AL_SUSPENDED, &device->flags); @@ -2006,7 +2014,6 @@ void drbd_device_cleanup(struct drbd_device *device) D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); D_ASSERT(device, list_empty(&device->resync_work.list)); D_ASSERT(device, list_empty(&device->unplug_work.list)); - D_ASSERT(device, list_empty(&device->go_diskless.list)); drbd_set_defaults(device); } @@ -2129,20 +2136,6 @@ Enomem: return -ENOMEM; } -static int drbd_notify_sys(struct notifier_block *this, unsigned long code, - void *unused) -{ - /* just so we have it. you never know what interesting things we - * might want to do here some day... 
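The ping special-casing above leans on drbd_tcp_nodelay(), a thin wrapper defined in drbd_int.h; (re)setting TCP_NODELAY on a connected socket disables Nagle and pushes any queued frames out immediately. Roughly:

/* Thin wrapper, reproduced here for context (see drbd_int.h):
 * enabling TCP_NODELAY triggers tcp_push_pending_frames(). */
static inline void drbd_tcp_nodelay(struct socket *sock)
{
	int val = 1;
	(void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
				 (char *)&val, sizeof(val));
}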
- */ - - return NOTIFY_DONE; -} - -static struct notifier_block drbd_notifier = { - .notifier_call = drbd_notify_sys, -}; - static void drbd_release_all_peer_reqs(struct drbd_device *device) { int rr; @@ -2173,7 +2166,7 @@ void drbd_destroy_device(struct kref *kref) { struct drbd_device *device = container_of(kref, struct drbd_device, kref); struct drbd_resource *resource = device->resource; - struct drbd_connection *connection; + struct drbd_peer_device *peer_device, *tmp_peer_device; del_timer_sync(&device->request_timer); @@ -2187,7 +2180,7 @@ void drbd_destroy_device(struct kref *kref) if (device->this_bdev) bdput(device->this_bdev); - drbd_free_bc(device->ldev); + drbd_free_ldev(device->ldev); device->ldev = NULL; drbd_release_all_peer_reqs(device); @@ -2200,15 +2193,20 @@ void drbd_destroy_device(struct kref *kref) if (device->bitmap) /* should no longer be there. */ drbd_bm_cleanup(device); - __free_page(device->md_io_page); + __free_page(device->md_io.page); put_disk(device->vdisk); blk_cleanup_queue(device->rq_queue); kfree(device->rs_plan_s); - kfree(first_peer_device(device)); - kfree(device); - for_each_connection(connection, resource) - kref_put(&connection->kref, drbd_destroy_connection); + /* not for_each_connection(connection, resource): + * those may have been cleaned up and disassociated already. + */ + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { + kref_put(&peer_device->connection->kref, drbd_destroy_connection); + kfree(peer_device); + } + memset(device, 0xfd, sizeof(*device)); + kfree(device); kref_put(&resource->kref, drbd_destroy_resource); } @@ -2236,7 +2234,7 @@ static void do_retry(struct work_struct *ws) list_for_each_entry_safe(req, tmp, &writes, tl_requests) { struct drbd_device *device = req->device; struct bio *bio = req->master_bio; - unsigned long start_time = req->start_time; + unsigned long start_jif = req->start_jif; bool expected; expected = @@ -2271,10 +2269,12 @@ static void do_retry(struct work_struct *ws) /* We are not just doing generic_make_request(), * as we want to keep the start_time information. */ inc_ap_bio(device); - __drbd_make_request(device, bio, start_time); + __drbd_make_request(device, bio, start_jif); } } +/* called via drbd_req_put_completion_ref(), + * holds resource->req_lock */ void drbd_restart_request(struct drbd_request *req) { unsigned long flags; @@ -2298,6 +2298,7 @@ void drbd_destroy_resource(struct kref *kref) idr_destroy(&resource->devices); free_cpumask_var(resource->cpu_mask); kfree(resource->name); + memset(resource, 0xf2, sizeof(*resource)); kfree(resource); } @@ -2307,8 +2308,10 @@ void drbd_free_resource(struct drbd_resource *resource) for_each_connection_safe(connection, tmp, resource) { list_del(&connection->connections); + drbd_debugfs_connection_cleanup(connection); kref_put(&connection->kref, drbd_destroy_connection); } + drbd_debugfs_resource_cleanup(resource); kref_put(&resource->kref, drbd_destroy_resource); } @@ -2318,8 +2321,6 @@ static void drbd_cleanup(void) struct drbd_device *device; struct drbd_resource *resource, *tmp; - unregister_reboot_notifier(&drbd_notifier); - /* first remove proc, * drbdsetup uses it's presence to detect * whether DRBD is loaded. 
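The memset() poisoning added before kfree() in the destroy paths above is a debugging aid: a later use-after-free then faults on a recognizable per-type pattern (0xfd for devices, 0xf2 for resources, 0xfc for connections) rather than silently reading recycled memory. The idiom, generalized into a hypothetical helper:

/* Illustrative only: poison an object before freeing it, so that a
 * dangling-pointer dereference later oopses on e.g. 0xfdfdfdfd...,
 * identifying which object type was involved. */
static void free_poisoned(void *obj, size_t size, int poison)
{
	memset(obj, poison, size);
	kfree(obj);
}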
@@ -2335,6 +2336,7 @@ static void drbd_cleanup(void) destroy_workqueue(retry.wq); drbd_genl_unregister(); + drbd_debugfs_cleanup(); idr_for_each_entry(&drbd_devices, device, i) drbd_delete_device(device); @@ -2350,7 +2352,7 @@ static void drbd_cleanup(void) idr_destroy(&drbd_devices); - printk(KERN_INFO "drbd: module cleanup done.\n"); + pr_info("module cleanup done.\n"); } /** @@ -2539,6 +2541,20 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE, cpumask_bits(new_cpu_mask), nr_cpu_ids); + if (err == -EOVERFLOW) { + /* So what. mask it out. */ + cpumask_var_t tmp_cpu_mask; + if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) { + cpumask_setall(tmp_cpu_mask); + cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask); + drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n", + res_opts->cpu_mask, + strlen(res_opts->cpu_mask) > 12 ? "..." : "", + nr_cpu_ids); + free_cpumask_var(tmp_cpu_mask); + err = 0; + } + } if (err) { drbd_warn(resource, "bitmap_parse() failed with %d\n", err); /* retcode = ERR_CPU_MASK_PARSE; */ @@ -2579,10 +2595,12 @@ struct drbd_resource *drbd_create_resource(const char *name) kref_init(&resource->kref); idr_init(&resource->devices); INIT_LIST_HEAD(&resource->connections); + resource->write_ordering = WO_bdev_flush; list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); mutex_init(&resource->adm_mutex); spin_lock_init(&resource->req_lock); + drbd_debugfs_resource_add(resource); return resource; fail_free_name: @@ -2593,7 +2611,7 @@ fail: return NULL; } -/* caller must be under genl_lock() */ +/* caller must be under adm_mutex */ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) { struct drbd_resource *resource; @@ -2617,7 +2635,6 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) INIT_LIST_HEAD(&connection->current_epoch->list); connection->epochs = 1; spin_lock_init(&connection->epoch_lock); - connection->write_ordering = WO_bdev_flush; connection->send.seen_any_write_yet = false; connection->send.current_epoch_nr = 0; @@ -2652,6 +2669,7 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) kref_get(&resource->kref); list_add_tail_rcu(&connection->connections, &resource->connections); + drbd_debugfs_connection_add(connection); return connection; fail_resource: @@ -2680,6 +2698,7 @@ void drbd_destroy_connection(struct kref *kref) drbd_free_socket(&connection->data); kfree(connection->int_dig_in); kfree(connection->int_dig_vv); + memset(connection, 0xfc, sizeof(*connection)); kfree(connection); kref_put(&resource->kref, drbd_destroy_resource); } @@ -2694,7 +2713,6 @@ static int init_submitter(struct drbd_device *device) return -ENOMEM; INIT_WORK(&device->submit.worker, do_submit); - spin_lock_init(&device->submit.lock); INIT_LIST_HEAD(&device->submit.writes); return 0; } @@ -2764,8 +2782,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig blk_queue_merge_bvec(q, drbd_merge_bvec); q->queue_lock = &resource->req_lock; - device->md_io_page = alloc_page(GFP_KERNEL); - if (!device->md_io_page) + device->md_io.page = alloc_page(GFP_KERNEL); + if (!device->md_io.page) goto out_no_io_page; if (drbd_bm_init(device)) @@ -2794,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig kref_get(&device->kref); 
INIT_LIST_HEAD(&device->peer_devices); + INIT_LIST_HEAD(&device->pending_bitmap_io); for_each_connection(connection, resource) { peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL); if (!peer_device) @@ -2829,7 +2848,10 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig for_each_peer_device(peer_device, device) drbd_connected(peer_device); } - + /* move to create_peer_device() */ + for_each_peer_device(peer_device, device) + drbd_debugfs_peer_device_add(peer_device); + drbd_debugfs_device_add(device); return NO_ERROR; out_idr_remove_vol: @@ -2853,7 +2875,7 @@ out_idr_remove_minor: out_no_minor_idr: drbd_bm_cleanup(device); out_no_bitmap: - __free_page(device->md_io_page); + __free_page(device->md_io.page); out_no_io_page: put_disk(disk); out_no_disk: @@ -2868,8 +2890,13 @@ void drbd_delete_device(struct drbd_device *device) { struct drbd_resource *resource = device->resource; struct drbd_connection *connection; + struct drbd_peer_device *peer_device; int refs = 3; + /* move to free_peer_device() */ + for_each_peer_device(peer_device, device) + drbd_debugfs_peer_device_cleanup(peer_device); + drbd_debugfs_device_cleanup(device); for_each_connection(connection, resource) { idr_remove(&connection->peer_devices, device->vnr); refs++; @@ -2881,13 +2908,12 @@ void drbd_delete_device(struct drbd_device *device) kref_sub(&device->kref, refs, drbd_destroy_device); } -int __init drbd_init(void) +static int __init drbd_init(void) { int err; if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { - printk(KERN_ERR - "drbd: invalid minor_count (%d)\n", minor_count); + pr_err("invalid minor_count (%d)\n", minor_count); #ifdef MODULE return -EINVAL; #else @@ -2897,14 +2923,11 @@ int __init drbd_init(void) err = register_blkdev(DRBD_MAJOR, "drbd"); if (err) { - printk(KERN_ERR - "drbd: unable to register block device major %d\n", + pr_err("unable to register block device major %d\n", DRBD_MAJOR); return err; } - register_reboot_notifier(&drbd_notifier); - /* * allocate all necessary structs */ @@ -2918,7 +2941,7 @@ int __init drbd_init(void) err = drbd_genl_register(); if (err) { - printk(KERN_ERR "drbd: unable to register generic netlink family\n"); + pr_err("unable to register generic netlink family\n"); goto fail; } @@ -2929,38 +2952,39 @@ int __init drbd_init(void) err = -ENOMEM; drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); if (!drbd_proc) { - printk(KERN_ERR "drbd: unable to register proc file\n"); + pr_err("unable to register proc file\n"); goto fail; } retry.wq = create_singlethread_workqueue("drbd-reissue"); if (!retry.wq) { - printk(KERN_ERR "drbd: unable to create retry workqueue\n"); + pr_err("unable to create retry workqueue\n"); goto fail; } INIT_WORK(&retry.worker, do_retry); spin_lock_init(&retry.lock); INIT_LIST_HEAD(&retry.writes); - printk(KERN_INFO "drbd: initialized. " + if (drbd_debugfs_init()) + pr_notice("failed to initialize debugfs -- will not be available\n"); + + pr_info("initialized. " "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); - printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); - printk(KERN_INFO "drbd: registered as block device major %d\n", - DRBD_MAJOR); - + pr_info("%s\n", drbd_buildtag()); + pr_info("registered as block device major %d\n", DRBD_MAJOR); return 0; /* Success! 
*/ fail: drbd_cleanup(); if (err == -ENOMEM) - printk(KERN_ERR "drbd: ran out of memory\n"); + pr_err("ran out of memory\n"); else - printk(KERN_ERR "drbd: initialization failure\n"); + pr_err("initialization failure\n"); return err; } -void drbd_free_bc(struct drbd_backing_dev *ldev) +void drbd_free_ldev(struct drbd_backing_dev *ldev) { if (ldev == NULL) return; @@ -2972,24 +2996,29 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) kfree(ldev); } -void drbd_free_sock(struct drbd_connection *connection) +static void drbd_free_one_sock(struct drbd_socket *ds) { - if (connection->data.socket) { - mutex_lock(&connection->data.mutex); - kernel_sock_shutdown(connection->data.socket, SHUT_RDWR); - sock_release(connection->data.socket); - connection->data.socket = NULL; - mutex_unlock(&connection->data.mutex); - } - if (connection->meta.socket) { - mutex_lock(&connection->meta.mutex); - kernel_sock_shutdown(connection->meta.socket, SHUT_RDWR); - sock_release(connection->meta.socket); - connection->meta.socket = NULL; - mutex_unlock(&connection->meta.mutex); + struct socket *s; + mutex_lock(&ds->mutex); + s = ds->socket; + ds->socket = NULL; + mutex_unlock(&ds->mutex); + if (s) { + /* so debugfs does not need to mutex_lock() */ + synchronize_rcu(); + kernel_sock_shutdown(s, SHUT_RDWR); + sock_release(s); } } +void drbd_free_sock(struct drbd_connection *connection) +{ + if (connection->data.socket) + drbd_free_one_sock(&connection->data); + if (connection->meta.socket) + drbd_free_one_sock(&connection->meta); +} + /* meta data management */ void conn_md_sync(struct drbd_connection *connection) @@ -3093,7 +3122,7 @@ void drbd_md_sync(struct drbd_device *device) if (!get_ldev_if_state(device, D_FAILED)) return; - buffer = drbd_md_get_buffer(device); + buffer = drbd_md_get_buffer(device, __func__); if (!buffer) goto out; @@ -3253,7 +3282,7 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) if (device->state.disk != D_DISKLESS) return ERR_DISK_CONFIGURED; - buffer = drbd_md_get_buffer(device); + buffer = drbd_md_get_buffer(device, __func__); if (!buffer) return ERR_NOMEM; @@ -3466,23 +3495,19 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) * * Sets all bits in the bitmap and writes the whole bitmap to stable storage. */ -int drbd_bmio_set_n_write(struct drbd_device *device) +int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) { int rv = -EIO; - if (get_ldev_if_state(device, D_ATTACHING)) { - drbd_md_set_flag(device, MDF_FULL_SYNC); - drbd_md_sync(device); - drbd_bm_set_all(device); - - rv = drbd_bm_write(device); + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + drbd_bm_set_all(device); - if (!rv) { - drbd_md_clear_flag(device, MDF_FULL_SYNC); - drbd_md_sync(device); - } + rv = drbd_bm_write(device); - put_ldev(device); + if (!rv) { + drbd_md_clear_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); } return rv; @@ -3494,18 +3519,11 @@ int drbd_bmio_set_n_write(struct drbd_device *device) * * Clears all bits in the bitmap and writes the whole bitmap to stable storage. 
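The synchronize_rcu() in drbd_free_one_sock() above exists, per its comment, so that debugfs readers need not take ds->mutex: the free path clears the pointer, waits for in-flight readers, and only then shuts the socket down. A hypothetical reader on the debugfs side would look like:

/* Hypothetical reader matching the synchronize_rcu() above: a show
 * function may peek at the socket without ds->mutex, as long as it
 * stays inside an RCU read-side critical section. */
static void seq_print_socket_state(struct seq_file *m, struct drbd_socket *ds)
{
	struct socket *s;

	rcu_read_lock();
	s = ACCESS_ONCE(ds->socket);	/* cleared before synchronize_rcu() */
	if (s)
		seq_printf(m, "tcp state: %u\n", s->sk->sk_state);
	rcu_read_unlock();
}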
*/ -int drbd_bmio_clear_n_write(struct drbd_device *device) +int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local) { - int rv = -EIO; - drbd_resume_al(device); - if (get_ldev_if_state(device, D_ATTACHING)) { - drbd_bm_clear_all(device); - rv = drbd_bm_write(device); - put_ldev(device); - } - - return rv; + drbd_bm_clear_all(device); + return drbd_bm_write(device); } static int w_bitmap_io(struct drbd_work *w, int unused) @@ -3537,61 +3555,6 @@ static int w_bitmap_io(struct drbd_work *w, int unused) return 0; } -void drbd_ldev_destroy(struct drbd_device *device) -{ - lc_destroy(device->resync); - device->resync = NULL; - lc_destroy(device->act_log); - device->act_log = NULL; - __no_warn(local, - drbd_free_bc(device->ldev); - device->ldev = NULL;); - - clear_bit(GO_DISKLESS, &device->flags); -} - -static int w_go_diskless(struct drbd_work *w, int unused) -{ - struct drbd_device *device = - container_of(w, struct drbd_device, go_diskless); - - D_ASSERT(device, device->state.disk == D_FAILED); - /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will - * inc/dec it frequently. Once we are D_DISKLESS, no one will touch - * the protected members anymore, though, so once put_ldev reaches zero - * again, it will be safe to free them. */ - - /* Try to write changed bitmap pages, read errors may have just - * set some bits outside the area covered by the activity log. - * - * If we have an IO error during the bitmap writeout, - * we will want a full sync next time, just in case. - * (Do we want a specific meta data flag for this?) - * - * If that does not make it to stable storage either, - * we cannot do anything about that anymore. - * - * We still need to check if both bitmap and ldev are present, we may - * end up here after a failed attach, before ldev was even assigned. - */ - if (device->bitmap && device->ldev) { - /* An interrupted resync or similar is allowed to recounts bits - * while we detach. - * Any modifications would not be expected anymore, though. - */ - if (drbd_bitmap_io_from_worker(device, drbd_bm_write, - "detach", BM_LOCKED_TEST_ALLOWED)) { - if (test_bit(WAS_READ_ERROR, &device->flags)) { - drbd_md_set_flag(device, MDF_FULL_SYNC); - drbd_md_sync(device); - } - } - } - - drbd_force_state(device, NS(disk, D_DISKLESS)); - return 0; -} - /** * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap * @device: DRBD device. @@ -3603,6 +3566,9 @@ static int w_go_diskless(struct drbd_work *w, int unused) * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be * called from worker context. It MUST NOT be used while a previous such * work is still pending! + * + * Its worker function encloses the call of io_fn() by get_ldev() and + * put_ldev(). */ void drbd_queue_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), @@ -3685,25 +3651,7 @@ int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) static void md_sync_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; - - /* must not double-queue! */ - if (list_empty(&device->md_sync_work.list)) - drbd_queue_work_front(&first_peer_device(device)->connection->sender_work, - &device->md_sync_work); -} - -static int w_md_sync(struct drbd_work *w, int unused) -{ - struct drbd_device *device = - container_of(w, struct drbd_device, md_sync_work); - - drbd_warn(device, "md_sync_timer expired! 
Worker calls drbd_md_sync().\n"); -#ifdef DEBUG - drbd_warn(device, "last md_mark_dirty: %s:%u\n", - device->last_md_mark_dirty.func, device->last_md_mark_dirty.line); -#endif - drbd_md_sync(device); - return 0; + drbd_device_post_work(device, MD_SYNC); } const char *cmdname(enum drbd_packet cmd) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 3f2e1673808..1cd47df44bd 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -23,6 +23,8 @@ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/drbd.h> #include <linux/in.h> @@ -85,7 +87,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) { genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); if (genlmsg_reply(skb, info)) - printk(KERN_ERR "drbd: error sending genl reply\n"); + pr_err("error sending genl reply\n"); } /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only @@ -558,8 +560,10 @@ void conn_try_outdate_peer_async(struct drbd_connection *connection) } enum drbd_state_rv -drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) +drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force) { + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; const int max_tries = 4; enum drbd_state_rv rv = SS_UNKNOWN_ERROR; struct net_conf *nc; @@ -607,7 +611,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) device->state.disk == D_CONSISTENT && mask.pdsk == 0) { D_ASSERT(device, device->state.pdsk == D_UNKNOWN); - if (conn_try_outdate_peer(first_peer_device(device)->connection)) { + if (conn_try_outdate_peer(connection)) { val.disk = D_UP_TO_DATE; mask.disk = D_MASK; } @@ -617,7 +621,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) if (rv == SS_NOTHING_TO_DO) goto out; if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { - if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) { + if (!conn_try_outdate_peer(connection) && force) { drbd_warn(device, "Forced into split brain situation!\n"); mask.pdsk = D_MASK; val.pdsk = D_OUTDATED; @@ -630,7 +634,7 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) retry at most once more in this case. */ int timeo; rcu_read_lock(); - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + nc = rcu_dereference(connection->net_conf); timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; rcu_read_unlock(); schedule_timeout_interruptible(timeo); @@ -659,19 +663,17 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) /* FIXME also wait for all pending P_BARRIER_ACK? */ if (new_role == R_SECONDARY) { - set_disk_ro(device->vdisk, true); if (get_ldev(device)) { device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; put_ldev(device); } } else { - /* Called from drbd_adm_set_role only. - * We are still holding the conf_update mutex. 
*/ - nc = first_peer_device(device)->connection->net_conf; + mutex_lock(&device->resource->conf_update); + nc = connection->net_conf; if (nc) nc->discard_my_data = 0; /* without copy; single bit op is atomic */ + mutex_unlock(&device->resource->conf_update); - set_disk_ro(device->vdisk, false); if (get_ldev(device)) { if (((device->state.conn < C_CONNECTED || device->state.pdsk <= D_FAILED) @@ -689,12 +691,12 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force) if (device->state.conn >= C_WF_REPORT_PARAMS) { /* if this was forced, we should consider sync */ if (forced) - drbd_send_uuids(first_peer_device(device)); - drbd_send_current_state(first_peer_device(device)); + drbd_send_uuids(peer_device); + drbd_send_current_state(peer_device); } drbd_md_sync(device); - + set_disk_ro(device->vdisk, new_role == R_SECONDARY); kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); out: mutex_unlock(device->state_mutex); @@ -891,7 +893,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct * still lock the act_log to not trigger ASSERTs there. */ drbd_suspend_io(device); - buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */ + buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ if (!buffer) { drbd_resume_io(device); return DS_ERROR; @@ -971,6 +973,10 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct if (la_size_changed || md_moved || rs) { u32 prev_flags; + /* We do some synchronous IO below, which may take some time. + * Clear the timer, to avoid scary "timer expired!" messages, + * "Superblock" is written out at least twice below, anyways. */ + del_timer(&device->md_sync_timer); drbd_al_shrink(device); /* All extents inactive. */ prev_flags = md->flags; @@ -1116,15 +1122,16 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) return 0; } -static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size) +static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, + unsigned int max_bio_size) { struct request_queue * const q = device->rq_queue; unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; struct request_queue *b = NULL; - if (get_ldev_if_state(device, D_ATTACHING)) { - b = device->ldev->backing_bdev->bd_disk->queue; + if (bdev) { + b = bdev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); @@ -1169,11 +1176,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_ b->backing_dev_info.ra_pages); q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; } - put_ldev(device); } } -void drbd_reconsider_max_bio_size(struct drbd_device *device) +void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) { unsigned int now, new, local, peer; @@ -1181,10 +1187,9 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device) local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ - if (get_ldev_if_state(device, D_ATTACHING)) { - local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; + if (bdev) { + local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; device->local_max_bio_size = local; - put_ldev(device); } local = min(local, DRBD_MAX_BIO_SIZE); @@ -1217,7 +1222,7 @@ void 
drbd_reconsider_max_bio_size(struct drbd_device *device) if (new != now) drbd_info(device, "max BIO size = %u\n", new); - drbd_setup_queue_param(device, new); + drbd_setup_queue_param(device, bdev, new); } /* Starts the worker thread */ @@ -1299,6 +1304,13 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; } +static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) +{ + return a->disk_barrier != b->disk_barrier || + a->disk_flushes != b->disk_flushes || + a->disk_drain != b->disk_drain; +} + int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1405,7 +1417,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) else set_bit(MD_NO_FUA, &device->flags); - drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush); + if (write_ordering_changed(old_disk_conf, new_disk_conf)) + drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); drbd_md_sync(device); @@ -1440,6 +1453,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct drbd_connection *connection; int err; enum drbd_ret_code retcode; enum determine_dev_size dd; @@ -1462,7 +1477,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device = adm_ctx.device; mutex_lock(&adm_ctx.resource->adm_mutex); - conn_reconfig_start(first_peer_device(device)->connection); + peer_device = first_peer_device(device); + connection = peer_device ? peer_device->connection : NULL; + conn_reconfig_start(connection); /* if you want to reconfigure, please tear down first */ if (device->state.disk > D_DISKLESS) { @@ -1473,7 +1490,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) * drbd_ldev_destroy is done already, we may end up here very fast, * e.g. if someone calls attach from the on-io-error handler, * to realize a "hot spare" feature (not that I'd recommend that) */ - wait_event(device->misc_wait, !atomic_read(&device->local_cnt)); + wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags)); /* make sure there is no leftover from previous force-detach attempts */ clear_bit(FORCE_DETACH, &device->flags); @@ -1529,7 +1546,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; rcu_read_lock(); - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + nc = rcu_dereference(connection->net_conf); if (nc) { if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { rcu_read_unlock(); @@ -1649,7 +1666,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) */ wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device)); /* and for any other previously queued work */ - drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); + drbd_flush_workqueue(&connection->sender_work); rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE); retcode = rv; /* FIXME: Type mismatch. 
*/ @@ -1710,7 +1727,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) new_disk_conf = NULL; new_plan = NULL; - drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush); + drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &device->flags); @@ -1726,7 +1743,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device->read_cnt = 0; device->writ_cnt = 0; - drbd_reconsider_max_bio_size(device); + drbd_reconsider_max_bio_size(device, device->ldev); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, @@ -1845,7 +1862,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); put_ldev(device); - conn_reconfig_done(first_peer_device(device)->connection); + conn_reconfig_done(connection); mutex_unlock(&adm_ctx.resource->adm_mutex); drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -1856,7 +1873,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_force_state(device, NS(disk, D_DISKLESS)); drbd_md_sync(device); fail: - conn_reconfig_done(first_peer_device(device)->connection); + conn_reconfig_done(connection); if (nbc) { if (nbc->backing_bdev) blkdev_put(nbc->backing_bdev, @@ -1888,7 +1905,7 @@ static int adm_detach(struct drbd_device *device, int force) } drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ - drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */ + drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ retcode = drbd_request_state(device, NS(disk, D_FAILED)); drbd_md_put_buffer(device); /* D_FAILED will transition to DISKLESS. */ @@ -2654,8 +2671,13 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. @@ -2679,6 +2701,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(device); mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -2704,7 +2727,7 @@ out: return 0; } -static int drbd_bmio_set_susp_al(struct drbd_device *device) +static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local) { int rv; @@ -2725,8 +2748,13 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. 
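Both invalidate handlers above now pin the backing device with get_ldev() before taking the admin mutex, and drop that reference only after IO has been resumed. A minimal sketch of the pattern, assuming the usual DRBD helpers (get_ldev()/put_ldev(), NO_ERROR, ERR_NO_DISK, all visible in the hunks above) and a hypothetical do_admin_work() standing in for the elided handler body:

	/* Sketch only, not the actual handler: pin the local disk across
	 * an administrative request so it cannot detach under us. */
	static int adm_do_with_ldev(struct drbd_device *device)
	{
		int retcode;

		if (!get_ldev(device))	/* fails if the disk is detached or detaching */
			return ERR_NO_DISK;

		retcode = do_admin_work(device);	/* hypothetical placeholder */

		put_ldev(device);	/* may drop the last reference */
		return retcode;
	}
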
@@ -2753,6 +2781,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(device); mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; @@ -2892,7 +2921,7 @@ static struct drbd_connection *the_only_connection(struct drbd_resource *resourc return list_first_entry(&resource->connections, struct drbd_connection, connections); } -int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, +static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, const struct sib_info *sib) { struct drbd_resource *resource = device->resource; @@ -3622,13 +3651,6 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) unsigned seq; int err = -ENOMEM; - if (sib->sib_reason == SIB_SYNC_PROGRESS) { - if (time_after(jiffies, device->rs_last_bcast + HZ)) - device->rs_last_bcast = jiffies; - else - return; - } - seq = atomic_inc_return(&drbd_genl_seq); msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); if (!msg) diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 89736bdbbc7..06e6147c760 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -60,20 +60,65 @@ static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) seq_printf(seq, "%ld", v); } +static void drbd_get_syncer_progress(struct drbd_device *device, + union drbd_dev_state state, unsigned long *rs_total, + unsigned long *bits_left, unsigned int *per_mil_done) +{ + /* this is to break it at compile time when we change that, in case we + * want to support more than (1<<32) bits on a 32bit arch. */ + typecheck(unsigned long, device->rs_total); + *rs_total = device->rs_total; + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) + *bits_left = device->ov_left; + else + *bits_left = drbd_bm_total_weight(device) - device->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (*bits_left > *rs_total) { + /* D'oh. Maybe a logic bug somewhere. More likely just a race + * between state change and reset of rs_total. + */ + *bits_left = *rs_total; + *per_mil_done = *rs_total ? 0 : 1000; + } else { + /* Make sure the division happens in long context. + * We allow up to one petabyte storage right now, + * at a granularity of 4k per bit that is 2**38 bits. + * After shift right and multiplication by 1000, + * this should still fit easily into a 32bit long, + * so we don't need a 64bit division on 32bit arch. + * Note: currently we don't support such large bitmaps on 32bit + * arch anyways, but no harm done to be prepared for it here. + */ + unsigned int shift = *rs_total > UINT_MAX ? 16 : 10; + unsigned long left = *bits_left >> shift; + unsigned long total = 1UL + (*rs_total >> shift); + unsigned long tmp = 1000UL - left * 1000UL/total; + *per_mil_done = tmp; + } +} + + /*lge * progress bars shamelessly adapted from drivers/md/md.c * output looks like * [=====>..............]
33.5% (23456/123456) * finish: 2:20:20 speed: 6,345 (6,456) K/sec */ -static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq) +static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq, + union drbd_dev_state state) { - unsigned long db, dt, dbdt, rt, rs_left; + unsigned long db, dt, dbdt, rt, rs_total, rs_left; unsigned int res; int i, x, y; int stalled = 0; - drbd_get_syncer_progress(device, &rs_left, &res); + drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res); x = res/50; y = 20-x; @@ -85,21 +130,21 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se seq_printf(seq, "."); seq_printf(seq, "] "); - if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) seq_printf(seq, "verified:"); else seq_printf(seq, "sync'ed:"); seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); /* if more than a few GB, display in MB */ - if (device->rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) + if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) seq_printf(seq, "(%lu/%lu)M", (unsigned long) Bit2KB(rs_left >> 10), - (unsigned long) Bit2KB(device->rs_total >> 10)); + (unsigned long) Bit2KB(rs_total >> 10)); else seq_printf(seq, "(%lu/%lu)K\n\t", (unsigned long) Bit2KB(rs_left), - (unsigned long) Bit2KB(device->rs_total)); + (unsigned long) Bit2KB(rs_total)); /* see drivers/md/md.c * We do not want to overflow, so the order of operands and @@ -150,13 +195,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se dt = (jiffies - device->rs_start - device->rs_paused) / HZ; if (dt == 0) dt = 1; - db = device->rs_total - rs_left; + db = rs_total - rs_left; dbdt = Bit2KB(db/dt); seq_printf_with_thousands_grouping(seq, dbdt); seq_printf(seq, ")"); - if (device->state.conn == C_SYNC_TARGET || - device->state.conn == C_VERIFY_S) { + if (state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S) { seq_printf(seq, " want: "); seq_printf_with_thousands_grouping(seq, device->c_sync_rate); } @@ -168,8 +213,8 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se unsigned long bm_bits = drbd_bm_bits(device); unsigned long bit_pos; unsigned long long stop_sector = 0; - if (device->state.conn == C_VERIFY_S || - device->state.conn == C_VERIFY_T) { + if (state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) { bit_pos = bm_bits - device->ov_left; if (verify_can_do_stop_sector(device)) stop_sector = device->ov_stop_sector; @@ -188,22 +233,13 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se } } -static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) -{ - struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); - - seq_printf(seq, "%5d %s %s\n", bme->rs_left, - bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", - bme->flags & BME_LOCKED ? 
"LOCKED" : "------" - ); -} - static int drbd_seq_show(struct seq_file *seq, void *v) { int i, prev_i = -1; const char *sn; struct drbd_device *device; struct net_conf *nc; + union drbd_dev_state state; char wp; static char write_ordering_chars[] = { @@ -241,11 +277,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); prev_i = i; - sn = drbd_conn_str(device->state.conn); + state = device->state; + sn = drbd_conn_str(state.conn); - if (device->state.conn == C_STANDALONE && - device->state.disk == D_DISKLESS && - device->state.role == R_SECONDARY) { + if (state.conn == C_STANDALONE && + state.disk == D_DISKLESS && + state.role == R_SECONDARY) { seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { /* reset device->congestion_reason */ @@ -258,15 +295,15 @@ static int drbd_seq_show(struct seq_file *seq, void *v) " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", i, sn, - drbd_role_str(device->state.role), - drbd_role_str(device->state.peer), - drbd_disk_str(device->state.disk), - drbd_disk_str(device->state.pdsk), + drbd_role_str(state.role), + drbd_role_str(state.peer), + drbd_disk_str(state.disk), + drbd_disk_str(state.pdsk), wp, drbd_suspended(device) ? 's' : 'r', - device->state.aftr_isp ? 'a' : '-', - device->state.peer_isp ? 'p' : '-', - device->state.user_isp ? 'u' : '-', + state.aftr_isp ? 'a' : '-', + state.peer_isp ? 'p' : '-', + state.user_isp ? 'u' : '-', device->congestion_reason ?: '-', test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-', device->send_cnt/2, @@ -281,17 +318,17 @@ static int drbd_seq_show(struct seq_file *seq, void *v) atomic_read(&device->unacked_cnt), atomic_read(&device->ap_bio_cnt), first_peer_device(device)->connection->epochs, - write_ordering_chars[first_peer_device(device)->connection->write_ordering] + write_ordering_chars[device->resource->write_ordering] ); seq_printf(seq, " oos:%llu\n", Bit2KB((unsigned long long) drbd_bm_total_weight(device))); } - if (device->state.conn == C_SYNC_SOURCE || - device->state.conn == C_SYNC_TARGET || - device->state.conn == C_VERIFY_S || - device->state.conn == C_VERIFY_T) - drbd_syncer_progress(device, seq); + if (state.conn == C_SYNC_SOURCE || + state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) + drbd_syncer_progress(device, seq, state); if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { lc_seq_printf_stats(seq, device->resync); @@ -299,12 +336,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v) put_ldev(device); } - if (proc_details >= 2) { - if (device->resync) { - lc_seq_dump_details(seq, device->resync, "rs_left", - resync_dump_detail); - } - } + if (proc_details >= 2) + seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt)); } rcu_read_unlock(); @@ -316,7 +349,7 @@ static int drbd_proc_open(struct inode *inode, struct file *file) int err; if (try_module_get(THIS_MODULE)) { - err = single_open(file, drbd_seq_show, PDE_DATA(inode)); + err = single_open(file, drbd_seq_show, NULL); if (err) module_put(THIS_MODULE); return err; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 5b17ec88ea0..9342b8da73a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -362,17 +362,14 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto goto fail; } + memset(peer_req, 0, sizeof(*peer_req)); + INIT_LIST_HEAD(&peer_req->w.list); drbd_clear_interval(&peer_req->i); 
peer_req->i.size = data_size; peer_req->i.sector = sector; - peer_req->i.local = false; - peer_req->i.waiting = false; - - peer_req->epoch = NULL; + peer_req->submit_jif = jiffies; peer_req->peer_device = peer_device; peer_req->pages = page; - atomic_set(&peer_req->pending_bios, 0); - peer_req->flags = 0; /* * The block_id is opaque to the receiver. It is not endianness * converted, and sent back to the sender unchanged. @@ -389,11 +386,16 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, int is_net) { + might_sleep(); if (peer_req->flags & EE_HAS_DIGEST) kfree(peer_req->digest); drbd_free_pages(device, peer_req->pages, is_net); D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + drbd_al_complete_io(device, &peer_req->i); + } mempool_free(peer_req, drbd_ee_mempool); } @@ -791,8 +793,18 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke { unsigned int header_size = drbd_header_size(connection); struct packet_info pi; + struct net_conf *nc; int err; + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -EIO; + } + sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10; + rcu_read_unlock(); + err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0); if (err != header_size) { if (err >= 0) @@ -809,7 +821,7 @@ static int receive_first_packet(struct drbd_connection *connection, struct socke * drbd_socket_okay() - Free the socket if its connection is not okay * @sock: pointer to the pointer to the socket. 
*/ -static int drbd_socket_okay(struct socket **sock) +static bool drbd_socket_okay(struct socket **sock) { int rr; char tb[4]; @@ -827,6 +839,30 @@ static int drbd_socket_okay(struct socket **sock) return false; } } + +static bool connection_established(struct drbd_connection *connection, + struct socket **sock1, + struct socket **sock2) +{ + struct net_conf *nc; + int timeout; + bool ok; + + if (!*sock1 || !*sock2) + return false; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10; + rcu_read_unlock(); + schedule_timeout_interruptible(timeout); + + ok = drbd_socket_okay(sock1); + ok = drbd_socket_okay(sock2) && ok; + + return ok; +} + /* Gets called if a connection is established, or if a new minor gets created in a connection */ int drbd_connected(struct drbd_peer_device *peer_device) @@ -868,8 +904,8 @@ static int conn_connect(struct drbd_connection *connection) struct drbd_socket sock, msock; struct drbd_peer_device *peer_device; struct net_conf *nc; - int vnr, timeout, h, ok; - bool discard_my_data; + int vnr, timeout, h; + bool discard_my_data, ok; enum drbd_state_rv rv; struct accept_wait_data ad = { .connection = connection, @@ -913,17 +949,8 @@ static int conn_connect(struct drbd_connection *connection) } } - if (sock.socket && msock.socket) { - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - timeout = nc->ping_timeo * HZ / 10; - rcu_read_unlock(); - schedule_timeout_interruptible(timeout); - ok = drbd_socket_okay(&sock.socket); - ok = drbd_socket_okay(&msock.socket) && ok; - if (ok) - break; - } + if (connection_established(connection, &sock.socket, &msock.socket)) + break; retry: s = drbd_wait_for_connect(connection, &ad); @@ -969,8 +996,7 @@ randomize: goto out_release_sockets; } - ok = drbd_socket_okay(&sock.socket); - ok = drbd_socket_okay(&msock.socket) && ok; + ok = connection_established(connection, &sock.socket, &msock.socket); } while (!ok); if (ad.s_listen) @@ -1151,7 +1177,7 @@ static void drbd_flush(struct drbd_connection *connection) struct drbd_peer_device *peer_device; int vnr; - if (connection->write_ordering >= WO_bdev_flush) { + if (connection->resource->write_ordering >= WO_bdev_flush) { rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; @@ -1161,14 +1187,22 @@ static void drbd_flush(struct drbd_connection *connection) kref_get(&device->kref); rcu_read_unlock(); + /* Right now, we have only this one synchronous code path + * for flushes between request epochs. + * We may want to make those asynchronous, + * or at least parallelize the flushes to the volume devices. + */ + device->flush_jif = jiffies; + set_bit(FLUSH_PENDING, &device->flags); rv = blkdev_issue_flush(device->ldev->backing_bdev, GFP_NOIO, NULL); + clear_bit(FLUSH_PENDING, &device->flags); if (rv) { drbd_info(device, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. 
* don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(connection, WO_drain_io); + drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); } put_ldev(device); kref_put(&device->kref, drbd_destroy_device); @@ -1257,15 +1291,30 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio return rv; } +static enum write_ordering_e +max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) +{ + struct disk_conf *dc; + + dc = rcu_dereference(bdev->disk_conf); + + if (wo == WO_bdev_flush && !dc->disk_flushes) + wo = WO_drain_io; + if (wo == WO_drain_io && !dc->disk_drain) + wo = WO_none; + + return wo; +} + /** * drbd_bump_write_ordering() - Fall back to another write ordering method * @resource: DRBD resource. * @bdev: Backing device whose disk_conf to take into account, may be NULL. * @wo: Write ordering method to try. */ -void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo) +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo) { - struct disk_conf *dc; - struct drbd_peer_device *peer_device; + struct drbd_device *device; enum write_ordering_e pwo; int vnr; static char *write_ordering_str[] = { @@ -1274,26 +1323,27 @@ void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ord [WO_bdev_flush] = "flush", }; - pwo = connection->write_ordering; - wo = min(pwo, wo); + pwo = resource->write_ordering; + if (wo != WO_bdev_flush) + wo = min(pwo, wo); rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; + idr_for_each_entry(&resource->devices, device, vnr) { + if (get_ldev(device)) { + wo = max_allowed_wo(device->ldev, wo); + if (device->ldev == bdev) + bdev = NULL; + put_ldev(device); + } + } - if (!get_ldev_if_state(device, D_ATTACHING)) - continue; - dc = rcu_dereference(device->ldev->disk_conf); + if (bdev) + wo = max_allowed_wo(bdev, wo); - if (wo == WO_bdev_flush && !dc->disk_flushes) - wo = WO_drain_io; - if (wo == WO_drain_io && !dc->disk_drain) - wo = WO_none; - put_ldev(device); - } rcu_read_unlock(); - connection->write_ordering = wo; - if (pwo != connection->write_ordering || wo == WO_bdev_flush) - drbd_info(connection, "Method to ensure write ordering: %s\n", write_ordering_str[connection->write_ordering]); + + resource->write_ordering = wo; + if (pwo != resource->write_ordering || wo == WO_bdev_flush) + drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } /** @@ -1330,6 +1380,13 @@ int drbd_submit_peer_request(struct drbd_device *device, /* wait for all pending IO completions, before we start * zeroing things out.
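	 * (blkdev_issue_zeroout() below submits its own bios to the backing
	 * device, outside the peer-request accounting; any overlapping peer
	 * writes therefore have to be drained first.)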
*/ conn_wait_active_ee_empty(first_peer_device(device)->connection); + /* add it to the active list now, + * so we can find it to present it in debugfs */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); if (blkdev_issue_zeroout(device->ldev->backing_bdev, sector, ds >> 9, GFP_NOIO)) peer_req->flags |= EE_WAS_ERROR; @@ -1398,6 +1455,9 @@ submit: D_ASSERT(device, page == NULL); atomic_set(&peer_req->pending_bios, n_bios); + /* for debugfs: update timestamp, mark as submitted */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; do { bio = bios; bios = bios->bi_next; @@ -1471,7 +1531,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * R_PRIMARY crashes now. * Therefore we must send the barrier_ack after the barrier request was * completed. */ - switch (connection->write_ordering) { + switch (connection->resource->write_ordering) { case WO_none: if (rv == FE_RECYCLED) return 0; @@ -1498,7 +1558,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf return 0; default: - drbd_err(connection, "Strangeness in connection->write_ordering %d\n", connection->write_ordering); + drbd_err(connection, "Strangeness in connection->write_ordering %d\n", + connection->resource->write_ordering); return -EIO; } @@ -1531,7 +1592,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, struct drbd_peer_request *peer_req; struct page *page; int dgs, ds, err; - int data_size = pi->size; + unsigned int data_size = pi->size; void *dig_in = peer_device->connection->int_dig_in; void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; @@ -1578,6 +1639,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!peer_req) return NULL; + peer_req->flags |= EE_WRITE; if (trim) return peer_req; @@ -1734,9 +1796,10 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto * respective _drbd_clear_done_ee */ peer_req->w.cb = e_end_resync_block; + peer_req->submit_jif = jiffies; spin_lock_irq(&device->resource->req_lock); - list_add(&peer_req->w.list, &device->sync_ee); + list_add_tail(&peer_req->w.list, &device->sync_ee); spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); @@ -1889,6 +1952,7 @@ static int e_end_block(struct drbd_work *w, int cancel) } dec_unacked(device); } + /* we delete from the conflict detection hash _after_ we sent out the * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ if (peer_req->flags & EE_IN_INTERVAL_TREE) { @@ -2115,6 +2179,8 @@ static int handle_write_conflicts(struct drbd_device *device, drbd_for_each_overlap(i, &device->write_requests, sector, size) { if (i == &peer_req->i) continue; + if (i->completed) + continue; if (!i->local) { /* @@ -2147,7 +2213,6 @@ static int handle_write_conflicts(struct drbd_device *device, (unsigned long long)sector, size, superseded ? "local" : "remote"); - inc_unacked(device); peer_req->w.cb = superseded ? 
e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); @@ -2206,6 +2271,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * { struct drbd_peer_device *peer_device; struct drbd_device *device; + struct net_conf *nc; sector_t sector; struct drbd_peer_request *peer_req; struct p_data *p = pi->data; @@ -2245,6 +2311,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * } peer_req->w.cb = e_end_block; + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_APPLICATION; dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(dp_flags); @@ -2271,9 +2339,36 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * spin_unlock(&connection->epoch_lock); rcu_read_lock(); - tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; + nc = rcu_dereference(peer_device->connection->net_conf); + tp = nc->two_primaries; + if (peer_device->connection->agreed_pro_version < 100) { + switch (nc->wire_protocol) { + case DRBD_PROT_C: + dp_flags |= DP_SEND_WRITE_ACK; + break; + case DRBD_PROT_B: + dp_flags |= DP_SEND_RECEIVE_ACK; + break; + } + } rcu_read_unlock(); + + if (dp_flags & DP_SEND_WRITE_ACK) { + peer_req->flags |= EE_SEND_WRITE_ACK; + inc_unacked(device); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + } + + if (dp_flags & DP_SEND_RECEIVE_ACK) { + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + } + if (tp) { + /* two primaries implies protocol C */ + D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK); peer_req->flags |= EE_IN_INTERVAL_TREE; err = wait_for_and_update_peer_seq(peer_device, peer_seq); if (err) @@ -2297,44 +2392,18 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. 
*/ if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) - list_add(&peer_req->w.list, &device->active_ee); + list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); if (device->state.conn == C_SYNC_TARGET) wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req)); - if (peer_device->connection->agreed_pro_version < 100) { - rcu_read_lock(); - switch (rcu_dereference(peer_device->connection->net_conf)->wire_protocol) { - case DRBD_PROT_C: - dp_flags |= DP_SEND_WRITE_ACK; - break; - case DRBD_PROT_B: - dp_flags |= DP_SEND_RECEIVE_ACK; - break; - } - rcu_read_unlock(); - } - - if (dp_flags & DP_SEND_WRITE_ACK) { - peer_req->flags |= EE_SEND_WRITE_ACK; - inc_unacked(device); - /* corresponding dec_unacked() in e_end_block() - * respective _drbd_clear_done_ee */ - } - - if (dp_flags & DP_SEND_RECEIVE_ACK) { - /* I really don't like it that the receiver thread - * sends on the msock, but anyways */ - drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); - } - if (device->state.pdsk < D_INCONSISTENT) { /* In case we have the only disk of the cluster, */ drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); - peer_req->flags |= EE_CALL_AL_COMPLETE_IO; peer_req->flags &= ~EE_MAY_SET_IN_SYNC; - drbd_al_begin_io(device, &peer_req->i, true); + drbd_al_begin_io(device, &peer_req->i); + peer_req->flags |= EE_CALL_AL_COMPLETE_IO; } err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR); @@ -2347,8 +2416,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * list_del(&peer_req->w.list); drbd_remove_epoch_entry_interval(device, peer_req); spin_unlock_irq(&device->resource->req_lock); - if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) + if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; drbd_al_complete_io(device, &peer_req->i); + } out_interrupted: drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); @@ -2368,13 +2439,14 @@ out_interrupted: * The current sync rate used here uses only the most recent two step marks, * to have a short time average so we can react faster. */ -bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting) { struct lc_element *tmp; - bool throttle = true; + bool throttle = drbd_rs_c_min_rate_throttle(device); - if (!drbd_rs_c_min_rate_throttle(device)) - return false; + if (!throttle || throttle_if_app_is_waiting) + return throttle; spin_lock_irq(&device->al_lock); tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); @@ -2382,7 +2454,8 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); if (test_bit(BME_PRIORITY, &bm_ext->flags)) throttle = false; - /* Do not slow down if app IO is already waiting for this extent */ + /* Do not slow down if app IO is already waiting for this extent, + * and our progress is necessary for application IO to complete. 
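	 * (BME_PRIORITY is set on a resync extent once application IO starts
	 * waiting for it; from then on resync progress is what unblocks the
	 * application, so throttling it would be counter-productive.)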
*/ } spin_unlock_irq(&device->al_lock); @@ -2407,7 +2480,9 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + (int)part_stat_read(&disk->part0, sectors[1]) - atomic_read(&device->rs_sect_ev); - if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { + + if (atomic_read(&device->ap_actlog_cnt) + || !device->rs_last_events || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; @@ -2508,6 +2583,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet peer_req->w.cb = w_e_end_data_req; fault_type = DRBD_FAULT_DT_RD; /* application IO, don't drbd_rs_begin_io */ + peer_req->flags |= EE_APPLICATION; goto submit; case P_RS_DATA_REQUEST: @@ -2538,6 +2614,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet peer_req->w.cb = w_e_end_csum_rs_req; /* used in the sector offset progress display */ device->bm_resync_fo = BM_SECT_TO_BIT(sector); + /* remember to report stats in drbd_resync_finished */ + device->use_csums = true; } else if (pi->cmd == P_OV_REPLY) { /* track progress, we may need to throttle */ atomic_add(size >> 9, &device->rs_sect_in); @@ -2595,8 +2673,20 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet * we would also throttle its application reads. * In that case, throttling is done on the SyncTarget only. */ - if (device->state.peer != R_PRIMARY && drbd_rs_should_slow_down(device, sector)) + + /* Even though this may be a resync request, we do add to "read_ee"; + * "sync_ee" is only used for resync WRITEs. + * Add to list early, so debugfs can find this request + * even if we have to sleep below. */ + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + update_receiver_timing_details(connection, drbd_rs_should_slow_down); + if (device->state.peer != R_PRIMARY + && drbd_rs_should_slow_down(device, sector, false)) schedule_timeout_uninterruptible(HZ/10); + update_receiver_timing_details(connection, drbd_rs_begin_io); if (drbd_rs_begin_io(device, sector)) goto out_free_e; @@ -2604,22 +2694,20 @@ submit_for_resync: atomic_add(size >> 9, &device->rs_sect_ev); submit: + update_receiver_timing_details(connection, drbd_submit_peer_request); inc_unacked(device); - spin_lock_irq(&device->resource->req_lock); - list_add_tail(&peer_req->w.list, &device->read_ee); - spin_unlock_irq(&device->resource->req_lock); - if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0) return 0; /* don't care for the reason here */ drbd_err(device, "submit failed, triggering re-connect\n"); + +out_free_e: spin_lock_irq(&device->resource->req_lock); list_del(&peer_req->w.list); spin_unlock_irq(&device->resource->req_lock); /* no drbd_rs_complete_io(), we are dropping the connection anyways */ -out_free_e: put_ldev(device); drbd_free_peer_req(device, peer_req); return -EIO; @@ -2842,8 +2930,10 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, -1091 requires proto 91 -1096 requires proto 96 */ -static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_hold(local) +static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) { + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? 
peer_device->connection : NULL; u64 self, peer; int i, j; @@ -2869,7 +2959,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) { - if (first_peer_device(device)->connection->agreed_pro_version < 91) + if (connection->agreed_pro_version < 91) return -1091; if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) && @@ -2892,7 +2982,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) { - if (first_peer_device(device)->connection->agreed_pro_version < 91) + if (connection->agreed_pro_version < 91) return -1091; if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) && @@ -2925,7 +3015,7 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho case 1: /* self_pri && !peer_pri */ return 1; case 2: /* !self_pri && peer_pri */ return -1; case 3: /* self_pri && peer_pri */ - dc = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags); + dc = test_bit(RESOLVE_CONFLICTS, &connection->flags); return dc ? -1 : 1; } } @@ -2938,14 +3028,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho *rule_nr = 51; peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1); if (self == peer) { - if (first_peer_device(device)->connection->agreed_pro_version < 96 ? + if (connection->agreed_pro_version < 96 ? (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) { /* The last P_SYNC_UUID did not get through. Undo the last start of resync as sync source modifications of the peer's UUIDs. */ - if (first_peer_device(device)->connection->agreed_pro_version < 91) + if (connection->agreed_pro_version < 91) return -1091; device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START]; @@ -2975,14 +3065,14 @@ static int drbd_uuid_compare(struct drbd_device *device, int *rule_nr) __must_ho *rule_nr = 71; self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); if (self == peer) { - if (first_peer_device(device)->connection->agreed_pro_version < 96 ? + if (connection->agreed_pro_version < 96 ? (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) : self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { /* The last P_SYNC_UUID did not get through. Undo the last start of resync as sync source modifications of our UUIDs. */ - if (first_peer_device(device)->connection->agreed_pro_version < 91) + if (connection->agreed_pro_version < 91) return -1091; __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]); @@ -3352,8 +3442,7 @@ disconnect: * return: NULL (alg name was "") * ERR_PTR(error) if something goes wrong * or the crypto hash ptr, if it worked out ok.
*/ -static -struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, +static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, const char *alg, const char *name) { struct crypto_hash *tfm; @@ -3639,7 +3728,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info struct drbd_device *device; struct p_sizes *p = pi->data; enum determine_dev_size dd = DS_UNCHANGED; - sector_t p_size, p_usize, my_usize; + sector_t p_size, p_usize, p_csize, my_usize; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; @@ -3650,6 +3739,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); + p_csize = be64_to_cpu(p->c_size); /* just store the peer's disk size for now. * we still need to figure out whether we accept that. */ @@ -3710,7 +3800,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info } device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); - drbd_reconsider_max_bio_size(device); /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). In case we cleared the QUEUE_FLAG_DISCARD from our queue in drbd_reconsider_max_bio_size(), we can be sure that after @@ -3718,14 +3807,28 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(device)) { + drbd_reconsider_max_bio_size(device, device->ldev); dd = drbd_determine_dev_size(device, ddsf, NULL); put_ldev(device); if (dd == DS_ERROR) return -EIO; drbd_md_sync(device); } else { - /* I am diskless, need to accept the peer's size. */ - drbd_set_my_capacity(device, p_size); + /* + * I am diskless, need to accept the peer's *current* size. + * I must NOT accept the peers backing disk size, + * it may have been larger than mine all along... + * + * At this point, the peer knows more about my disk, or at + * least about what we last agreed upon, than myself. + * So if his c_size is less than his d_size, the most likely + * reason is that *my* d_size was smaller last time we checked. + * + * However, if he sends a zero current size, + * take his (user-capped or) backing disk size anyways. 
+ */ + drbd_reconsider_max_bio_size(device, NULL); + drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); } if (get_ldev(device)) { @@ -4501,6 +4604,7 @@ static void drbdd(struct drbd_connection *connection) struct data_cmd *cmd; drbd_thread_current_set_cpu(&connection->receiver); + update_receiver_timing_details(connection, drbd_recv_header); if (drbd_recv_header(connection, &pi)) goto err_out; @@ -4519,12 +4623,14 @@ static void drbdd(struct drbd_connection *connection) } if (shs) { + update_receiver_timing_details(connection, drbd_recv_all_warn); err = drbd_recv_all_warn(connection, pi.data, shs); if (err) goto err_out; pi.size -= shs; } + update_receiver_timing_details(connection, cmd->fn); err = cmd->fn(connection, &pi); if (err) { drbd_err(connection, "error receiving %s, e: %d l: %d!\n", diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 09803d0d520..c67717d572d 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -52,7 +52,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) { int rw = bio_data_dir(req->master_bio); - unsigned long duration = jiffies - req->start_time; + unsigned long duration = jiffies - req->start_jif; int cpu; cpu = part_stat_lock(); part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration); @@ -66,7 +66,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, { struct drbd_request *req; - req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + req = mempool_alloc(drbd_request_mempool, GFP_NOIO | __GFP_ZERO); if (!req) return NULL; @@ -84,6 +84,8 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, INIT_LIST_HEAD(&req->tl_requests); INIT_LIST_HEAD(&req->w.list); + INIT_LIST_HEAD(&req->req_pending_master_completion); + INIT_LIST_HEAD(&req->req_pending_local); /* one reference to be put by __drbd_make_request */ atomic_set(&req->completion_ref, 1); @@ -92,6 +94,19 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, return req; } +static void drbd_remove_request_interval(struct rb_root *root, + struct drbd_request *req) +{ + struct drbd_device *device = req->device; + struct drbd_interval *i = &req->i; + + drbd_remove_interval(root, i); + + /* Wake up any processes waiting for this request to complete. */ + if (i->waiting) + wake_up(&device->misc_wait); +} + void drbd_req_destroy(struct kref *kref) { struct drbd_request *req = container_of(kref, struct drbd_request, kref); @@ -107,14 +122,30 @@ void drbd_req_destroy(struct kref *kref) return; } - /* remove it from the transfer log. - * well, only if it had been there in the first - * place... if it had not (local only or conflicting - * and never sent), it should still be "empty" as - * initialized in drbd_req_new(), so we can list_del() it - * here unconditionally */ + /* If called from mod_rq_state (expected normal case) or + * drbd_send_and_submit (the less likely normal path), this holds the + * req_lock, and req->tl_requests will typically be on ->transfer_log, + * though it may still be empty (never added to the transfer log). + * + * If called from do_retry(), we do NOT hold the req_lock, but we are + * still allowed to unconditionally list_del(&req->tl_requests), + * because it will be on a local on-stack list only. */ list_del_init(&req->tl_requests); /* finally remove the request from the conflict detection * respective block_id verification interval tree.
*/ + if (!drbd_interval_empty(&req->i)) { + struct rb_root *root; + + if (s & RQ_WRITE) + root = &device->write_requests; + else + root = &device->read_requests; + drbd_remove_request_interval(root, req); + } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0) + drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n", + s, (unsigned long long)req->i.sector, req->i.size); + /* if it was a write, we may have to set the corresponding * bit(s) out-of-sync first. If it had a local part, we need to * release the reference to the activity log. */ @@ -188,19 +219,6 @@ void complete_master_bio(struct drbd_device *device, } -static void drbd_remove_request_interval(struct rb_root *root, - struct drbd_request *req) -{ - struct drbd_device *device = req->device; - struct drbd_interval *i = &req->i; - - drbd_remove_interval(root, i); - - /* Wake up any processes waiting for this request to complete. */ - if (i->waiting) - wake_up(&device->misc_wait); -} - /* Helper for __req_mod(). * Set m->bio to the master bio, if it is fit to be completed, * or leave it alone (it is initialized to NULL in __req_mod), @@ -254,18 +272,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); error = PTR_ERR(req->private_bio); - /* remove the request from the conflict detection - * respective block_id verification hash */ - if (!drbd_interval_empty(&req->i)) { - struct rb_root *root; - - if (rw == WRITE) - root = &device->write_requests; - else - root = &device->read_requests; - drbd_remove_request_interval(root, req); - } - /* Before we can signal completion to the upper layers, * we may need to close the current transfer log epoch. * We are within the request lock, so we can simply compare @@ -301,9 +307,24 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) m->error = ok ? 0 : (error ?: -EIO); m->bio = req->master_bio; req->master_bio = NULL; + /* We leave it in the tree, to be able to verify later + * write-acks in protocol != C during resync. + * But we mark it as "complete", so it won't be counted as + * conflict in a multi-primary setup. */ + req->i.completed = true; } + + if (req->i.waiting) + wake_up(&device->misc_wait); + + /* Either we are about to complete to upper layers, + * or we will restart this request. + * In either case, the request object will be destroyed soon, + * so better remove it from all lists. */ + list_del_init(&req->req_pending_master_completion); } +/* still holds resource->req_lock */ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) { struct drbd_device *device = req->device; @@ -324,12 +345,91 @@ static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_ return 1; } +static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next == NULL) + connection->req_next = req; +} + +static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if (s & RQ_NET_QUEUED) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_next = req; +} + +static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending == NULL) + connection->req_ack_pending = req; +} + +static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_ack_pending = req; +} + +static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done == NULL) + connection->req_not_net_done = req; +} + +static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_not_net_done = req; +} + /* I'd like this to be the only place that manipulates * req->completion_ref and req->kref. 
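	 * (The two counters serve different purposes: completion_ref gates
	 * completion of the master bio towards upper layers, while kref
	 * protects the request object itself; c_put and k_put below
	 * accumulate the respective puts.)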
*/ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, int clear, int set) { struct drbd_device *device = req->device; + struct drbd_peer_device *peer_device = first_peer_device(device); unsigned s = req->rq_state; int c_put = 0; int k_put = 0; @@ -356,14 +456,23 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, atomic_inc(&req->completion_ref); } - if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) + if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) { atomic_inc(&req->completion_ref); + set_if_null_req_next(peer_device, req); + } if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) kref_get(&req->kref); /* wait for the DONE */ - if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) - atomic_add(req->i.size >> 9, &device->ap_in_flight); + if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { + /* potentially already completed in the asender thread */ + if (!(s & RQ_NET_DONE)) { + atomic_add(req->i.size >> 9, &device->ap_in_flight); + set_if_null_req_not_net_done(peer_device, req); + } + if (s & RQ_NET_PENDING) + set_if_null_req_ack_pending(peer_device, req); + } if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) atomic_inc(&req->completion_ref); @@ -386,20 +495,34 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, ++k_put; else ++c_put; + list_del_init(&req->req_pending_local); } if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { dec_ap_pending(device); ++c_put; + req->acked_jif = jiffies; + advance_conn_req_ack_pending(peer_device, req); } - if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) + if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) { ++c_put; + advance_conn_req_next(peer_device, req); + } - if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { - if (req->rq_state & RQ_NET_SENT) + if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { + if (s & RQ_NET_SENT) atomic_sub(req->i.size >> 9, &device->ap_in_flight); - ++k_put; + if (s & RQ_EXP_BARR_ACK) + ++k_put; + req->net_done_jif = jiffies; + + /* in ahead/behind mode, or just in case, + * before we finally destroy this request, + * the caching pointers must not reference it anymore */ + advance_conn_req_next(peer_device, req); + advance_conn_req_ack_pending(peer_device, req); + advance_conn_req_not_net_done(peer_device, req); } /* potentially complete and destroy */ @@ -439,6 +562,19 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request bdevname(device->ldev->backing_bdev, b)); } +/* Helper for HANDED_OVER_TO_NETWORK. + * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)? + * Is it also still "PENDING"? + * --> If so, clear PENDING and set NET_OK below. 
+ * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster + * (and we must not set RQ_NET_OK) */ +static inline bool is_pending_write_protocol_A(struct drbd_request *req) +{ + return (req->rq_state & + (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK)) + == (RQ_WRITE|RQ_NET_PENDING); +} + /* obviously this could be coded as many single functions * instead of one huge switch, * or by putting the code directly in the respective locations @@ -454,7 +590,9 @@ static void drbd_report_io_error(struct drbd_device *device, struct drbd_request int __req_mod(struct drbd_request *req, enum drbd_req_event what, struct bio_and_error *m) { - struct drbd_device *device = req->device; + struct drbd_device *const device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; struct net_conf *nc; int p, rv = 0; @@ -477,7 +615,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, * and from w_read_retry_remote */ D_ASSERT(device, !(req->rq_state & RQ_NET_MASK)); rcu_read_lock(); - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + nc = rcu_dereference(connection->net_conf); p = nc->wire_protocol; rcu_read_unlock(); req->rq_state |= @@ -549,7 +687,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0); mod_rq_state(req, m, 0, RQ_NET_QUEUED); req->w.cb = w_send_read_req; - drbd_queue_work(&first_peer_device(device)->connection->sender_work, + drbd_queue_work(&connection->sender_work, &req->w); break; @@ -585,23 +723,23 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(device, req->rq_state & RQ_NET_PENDING); mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); req->w.cb = w_send_dblock; - drbd_queue_work(&first_peer_device(device)->connection->sender_work, + drbd_queue_work(&connection->sender_work, &req->w); /* close the epoch, in case it outgrew the limit */ rcu_read_lock(); - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + nc = rcu_dereference(connection->net_conf); p = nc->max_epoch_size; rcu_read_unlock(); - if (first_peer_device(device)->connection->current_tle_writes >= p) - start_new_tl_epoch(first_peer_device(device)->connection); + if (connection->current_tle_writes >= p) + start_new_tl_epoch(connection); break; case QUEUE_FOR_SEND_OOS: mod_rq_state(req, m, 0, RQ_NET_QUEUED); req->w.cb = w_send_out_of_sync; - drbd_queue_work(&first_peer_device(device)->connection->sender_work, + drbd_queue_work(&connection->sender_work, &req->w); break; @@ -615,18 +753,16 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, case HANDED_OVER_TO_NETWORK: /* assert something? */ - if (bio_data_dir(req->master_bio) == WRITE && - !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) { + if (is_pending_write_protocol_A(req)) /* this is what is dangerous about protocol A: * pretend it was successfully written on the peer. */ - if (req->rq_state & RQ_NET_PENDING) - mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); - /* else: neg-ack was faster... 
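
is_pending_write_protocol_A() condenses four bit tests into one branch with the usual mask-and-compare idiom: select the interesting bits, then compare against the pattern of bits that must be set. A sketch with illustrative bit positions (the real RQ_* values live in drbd_req.h):

    #define RQ_WRITE            (1u << 0)   /* positions illustrative only */
    #define RQ_NET_PENDING      (1u << 1)
    #define RQ_EXP_WRITE_ACK    (1u << 2)
    #define RQ_EXP_RECEIVE_ACK  (1u << 3)

    static bool pending_write_protocol_a(unsigned int s)
    {
        const unsigned int mask = RQ_WRITE | RQ_NET_PENDING |
                                  RQ_EXP_WRITE_ACK | RQ_EXP_RECEIVE_ACK;
        /* WRITE and NET_PENDING set, both ack-expected bits clear */
        return (s & mask) == (RQ_WRITE | RQ_NET_PENDING);
    }

This is what lets the HANDED_OVER_TO_NETWORK case below collapse the old nested ifs into one mod_rq_state() call per outcome.
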
*/ - /* it is still not yet RQ_NET_DONE until the - * corresponding epoch barrier got acked as well, - * so we know what to dirty on connection loss */ - } - mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); + mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING, + RQ_NET_SENT|RQ_NET_OK); + else + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); + /* It is still not yet RQ_NET_DONE until the + * corresponding epoch barrier got acked as well, + * so we know what to dirty on connection loss. */ break; case OOS_HANDED_TO_NETWORK: @@ -658,12 +794,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, case WRITE_ACKED_BY_PEER_AND_SIS: req->rq_state |= RQ_NET_SIS; case WRITE_ACKED_BY_PEER: - D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); - /* protocol C; successfully written on peer. + /* Normal operation protocol C: successfully written on peer. + * During resync, even in protocol != C, + * we requested an explicit write ack anyways. + * Which means we cannot even assert anything here. * Nothing more to do here. * We want to keep the tl in place for all protocols, to cater * for volatile write-back caches on lower level devices. */ - goto ack_common; case RECV_ACKED_BY_PEER: D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); @@ -671,7 +808,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, * see also notes above in HANDED_OVER_TO_NETWORK about * protocol != C */ ack_common: - D_ASSERT(device, req->rq_state & RQ_NET_PENDING); mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); break; @@ -714,7 +850,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, get_ldev(device); /* always succeeds in this call path */ req->w.cb = w_restart_disk_io; - drbd_queue_work(&first_peer_device(device)->connection->sender_work, + drbd_queue_work(&connection->sender_work, &req->w); break; @@ -736,7 +872,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); if (req->w.cb) { - drbd_queue_work(&first_peer_device(device)->connection->sender_work, + /* w.cb expected to be w_send_dblock, or w_send_read_req */ + drbd_queue_work(&connection->sender_work, &req->w); rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; } /* else: FIXME can this happen? */ @@ -769,7 +906,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case QUEUE_AS_DRBD_BARRIER: - start_new_tl_epoch(first_peer_device(device)->connection); + start_new_tl_epoch(connection); mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); break; }; @@ -886,6 +1023,9 @@ static void maybe_pull_ahead(struct drbd_device *device) connection->agreed_pro_version < 96) return; + if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD) + return; /* nothing to do ... */ + /* If I don't even have good local storage, we can not reasonably try * to pull ahead of the peer. We also need the local reference to make * sure device->act_log is there. @@ -1021,6 +1161,7 @@ drbd_submit_req_private_bio(struct drbd_request *req) * stable storage, and this is a WRITE, we may not even submit * this bio. */ if (get_ldev(device)) { + req->pre_submit_jif = jiffies; if (drbd_insert_fault(device, rw == WRITE ? DRBD_FAULT_DT_WR : rw == READ ? 
DRBD_FAULT_DT_RD @@ -1035,10 +1176,14 @@ drbd_submit_req_private_bio(struct drbd_request *req) static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) { - spin_lock(&device->submit.lock); + spin_lock_irq(&device->resource->req_lock); list_add_tail(&req->tl_requests, &device->submit.writes); - spin_unlock(&device->submit.lock); + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[1 /* WRITE */]); + spin_unlock_irq(&device->resource->req_lock); queue_work(device->submit.wq, &device->submit.worker); + /* do_submit() may sleep internally on al_wait, too */ + wake_up(&device->al_wait); } /* returns the new drbd_request pointer, if the caller is expected to @@ -1047,7 +1192,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. */ static struct drbd_request * -drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_time) +drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif) { const int rw = bio_data_dir(bio); struct drbd_request *req; @@ -1062,7 +1207,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long bio_endio(bio, -ENOMEM); return ERR_PTR(-ENOMEM); } - req->start_time = start_time; + req->start_jif = start_jif; if (!get_ldev(device)) { bio_put(req->private_bio); @@ -1075,10 +1220,12 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &device->flags)) { if (!drbd_al_begin_io_fastpath(device, &req->i)) { + atomic_inc(&device->ap_actlog_cnt); drbd_queue_write(device, req); return NULL; } req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; } return req; @@ -1086,11 +1233,13 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) { + struct drbd_resource *resource = device->resource; const int rw = bio_rw(req->master_bio); struct bio_and_error m = { NULL, }; bool no_remote = false; + bool submit_private_bio = false; - spin_lock_irq(&device->resource->req_lock); + spin_lock_irq(&resource->req_lock); if (rw == WRITE) { /* This may temporarily give up the req_lock, * but will re-aquire it before it returns here. @@ -1148,13 +1297,18 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request no_remote = true; } + /* If it took the fast path in drbd_request_prepare, add it here. + * The slow path has added it already. 
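
The write path now splits explicitly: if the activity-log extent is already hot, drbd_request_prepare() takes the fast path and the request is submitted directly; otherwise the request is counted in ap_actlog_cnt and handed to the submitter workqueue, which batches AL transactions. Condensed from the hunk above, with comments added:

    if (rw == WRITE && req->private_bio && req->i.size
    && !test_bit(AL_SUSPENDED, &device->flags)) {
        if (!drbd_al_begin_io_fastpath(device, &req->i)) {
            /* cold extent: an AL transaction must be committed first,
             * let do_submit() batch this with other writers */
            atomic_inc(&device->ap_actlog_cnt);
            drbd_queue_write(device, req);
            return NULL;        /* caller must not submit */
        }
        /* hot extent: no AL transaction needed, submit right away */
        req->rq_state |= RQ_IN_ACT_LOG;
        req->in_actlog_jif = jiffies;
    }

The NULL return tells __drbd_make_request() that ownership of the request has moved to the submitter worker.
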
*/ + if (list_empty(&req->req_pending_master_completion)) + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[rw == WRITE]); if (req->private_bio) { /* needs to be marked within the same spinlock */ + list_add_tail(&req->req_pending_local, + &device->pending_completion[rw == WRITE]); _req_mod(req, TO_BE_SUBMITTED); /* but we need to give up the spinlock to submit */ - spin_unlock_irq(&device->resource->req_lock); - drbd_submit_req_private_bio(req); - spin_lock_irq(&device->resource->req_lock); + submit_private_bio = true; } else if (no_remote) { nodata: if (__ratelimit(&drbd_ratelimit_state)) @@ -1167,15 +1321,23 @@ nodata: out: if (drbd_req_put_completion_ref(req, &m, 1)) kref_put(&req->kref, drbd_req_destroy); - spin_unlock_irq(&device->resource->req_lock); - + spin_unlock_irq(&resource->req_lock); + + /* Even though above is a kref_put(), this is safe. + * As long as we still need to submit our private bio, + * we hold a completion ref, and the request cannot disappear. + * If however this request did not even have a private bio to submit + * (e.g. remote read), req may already be invalid now. + * That's why we cannot check on req->private_bio. */ + if (submit_private_bio) + drbd_submit_req_private_bio(req); if (m.bio) complete_master_bio(device, &m); } -void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_time) +void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif) { - struct drbd_request *req = drbd_request_prepare(device, bio, start_time); + struct drbd_request *req = drbd_request_prepare(device, bio, start_jif); if (IS_ERR_OR_NULL(req)) return; drbd_send_and_submit(device, req); @@ -1194,6 +1356,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom continue; req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); } list_del_init(&req->tl_requests); @@ -1203,7 +1367,8 @@ static void submit_fast_path(struct drbd_device *device, struct list_head *incom static bool prepare_al_transaction_nonblock(struct drbd_device *device, struct list_head *incoming, - struct list_head *pending) + struct list_head *pending, + struct list_head *later) { struct drbd_request *req, *tmp; int wake = 0; @@ -1212,45 +1377,105 @@ static bool prepare_al_transaction_nonblock(struct drbd_device *device, spin_lock_irq(&device->al_lock); list_for_each_entry_safe(req, tmp, incoming, tl_requests) { err = drbd_al_begin_io_nonblock(device, &req->i); + if (err == -ENOBUFS) + break; if (err == -EBUSY) wake = 1; if (err) - continue; - req->rq_state |= RQ_IN_ACT_LOG; - list_move_tail(&req->tl_requests, pending); + list_move_tail(&req->tl_requests, later); + else + list_move_tail(&req->tl_requests, pending); } spin_unlock_irq(&device->al_lock); if (wake) wake_up(&device->al_wait); - return !list_empty(pending); } +void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) +{ + struct drbd_request *req, *tmp; + + list_for_each_entry_safe(req, tmp, pending, tl_requests) { + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); + list_del_init(&req->tl_requests); + drbd_send_and_submit(device, req); + } +} + void do_submit(struct work_struct *ws) { struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); - LIST_HEAD(incoming); - LIST_HEAD(pending); - struct drbd_request *req, *tmp; + LIST_HEAD(incoming); /* from drbd_make_request() */ + 
LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */ + LIST_HEAD(busy); /* blocked by resync requests */ + + /* grab new incoming requests */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); for (;;) { - spin_lock(&device->submit.lock); - list_splice_tail_init(&device->submit.writes, &incoming); - spin_unlock(&device->submit.lock); + DEFINE_WAIT(wait); + /* move used-to-be-busy back to front of incoming */ + list_splice_init(&busy, &incoming); submit_fast_path(device, &incoming); if (list_empty(&incoming)) break; -skip_fast_path: - wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); - /* Maybe more was queued, while we prepared the transaction? - * Try to stuff them into this transaction as well. - * Be strictly non-blocking here, no wait_event, we already - * have something to commit. - * Stop if we don't make any more progres. - */ for (;;) { + prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE); + + list_splice_init(&busy, &incoming); + prepare_al_transaction_nonblock(device, &incoming, &pending, &busy); + if (!list_empty(&pending)) + break; + + schedule(); + + /* If all currently "hot" activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to "cold" extents. + * Something left on &incoming means there had not been + * enough update slots available, and the activity log + * has been marked as "starving". + * + * Try again now, without looking for new requests, + * effectively blocking all new requests until we made + * at least _some_ progress with what we currently have. + */ + if (!list_empty(&incoming)) + continue; + + /* Nothing moved to pending, but nothing left + * on incoming: all moved to busy! + * Grab new and iterate. */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); + } + finish_wait(&device->al_wait, &wait); + + /* If the transaction was full, before all incoming requests + * had been processed, skip ahead to commit, and iterate + * without splicing in more incoming requests from upper layers. + * + * Else, if all incoming have been processed, + * they have become either "pending" (to be submitted after + * next transaction commit) or "busy" (blocked by resync). + * + * Maybe more was queued, while we prepared the transaction? + * Try to stuff those into this transaction as well. + * Be strictly non-blocking here, + * we already have something to commit. + * + * Commit if we don't make any more progres. 
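
do_submit() open-codes its sleep rather than using wait_event(), because the "condition" (preparing an AL transaction) has side effects and three possible outcomes: requests moved to pending, requests parked on busy, or nothing moved because more input is needed. The underlying idiom, reduced to its skeleton (try_to_make_progress() is a placeholder):

    DEFINE_WAIT(wait);

    for (;;) {
        /* register on the wait queue *before* testing the condition,
         * so a concurrent wake_up() cannot be missed */
        prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
        if (try_to_make_progress())
            break;
        schedule();    /* sleeps until wake_up(&device->al_wait) */
    }
    finish_wait(&device->al_wait, &wait);

This is also why drbd_queue_write() above now does an extra wake_up(&device->al_wait): the submitter may be sleeping here waiting either for AL slots or for new requests.
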
+ */ + + while (list_empty(&incoming)) { LIST_HEAD(more_pending); LIST_HEAD(more_incoming); bool made_progress; @@ -1260,55 +1485,32 @@ skip_fast_path: if (list_empty(&device->submit.writes)) break; - spin_lock(&device->submit.lock); + spin_lock_irq(&device->resource->req_lock); list_splice_tail_init(&device->submit.writes, &more_incoming); - spin_unlock(&device->submit.lock); + spin_unlock_irq(&device->resource->req_lock); if (list_empty(&more_incoming)) break; - made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending); + made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy); list_splice_tail_init(&more_pending, &pending); list_splice_tail_init(&more_incoming, &incoming); - if (!made_progress) break; } - drbd_al_begin_io_commit(device, false); - - list_for_each_entry_safe(req, tmp, &pending, tl_requests) { - list_del_init(&req->tl_requests); - drbd_send_and_submit(device, req); - } - /* If all currently hot activity log extents are kept busy by - * incoming requests, we still must not totally starve new - * requests to cold extents. In that case, prepare one request - * in blocking mode. */ - list_for_each_entry_safe(req, tmp, &incoming, tl_requests) { - list_del_init(&req->tl_requests); - req->rq_state |= RQ_IN_ACT_LOG; - if (!drbd_al_begin_io_prepare(device, &req->i)) { - /* Corresponding extent was hot after all? */ - drbd_send_and_submit(device, req); - } else { - /* Found a request to a cold extent. - * Put on "pending" list, - * and try to cumulate with more. */ - list_add(&req->tl_requests, &pending); - goto skip_fast_path; - } - } + drbd_al_begin_io_commit(device); + send_and_submit_pending(device, &pending); } } void drbd_make_request(struct request_queue *q, struct bio *bio) { struct drbd_device *device = (struct drbd_device *) q->queuedata; - unsigned long start_time; + unsigned long start_jif; - start_time = jiffies; + start_jif = jiffies; /* * what we "blindly" assume: @@ -1316,7 +1518,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512)); inc_ap_bio(device); - __drbd_make_request(device, bio, start_time); + __drbd_make_request(device, bio, start_jif); } /* This is called by bio_add_page(). 
@@ -1353,36 +1555,13 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct return limit; } -static void find_oldest_requests( - struct drbd_connection *connection, - struct drbd_device *device, - struct drbd_request **oldest_req_waiting_for_peer, - struct drbd_request **oldest_req_waiting_for_disk) -{ - struct drbd_request *r; - *oldest_req_waiting_for_peer = NULL; - *oldest_req_waiting_for_disk = NULL; - list_for_each_entry(r, &connection->transfer_log, tl_requests) { - const unsigned s = r->rq_state; - if (!*oldest_req_waiting_for_peer - && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) - *oldest_req_waiting_for_peer = r; - - if (!*oldest_req_waiting_for_disk - && (s & RQ_LOCAL_PENDING) && r->device == device) - *oldest_req_waiting_for_disk = r; - - if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk) - break; - } -} - void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; struct drbd_connection *connection = first_peer_device(device)->connection; - struct drbd_request *req_disk, *req_peer; /* oldest request */ + struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ struct net_conf *nc; + unsigned long oldest_submit_jif; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; @@ -1403,14 +1582,31 @@ void request_timer_fn(unsigned long data) return; /* Recurring timer stopped */ now = jiffies; + nt = now + et; spin_lock_irq(&device->resource->req_lock); - find_oldest_requests(connection, device, &req_peer, &req_disk); - if (req_peer == NULL && req_disk == NULL) { - spin_unlock_irq(&device->resource->req_lock); - mod_timer(&device->request_timer, now + et); - return; - } + req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); + req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); + req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still + * blocking in tcp sendmsg */ + if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) + req_peer = connection->req_next; + + /* evaluate the oldest peer request only in one timer! */ + if (req_peer && req_peer->device != device) + req_peer = NULL; + + /* do we have something to evaluate? */ + if (req_peer == NULL && req_write == NULL && req_read == NULL) + goto out; + + oldest_submit_jif = + (req_write && req_read) + ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif) + ? req_write->pre_submit_jif : req_read->pre_submit_jif ) + : req_write ? req_write->pre_submit_jif + : req_read ? req_read->pre_submit_jif : now; /* The request is considered timed out, if * - we have some effective timeout from the configuration, @@ -1429,13 +1625,13 @@ void request_timer_fn(unsigned long data) * to expire twice (worst case) to become effective. Good enough. 
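
Because the new pending_completion lists are kept in submission order, the oldest local request is simply the first entry of each list, and request_timer_fn() picks the older of the two with wrap-safe jiffies arithmetic. time_before(a, b) expands to ((long)((a) - (b)) < 0), which stays correct across jiffies wrap-around where a plain "<" would not:

    /* oldest of the two list heads, falling back to "now" if both empty */
    oldest_submit_jif =
        (req_write && req_read)
        ? (time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
           ? req_write->pre_submit_jif : req_read->pre_submit_jif)
        : req_write ? req_write->pre_submit_jif
        : req_read  ? req_read->pre_submit_jif : now;

The "oldest_submit_jif != now" tests further down then double as a cheap "did we find any local request at all" flag.
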
*/ if (ent && req_peer && - time_after(now, req_peer->start_time + ent) && + time_after(now, req_peer->pre_send_jif + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); } - if (dt && req_disk && - time_after(now, req_disk->start_time + dt) && + if (dt && oldest_submit_jif != now && + time_after(now, oldest_submit_jif + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); __drbd_chk_io_error(device, DRBD_FORCE_DETACH); @@ -1443,11 +1639,12 @@ void request_timer_fn(unsigned long data) /* Reschedule timer for the nearest not already expired timeout. * Fallback to now + min(effective network timeout, disk timeout). */ - ent = (ent && req_peer && time_before(now, req_peer->start_time + ent)) - ? req_peer->start_time + ent : now + et; - dt = (dt && req_disk && time_before(now, req_disk->start_time + dt)) - ? req_disk->start_time + dt : now + et; + ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent)) + ? req_peer->pre_send_jif + ent : now + et; + dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt)) + ? oldest_submit_jif + dt : now + et; nt = time_before(ent, dt) ? ent : dt; +out: spin_unlock_irq(&connection->resource->req_lock); mod_timer(&device->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 8566cd5866b..9f6a04080e9 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -288,6 +288,7 @@ extern void complete_master_bio(struct drbd_device *device, extern void request_timer_fn(unsigned long data); extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what); extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what); +extern void tl_abort_disk_io(struct drbd_device *device); /* this is in drbd_main.c */ extern void drbd_restart_request(struct drbd_request *req); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index a5d8aae00e0..c35c0f001bb 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -410,7 +410,7 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask, return rv; } -static void print_st(struct drbd_device *device, char *name, union drbd_state ns) +static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) { drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", name, @@ -952,11 +952,12 @@ enum drbd_state_rv __drbd_set_state(struct drbd_device *device, union drbd_state ns, enum chg_state_flags flags, struct completion *done) { + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; union drbd_state os; enum drbd_state_rv rv = SS_SUCCESS; enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; - bool did_remote, should_do_remote; os = drbd_read_state(device); @@ -978,9 +979,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, this happen...*/ if (is_valid_state(device, os) == rv) - rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + rv = is_valid_soft_transition(os, ns, connection); } else - rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + rv = is_valid_soft_transition(os, ns, connection); } if (rv < SS_SUCCESS) { @@ -997,7 +998,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, sanitize_state(). Only display it here if we where not called from _conn_request_state() */ if (!(flags & CS_DC_SUSP)) - conn_pr_state_change(first_peer_device(device)->connection, os, ns, + conn_pr_state_change(connection, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP); /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference @@ -1008,28 +1009,35 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) atomic_inc(&device->local_cnt); - did_remote = drbd_should_do_remote(device->state); + if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) + clear_bit(RS_DONE, &device->flags); + + /* changes to local_cnt and device flags should be visible before + * changes to state, which again should be visible before anything else + * depending on that change happens. */ + smp_wmb(); device->state.i = ns.i; - should_do_remote = drbd_should_do_remote(device->state); device->resource->susp = ns.susp; device->resource->susp_nod = ns.susp_nod; device->resource->susp_fen = ns.susp_fen; + smp_wmb(); /* put replicated vs not-replicated requests in seperate epochs */ - if (did_remote != should_do_remote) - start_new_tl_epoch(first_peer_device(device)->connection); + if (drbd_should_do_remote((union drbd_dev_state)os.i) != + drbd_should_do_remote((union drbd_dev_state)ns.i)) + start_new_tl_epoch(connection); if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) drbd_print_uuids(device, "attached to UUIDs"); /* Wake up role changes, that were delayed because of connection establishing */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && - no_peer_wf_report_params(first_peer_device(device)->connection)) - clear_bit(STATE_SENT, &first_peer_device(device)->connection->flags); + no_peer_wf_report_params(connection)) + clear_bit(STATE_SENT, &connection->flags); wake_up(&device->misc_wait); wake_up(&device->state_wait); - wake_up(&first_peer_device(device)->connection->ping_wait); + wake_up(&connection->ping_wait); /* Aborted verify run, or we reached the stop sector. * Log the last position, unless end-of-device. */ @@ -1118,21 +1126,21 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, /* Receiver should clean up itself */ if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) - drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver); + drbd_thread_stop_nowait(&connection->receiver); /* Now the receiver finished cleaning up itself, it should die */ if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) - drbd_thread_stop_nowait(&first_peer_device(device)->connection->receiver); + drbd_thread_stop_nowait(&connection->receiver); /* Upon network failure, we need to restart the receiver. 
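
The smp_wmb() pair above turns the state update into an ordered publication: first the local_cnt and flag changes, then the device state word, then the resource susp* fields. In generic form, a write barrier on the publishing side pairs with a read barrier (or with locking) on the consuming side; DRBD's readers are mostly serialized by the req_lock, so take this purely as an illustration of the barrier semantics, with made-up field names:

    struct dev_state {
        int side_data;    /* illustrative: flags, counters, ... */
        int state;        /* illustrative: the published state word */
    };

    static void publish(struct dev_state *d, int data, int state)
    {
        d->side_data = data;
        smp_wmb();        /* side data visible before the state word */
        d->state = state;
    }

    static int consume(struct dev_state *d)
    {
        int state = d->state;
        smp_rmb();        /* state word read before the side data */
        return state ? d->side_data : 0;
    }
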
*/ if (os.conn > C_WF_CONNECTION && ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) - drbd_thread_restart_nowait(&first_peer_device(device)->connection->receiver); + drbd_thread_restart_nowait(&connection->receiver); /* Resume AL writing if we get a connection */ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { drbd_resume_al(device); - first_peer_device(device)->connection->connect_cnt++; + connection->connect_cnt++; } /* remember last attach time so request_timer_fn() won't @@ -1150,7 +1158,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ascw->w.cb = w_after_state_ch; ascw->device = device; ascw->done = done; - drbd_queue_work(&first_peer_device(device)->connection->sender_work, + drbd_queue_work(&connection->sender_work, &ascw->w); } else { drbd_err(device, "Could not kmalloc an ascw\n"); @@ -1222,13 +1230,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, union drbd_state ns, enum chg_state_flags flags) { struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct sib_info sib; sib.sib_reason = SIB_STATE_CHANGE; sib.os = os; sib.ns = ns; - if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { + if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE) + && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) { clear_bit(CRASHED_PRIMARY, &device->flags); if (device->p_uuid) device->p_uuid[UI_FLAGS] &= ~((u64)2); @@ -1245,7 +1256,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, state change. This function might sleep */ if (ns.susp_nod) { - struct drbd_connection *connection = first_peer_device(device)->connection; enum drbd_req_event what = NOTHING; spin_lock_irq(&device->resource->req_lock); @@ -1267,8 +1277,6 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, } if (ns.susp_fen) { - struct drbd_connection *connection = first_peer_device(device)->connection; - spin_lock_irq(&device->resource->req_lock); if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) { /* case2: The connection was established again: */ @@ -1294,8 +1302,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, * which is unexpected. 
*/ if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && - first_peer_device(device)->connection->agreed_pro_version >= 96 && get_ldev(device)) { - drbd_gen_and_send_sync_uuid(first_peer_device(device)); + connection->agreed_pro_version >= 96 && get_ldev(device)) { + drbd_gen_and_send_sync_uuid(peer_device); put_ldev(device); } @@ -1309,8 +1317,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, atomic_set(&device->rs_pending_cnt, 0); drbd_rs_cancel_all(device); - drbd_send_uuids(first_peer_device(device)); - drbd_send_state(first_peer_device(device), ns); + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); } /* No point in queuing send_bitmap if we don't have a connection * anymore, so check also the _current_ state, not only the new state @@ -1335,7 +1343,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, set_bit(NEW_CUR_UUID, &device->flags); } else { drbd_uuid_new_current(device); - drbd_send_uuids(first_peer_device(device)); + drbd_send_uuids(peer_device); } } put_ldev(device); @@ -1346,7 +1354,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(device); - drbd_send_uuids(first_peer_device(device)); + drbd_send_uuids(peer_device); } /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) @@ -1373,16 +1381,16 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* Last part of the attaching process ... */ if (ns.conn >= C_CONNECTED && os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { - drbd_send_sizes(first_peer_device(device), 0, 0); /* to start sync... */ - drbd_send_uuids(first_peer_device(device)); - drbd_send_state(first_peer_device(device), ns); + drbd_send_sizes(peer_device, 0, 0); /* to start sync... */ + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); } /* We want to pause/continue resync, tell peer. */ if (ns.conn >= C_CONNECTED && ((os.aftr_isp != ns.aftr_isp) || (os.user_isp != ns.user_isp))) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); /* In case one of the isp bits got set, suspend other devices. */ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && @@ -1392,10 +1400,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* Make sure the peer gets informed about eventual state changes (ISP bits) while we were in WFReportParams. */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); /* We are in the progress to start a full sync... 
*/ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || @@ -1449,7 +1457,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, drbd_disk_str(device->state.disk)); if (ns.conn >= C_CONNECTED) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); drbd_rs_cancel_all(device); @@ -1473,7 +1481,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, drbd_disk_str(device->state.disk)); if (ns.conn >= C_CONNECTED) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(device); @@ -1481,7 +1489,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* Notify peer that I had a local IO error, and did not detached.. */ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && @@ -1499,14 +1507,14 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* sync target done with resync. Explicitly notify peer, even though * it should (at least for non-empty resyncs) already know itself. */ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); /* Verify finished, or reached stop sector. Peer did not know about * the stop sector, and we may even have changed the stop sector during * verify to interrupt/stop early. Send the new state. */ if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED && verify_can_do_stop_sector(device)) - drbd_send_state(first_peer_device(device), ns); + drbd_send_state(peer_device, ns); /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk @@ -1563,7 +1571,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) old_conf = connection->net_conf; connection->my_addr_len = 0; connection->peer_addr_len = 0; - rcu_assign_pointer(connection->net_conf, NULL); + RCU_INIT_POINTER(connection->net_conf, NULL); conn_free_crypto(connection); mutex_unlock(&connection->resource->conf_update); @@ -1599,7 +1607,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) return 0; } -void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) +static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) { enum chg_state_flags flags = ~0; struct drbd_peer_device *peer_device; @@ -1688,7 +1696,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma return rv; } -void +static void conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) { diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d8f57b6305c..50776b36282 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -67,13 +67,10 @@ rwlock_t global_state_lock; */ void drbd_md_io_complete(struct bio *bio, int error) { - struct drbd_md_io *md_io; struct drbd_device *device; - md_io = (struct drbd_md_io *)bio->bi_private; 
- device = container_of(md_io, struct drbd_device, md_io); - - md_io->error = error; + device = bio->bi_private; + device->md_io.error = error; /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able * to timeout on the lower level device, and eventually detach from it. @@ -87,7 +84,7 @@ void drbd_md_io_complete(struct bio *bio, int error) * ASSERT(atomic_read(&device->md_io_in_use) == 1) there. */ drbd_md_put_buffer(device); - md_io->done = 1; + device->md_io.done = 1; wake_up(&device->misc_wait); bio_put(bio); if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ @@ -135,6 +132,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l i = peer_req->i; do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; block_id = peer_req->block_id; + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; spin_lock_irqsave(&device->resource->req_lock, flags); device->writ_cnt += peer_req->i.size >> 9; @@ -398,9 +396,6 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, if (!get_ldev(device)) return -EIO; - if (drbd_rs_should_slow_down(device, sector)) - goto defer; - /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, @@ -410,7 +405,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, peer_req->w.cb = w_e_send_csum; spin_lock_irq(&device->resource->req_lock); - list_add(&peer_req->w.list, &device->read_ee); + list_add_tail(&peer_req->w.list, &device->read_ee); spin_unlock_irq(&device->resource->req_lock); atomic_add(size >> 9, &device->rs_sect_ev); @@ -452,9 +447,9 @@ void resync_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; - if (list_empty(&device->resync_work.list)) - drbd_queue_work(&first_peer_device(device)->connection->sender_work, - &device->resync_work); + drbd_queue_work_if_unqueued( + &first_peer_device(device)->connection->sender_work, + &device->resync_work); } static void fifo_set(struct fifo_buffer *fb, int value) @@ -504,9 +499,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size) static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) { struct disk_conf *dc; - unsigned int want; /* The number of sectors we want in the proxy */ + unsigned int want; /* The number of sectors we want in-flight */ int req_sect; /* Number of sectors to request in this turn */ - int correction; /* Number of sectors more we need in the proxy*/ + int correction; /* Number of sectors more we need in-flight */ int cps; /* correction per invocation of drbd_rs_controller() */ int steps; /* Number of time steps to plan ahead */ int curr_corr; @@ -577,20 +572,27 @@ static int drbd_rs_number_requests(struct drbd_device *device) * potentially causing a distributed deadlock on congestion during * online-verify or (checksum-based) resync, if max-buffers, * socket buffer sizes and resync rate settings are mis-configured. */ - if (mxb - device->rs_in_flight < number) - number = mxb - device->rs_in_flight; + + /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k), + * mxb (as used here, and in drbd_alloc_pages on the peer) is + * "number of pages" (typically also 4k), + * but "rs_in_flight" is in "sectors" (512 Byte). 
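
The division by 8 below is pure unit conversion and worth spelling out, since the resync path mixes three units: one bitmap bit covers BM_BLOCK_SIZE = 4096 bytes, max-buffers (mxb) counts 4096-byte pages, and rs_in_flight counts 512-byte sectors, so 8 sectors make one 4k unit. A worked example: with 1 MiB of resync data in flight, rs_in_flight is 2048 sectors, i.e. 2048 / 8 = 256 four-KiB units, directly comparable with mxb and "number":

    int in_flight_4k = device->rs_in_flight / 8;  /* sectors -> 4k units */

    if (mxb - in_flight_4k < number)
        number = mxb - in_flight_4k;
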
*/ + if (mxb - device->rs_in_flight/8 < number) + number = mxb - device->rs_in_flight/8; return number; } -static int make_resync_request(struct drbd_device *device, int cancel) +static int make_resync_request(struct drbd_device *const device, int cancel) { + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; unsigned long bit; sector_t sector; const sector_t capacity = drbd_get_capacity(device->this_bdev); int max_bio_size; int number, rollback_i, size; - int align, queued, sndbuf; + int align, requeue = 0; int i = 0; if (unlikely(cancel)) @@ -617,17 +619,22 @@ static int make_resync_request(struct drbd_device *device, int cancel) goto requeue; for (i = 0; i < number; i++) { - /* Stop generating RS requests, when half of the send buffer is filled */ - mutex_lock(&first_peer_device(device)->connection->data.mutex); - if (first_peer_device(device)->connection->data.socket) { - queued = first_peer_device(device)->connection->data.socket->sk->sk_wmem_queued; - sndbuf = first_peer_device(device)->connection->data.socket->sk->sk_sndbuf; - } else { - queued = 1; - sndbuf = 0; - } - mutex_unlock(&first_peer_device(device)->connection->data.mutex); - if (queued > sndbuf / 2) + /* Stop generating RS requests when half of the send buffer is filled, + * but notify TCP that we'd like to have more space. */ + mutex_lock(&connection->data.mutex); + if (connection->data.socket) { + struct sock *sk = connection->data.socket->sk; + int queued = sk->sk_wmem_queued; + int sndbuf = sk->sk_sndbuf; + if (queued > sndbuf / 2) { + requeue = 1; + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + } else + requeue = 1; + mutex_unlock(&connection->data.mutex); + if (requeue) goto requeue; next_sector: @@ -642,8 +649,7 @@ next_sector: sector = BM_BIT_TO_SECT(bit); - if (drbd_rs_should_slow_down(device, sector) || - drbd_try_rs_begin_io(device, sector)) { + if (drbd_try_rs_begin_io(device, sector)) { device->bm_resync_fo = bit; goto requeue; } @@ -696,9 +702,9 @@ next_sector: /* adjust very last sectors, in case we are oddly sized */ if (sector + (size>>9) > capacity) size = (capacity-sector)<<9; - if (first_peer_device(device)->connection->agreed_pro_version >= 89 && - first_peer_device(device)->connection->csums_tfm) { - switch (read_for_csum(first_peer_device(device), sector, size)) { + + if (device->use_csums) { + switch (read_for_csum(peer_device, sector, size)) { case -EIO: /* Disk failure */ put_ldev(device); return -EIO; @@ -717,7 +723,7 @@ next_sector: int err; inc_rs_pending(device); - err = drbd_send_drequest(first_peer_device(device), P_RS_DATA_REQUEST, + err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, sector, size, ID_SYNCER); if (err) { drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); @@ -774,8 +780,7 @@ static int make_ov_request(struct drbd_device *device, int cancel) size = BM_BLOCK_SIZE; - if (drbd_rs_should_slow_down(device, sector) || - drbd_try_rs_begin_io(device, sector)) { + if (drbd_try_rs_begin_io(device, sector)) { device->ov_position = sector; goto requeue; } @@ -911,7 +916,7 @@ int drbd_resync_finished(struct drbd_device *device) if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) khelper_cmd = "after-resync-target"; - if (first_peer_device(device)->connection->csums_tfm && device->rs_total) { + if (device->use_csums && device->rs_total) { const unsigned long s = device->rs_same_csum; const unsigned long t = device->rs_total; 
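
The requeue logic in make_resync_request() is send-buffer backpressure: once more than half of the socket send buffer is queued, stop generating resync requests, and set SOCK_NOSPACE so that TCP's write-space callback wakes the sender once room frees up. The check in isolation, as in the hunk above:

    mutex_lock(&connection->data.mutex);
    if (connection->data.socket) {
        struct sock *sk = connection->data.socket->sk;

        if (sk->sk_wmem_queued > sk->sk_sndbuf / 2) {
            requeue = 1;              /* try again on the next run */
            if (sk->sk_socket)        /* request a write-space wakeup */
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
        }
    } else
        requeue = 1;                  /* not connected: nothing to send */
    mutex_unlock(&connection->data.mutex);
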
const int ratio = @@ -1351,13 +1356,15 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); struct drbd_device *device = req->device; - struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device->connection; int err; if (unlikely(cancel)) { req_mod(req, SEND_CANCELED); return 0; } + req->pre_send_jif = jiffies; /* this time, no connection->send.current_epoch_writes++; * If it was sent, it was the closing barrier for the last @@ -1365,7 +1372,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel) * No more barriers will be sent, until we leave AHEAD mode again. */ maybe_send_barrier(connection, req->epoch); - err = drbd_send_out_of_sync(first_peer_device(device), req); + err = drbd_send_out_of_sync(peer_device, req); req_mod(req, OOS_HANDED_TO_NETWORK); return err; @@ -1380,19 +1387,21 @@ int w_send_dblock(struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); struct drbd_device *device = req->device; - struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; int err; if (unlikely(cancel)) { req_mod(req, SEND_CANCELED); return 0; } + req->pre_send_jif = jiffies; re_init_if_first_write(connection, req->epoch); maybe_send_barrier(connection, req->epoch); connection->send.current_epoch_writes++; - err = drbd_send_dblock(first_peer_device(device), req); + err = drbd_send_dblock(peer_device, req); req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); return err; @@ -1407,19 +1416,21 @@ int w_send_read_req(struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); struct drbd_device *device = req->device; - struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; int err; if (unlikely(cancel)) { req_mod(req, SEND_CANCELED); return 0; } + req->pre_send_jif = jiffies; /* Even read requests may close a write epoch, * if there was any yet. */ maybe_send_barrier(connection, req->epoch); - err = drbd_send_drequest(first_peer_device(device), P_DATA_REQUEST, req->i.sector, req->i.size, + err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size, (unsigned long)req); req_mod(req, err ? 
SEND_FAILED : HANDED_OVER_TO_NETWORK); @@ -1433,7 +1444,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) struct drbd_device *device = req->device; if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) - drbd_al_begin_io(device, &req->i, false); + drbd_al_begin_io(device, &req->i); drbd_req_make_private_bio(req, req->master_bio); req->private_bio->bi_bdev = device->ldev->backing_bdev; @@ -1601,26 +1612,32 @@ void drbd_rs_controller_reset(struct drbd_device *device) void start_resync_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; - - drbd_queue_work(&first_peer_device(device)->connection->sender_work, - &device->start_resync_work); + drbd_device_post_work(device, RS_START); } -int w_start_resync(struct drbd_work *w, int cancel) +static void do_start_resync(struct drbd_device *device) { - struct drbd_device *device = - container_of(w, struct drbd_device, start_resync_work); - if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) { - drbd_warn(device, "w_start_resync later...\n"); + drbd_warn(device, "postponing start_resync ...\n"); device->start_resync_timer.expires = jiffies + HZ/10; add_timer(&device->start_resync_timer); - return 0; + return; } drbd_start_resync(device, C_SYNC_SOURCE); clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags); - return 0; +} + +static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device) +{ + bool csums_after_crash_only; + rcu_read_lock(); + csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only; + rcu_read_unlock(); + return connection->agreed_pro_version >= 89 && /* supported? */ + connection->csums_tfm && /* configured? */ + (csums_after_crash_only == 0 /* use for each resync? */ + || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ } /** @@ -1633,6 +1650,8 @@ int w_start_resync(struct drbd_work *w, int cancel) */ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) { + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; union drbd_state ns; int r; @@ -1651,7 +1670,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) if (r > 0) { drbd_info(device, "before-resync-target handler returned %d, " "dropping connection.\n", r); - conn_request_state(first_peer_device(device)->connection, NS(conn, C_DISCONNECTING), CS_HARD); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); return; } } else /* C_SYNC_SOURCE */ { @@ -1664,7 +1683,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } else { drbd_info(device, "before-resync-source handler returned %d, " "dropping connection.\n", r); - conn_request_state(first_peer_device(device)->connection, + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); return; } @@ -1672,7 +1691,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } } - if (current == first_peer_device(device)->connection->worker.task) { + if (current == connection->worker.task) { /* The worker should not sleep waiting for state_mutex, that can take long */ if (!mutex_trylock(device->state_mutex)) { @@ -1733,11 +1752,20 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_mark_time[i] = now; } _drbd_pause_after(device); + /* Forget potentially stale cached per resync extent bit-counts. 
+ * Open coded drbd_rs_cancel_all(device), we already have IRQs + * disabled, and know the disk state is ok. */ + spin_lock(&device->al_lock); + lc_reset(device->resync); + device->resync_locked = 0; + device->resync_wenr = LC_FREE; + spin_unlock(&device->al_lock); } write_unlock(&global_state_lock); spin_unlock_irq(&device->resource->req_lock); if (r == SS_SUCCESS) { + wake_up(&device->al_wait); /* for lc_reset() above */ /* reset rs_last_bcast when a resync or verify is started, * to deal with potential jiffies wrap. */ device->rs_last_bcast = jiffies - HZ; @@ -1746,8 +1774,12 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) drbd_conn_str(ns.conn), (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10), (unsigned long) device->rs_total); - if (side == C_SYNC_TARGET) + if (side == C_SYNC_TARGET) { device->bm_resync_fo = 0; + device->use_csums = use_checksum_based_resync(connection, device); + } else { + device->use_csums = 0; + } /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid * with w_send_oos, or the sync target will get confused as to @@ -1756,12 +1788,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) * drbd_resync_finished from here in that case. * We drbd_gen_and_send_sync_uuid here for protocol < 96, * and from after_state_ch otherwise. */ - if (side == C_SYNC_SOURCE && - first_peer_device(device)->connection->agreed_pro_version < 96) - drbd_gen_and_send_sync_uuid(first_peer_device(device)); + if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96) + drbd_gen_and_send_sync_uuid(peer_device); - if (first_peer_device(device)->connection->agreed_pro_version < 95 && - device->rs_total == 0) { + if (connection->agreed_pro_version < 95 && device->rs_total == 0) { /* This still has a race (about when exactly the peers * detect connection loss) that can lead to a full sync * on next handshake. In 8.3.9 we fixed this with explicit @@ -1777,7 +1807,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) int timeo; rcu_read_lock(); - nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + nc = rcu_dereference(connection->net_conf); timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; rcu_read_unlock(); schedule_timeout_interruptible(timeo); @@ -1799,10 +1829,165 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) mutex_unlock(device->state_mutex); } +static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) +{ + struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; + device->rs_last_bcast = jiffies; + + if (!get_ldev(device)) + return; + + drbd_bm_write_lazy(device, 0); + if (resync_done && is_sync_state(device->state.conn)) + drbd_resync_finished(device); + + drbd_bcast_event(device, &sib); + /* update timestamp, in case it took a while to write out stuff */ + device->rs_last_bcast = jiffies; + put_ldev(device); +} + +static void drbd_ldev_destroy(struct drbd_device *device) +{ + lc_destroy(device->resync); + device->resync = NULL; + lc_destroy(device->act_log); + device->act_log = NULL; + __no_warn(local, + drbd_free_ldev(device->ldev); + device->ldev = NULL;); + clear_bit(GOING_DISKLESS, &device->flags); + wake_up(&device->misc_wait); +} + +static void go_diskless(struct drbd_device *device) +{ + D_ASSERT(device, device->state.disk == D_FAILED); + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will + * inc/dec it frequently. 
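
From here on, several jobs (start of resync, bitmap writeout, going diskless, meta-data sync) are no longer queued as individual drbd_work items; instead, a bit per job is set in device->flags and the worker claims all pending bits at once with the get_work_bits() helper a few hunks below. Its cmpxchg loop atomically snapshots the flags word and clears only the work-bit subset, so concurrent setters of unrelated flag bits are never lost. The same pattern in isolation (MY_WORK_MASK is illustrative):

    #define MY_WORK_MASK  ((1UL << 0) | (1UL << 1) | (1UL << 2))

    static unsigned long claim_work_bits(unsigned long *flags)
    {
        unsigned long old, new;

        do {
            old = *flags;
            new = old & ~MY_WORK_MASK;    /* keep all non-work bits */
        } while (cmpxchg(flags, old, new) != old);

        return old & MY_WORK_MASK;        /* the bits just claimed */
    }
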
Once we are D_DISKLESS, no one will touch + * the protected members anymore, though, so once put_ldev reaches zero + * again, it will be safe to free them. */ + + /* Try to write changed bitmap pages, read errors may have just + * set some bits outside the area covered by the activity log. + * + * If we have an IO error during the bitmap writeout, + * we will want a full sync next time, just in case. + * (Do we want a specific meta data flag for this?) + * + * If that does not make it to stable storage either, + * we cannot do anything about that anymore. + * + * We still need to check if both bitmap and ldev are present, we may + * end up here after a failed attach, before ldev was even assigned. + */ + if (device->bitmap && device->ldev) { + /* An interrupted resync or similar is allowed to recounts bits + * while we detach. + * Any modifications would not be expected anymore, though. + */ + if (drbd_bitmap_io_from_worker(device, drbd_bm_write, + "detach", BM_LOCKED_TEST_ALLOWED)) { + if (test_bit(WAS_READ_ERROR, &device->flags)) { + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + } + } + + drbd_force_state(device, NS(disk, D_DISKLESS)); +} + +static int do_md_sync(struct drbd_device *device) +{ + drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); + drbd_md_sync(device); + return 0; +} + +/* only called from drbd_worker thread, no locking */ +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line) +{ + unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST; + struct drbd_thread_timing_details *td = tdp + i; + + td->start_jif = jiffies; + td->cb_addr = cb; + td->caller_fn = fn; + td->line = line; + td->cb_nr = *cb_nr; + + i = (i+1) % DRBD_THREAD_DETAILS_HIST; + td = tdp + i; + memset(td, 0, sizeof(*td)); + + ++(*cb_nr); +} + +#define WORK_PENDING(work_bit, todo) (todo & (1UL << work_bit)) +static void do_device_work(struct drbd_device *device, const unsigned long todo) +{ + if (WORK_PENDING(MD_SYNC, todo)) + do_md_sync(device); + if (WORK_PENDING(RS_DONE, todo) || + WORK_PENDING(RS_PROGRESS, todo)) + update_on_disk_bitmap(device, WORK_PENDING(RS_DONE, todo)); + if (WORK_PENDING(GO_DISKLESS, todo)) + go_diskless(device); + if (WORK_PENDING(DESTROY_DISK, todo)) + drbd_ldev_destroy(device); + if (WORK_PENDING(RS_START, todo)) + do_start_resync(device); +} + +#define DRBD_DEVICE_WORK_MASK \ + ((1UL << GO_DISKLESS) \ + |(1UL << DESTROY_DISK) \ + |(1UL << MD_SYNC) \ + |(1UL << RS_START) \ + |(1UL << RS_PROGRESS) \ + |(1UL << RS_DONE) \ + ) + +static unsigned long get_work_bits(unsigned long *flags) +{ + unsigned long old, new; + do { + old = *flags; + new = old & ~DRBD_DEVICE_WORK_MASK; + } while (cmpxchg(flags, old, new) != old); + return old & DRBD_DEVICE_WORK_MASK; +} + +static void do_unqueued_work(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + unsigned long todo = get_work_bits(&device->flags); + if (!todo) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + do_device_work(device, todo); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) { spin_lock_irq(&queue->q_lock); - list_splice_init(&queue->q, work_list); + 
list_splice_tail_init(&queue->q, work_list); spin_unlock_irq(&queue->q_lock); return !list_empty(work_list); } @@ -1851,7 +2036,7 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * /* dequeue single item only, * we still use drbd_queue_work_front() in some places */ if (!list_empty(&connection->sender_work.q)) - list_move(connection->sender_work.q.next, work_list); + list_splice_tail_init(&connection->sender_work.q, work_list); spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ if (!list_empty(work_list) || signal_pending(current)) { spin_unlock_irq(&connection->resource->req_lock); @@ -1873,6 +2058,14 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head * if (send_barrier) maybe_send_barrier(connection, connection->send.current_epoch_nr + 1); + + if (test_bit(DEVICE_WORK_PENDING, &connection->flags)) + break; + + /* drbd_send() may have called flush_signals() */ + if (get_t_state(&connection->worker) != RUNNING) + break; + schedule(); /* may be woken up for other things but new work, too, * e.g. if the current epoch got closed. @@ -1906,10 +2099,15 @@ int drbd_worker(struct drbd_thread *thi) while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - /* as long as we use drbd_queue_work_front(), - * we may only dequeue single work items here, not batches. */ - if (list_empty(&work_list)) + if (list_empty(&work_list)) { + update_worker_timing_details(connection, wait_for_work); wait_for_work(connection, &work_list); + } + + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } if (signal_pending(current)) { flush_signals(current); @@ -1926,6 +2124,7 @@ int drbd_worker(struct drbd_thread *thi) while (!list_empty(&work_list)) { w = list_first_entry(&work_list, struct drbd_work, list); list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0) continue; if (connection->cstate >= C_WF_REPORT_PARAMS) @@ -1934,13 +2133,18 @@ int drbd_worker(struct drbd_thread *thi) } do { + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } while (!list_empty(&work_list)) { w = list_first_entry(&work_list, struct drbd_work, list); list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); w->cb(w, 1); } dequeue_work_batch(&connection->sender_work, &work_list); - } while (!list_empty(&work_list)); + } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags)); rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index f63d358f3d9..0a581400de0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -15,17 +15,22 @@ #include <linux/numa.h> #define PART_BITS 4 +#define VQ_NAME_LEN 16 static int major; static DEFINE_IDA(vd_index_ida); static struct workqueue_struct *virtblk_wq; +struct virtio_blk_vq { + struct virtqueue *vq; + spinlock_t lock; + char name[VQ_NAME_LEN]; +} ____cacheline_aligned_in_smp; + struct virtio_blk { struct virtio_device *vdev; - struct virtqueue *vq; - spinlock_t vq_lock; /* The disk structure for the kernel. */ struct gendisk *disk; @@ -47,6 +52,10 @@ struct virtio_blk /* Ida index - used to track minor number allocations. 
*/ int index; + + /* num of vqs */ + int num_vqs; + struct virtio_blk_vq *vqs; }; struct virtblk_req @@ -133,14 +142,15 @@ static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; bool req_done = false; + int qid = vq->index; struct virtblk_req *vbr; unsigned long flags; unsigned int len; - spin_lock_irqsave(&vblk->vq_lock, flags); + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); do { virtqueue_disable_cb(vq); - while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { blk_mq_complete_request(vbr->req); req_done = true; } @@ -151,7 +161,7 @@ static void virtblk_done(struct virtqueue *vq) /* In case queue is stopped waiting for more buffers. */ if (req_done) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); - spin_unlock_irqrestore(&vblk->vq_lock, flags); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) @@ -160,6 +170,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; + int qid = hctx->queue_num; const bool last = (req->cmd_flags & REQ_END) != 0; int err; bool notify = false; @@ -202,12 +213,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) vbr->out_hdr.type |= VIRTIO_BLK_T_IN; } - spin_lock_irqsave(&vblk->vq_lock, flags); - err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); + err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); if (err) { - virtqueue_kick(vblk->vq); + virtqueue_kick(vblk->vqs[qid].vq); blk_mq_stop_hw_queue(hctx); - spin_unlock_irqrestore(&vblk->vq_lock, flags); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); /* Out of mem doesn't actually happen, since we fall back * to direct descriptors */ if (err == -ENOMEM || err == -ENOSPC) @@ -215,12 +226,12 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) return BLK_MQ_RQ_QUEUE_ERROR; } - if (last && virtqueue_kick_prepare(vblk->vq)) + if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) notify = true; - spin_unlock_irqrestore(&vblk->vq_lock, flags); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); if (notify) - virtqueue_notify(vblk->vq); + virtqueue_notify(vblk->vqs[qid].vq); return BLK_MQ_RQ_QUEUE_OK; } @@ -377,12 +388,64 @@ static void virtblk_config_changed(struct virtio_device *vdev) static int init_vq(struct virtio_blk *vblk) { int err = 0; + int i; + vq_callback_t **callbacks; + const char **names; + struct virtqueue **vqs; + unsigned short num_vqs; + struct virtio_device *vdev = vblk->vdev; + + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ, + struct virtio_blk_config, num_queues, + &num_vqs); + if (err) + num_vqs = 1; + + vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL); + if (!vblk->vqs) { + err = -ENOMEM; + goto out; + } + + err = -ENOMEM; + names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL); + if (!names) + goto err_names; + + callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL); + if (!callbacks) + goto err_callbacks; + + vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL); + if (!vqs) + goto err_vqs; - /* We expect one virtqueue, for output.
*/ - vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests"); - if (IS_ERR(vblk->vq)) - err = PTR_ERR(vblk->vq); + for (i = 0; i < num_vqs; i++) { + callbacks[i] = virtblk_done; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i); + names[i] = vblk->vqs[i].name; + } + + /* Discover virtqueues and write information to configuration. */ + err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names); + if (err) + goto err_find_vqs; + for (i = 0; i < num_vqs; i++) { + spin_lock_init(&vblk->vqs[i].lock); + vblk->vqs[i].vq = vqs[i]; + } + vblk->num_vqs = num_vqs; + + err_find_vqs: + kfree(vqs); + err_vqs: + kfree(callbacks); + err_callbacks: + kfree(names); + err_names: + if (err) + kfree(vblk->vqs); + out: return err; } @@ -551,7 +614,6 @@ static int virtblk_probe(struct virtio_device *vdev) err = init_vq(vblk); if (err) goto out_free_vblk; - spin_lock_init(&vblk->vq_lock); /* FIXME: How many partitions? How long is a piece of string? */ vblk->disk = alloc_disk(1 << PART_BITS); @@ -562,7 +624,7 @@ static int virtblk_probe(struct virtio_device *vdev) /* Default queue sizing is to fill the ring. */ if (!virtblk_queue_depth) { - virtblk_queue_depth = vblk->vq->num_free; + virtblk_queue_depth = vblk->vqs[0].vq->num_free; /* ... but without indirect descs, we use 2 descs per req */ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) virtblk_queue_depth /= 2; @@ -570,7 +632,6 @@ static int virtblk_probe(struct virtio_device *vdev) memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); vblk->tag_set.ops = &virtio_mq_ops; - vblk->tag_set.nr_hw_queues = 1; vblk->tag_set.queue_depth = virtblk_queue_depth; vblk->tag_set.numa_node = NUMA_NO_NODE; vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; @@ -578,6 +639,7 @@ static int virtblk_probe(struct virtio_device *vdev) sizeof(struct virtblk_req) + sizeof(struct scatterlist) * sg_elems; vblk->tag_set.driver_data = vblk; + vblk->tag_set.nr_hw_queues = vblk->num_vqs; err = blk_mq_alloc_tag_set(&vblk->tag_set); if (err) @@ -727,6 +789,7 @@ static void virtblk_remove(struct virtio_device *vdev) refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); put_disk(vblk->disk); vdev->config->del_vqs(vdev); + kfree(vblk->vqs); kfree(vblk); /* Only free device id if we don't have any users */ @@ -777,7 +840,8 @@ static const struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, - VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE + VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, + VIRTIO_BLK_F_MQ, }; static struct virtio_driver virtio_blk = { diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c index 8bc422977b5..4ff86878727 100644 --- a/drivers/cpufreq/pmac64-cpufreq.c +++ b/drivers/cpufreq/pmac64-cpufreq.c @@ -499,8 +499,7 @@ static int __init g5_pm72_cpufreq_init(struct device_node *cpunode) } /* Lookup the i2c hwclock */ - for (hwclock = NULL; - (hwclock = of_find_node_by_name(hwclock, "i2c-hwclock")) != NULL;){ + for_each_node_by_name(hwclock, "i2c-hwclock") { const char *loc = of_get_property(hwclock, "hwctrl-location", NULL); if (loc == NULL) diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c index 544f6d327ed..061407d5952 100644 --- a/drivers/crypto/nx/nx-842.c +++ b/drivers/crypto/nx/nx-842.c @@ -936,28 +936,14 @@ static int nx842_OF_upd(struct property *new_prop) goto error_out; } - /* Set ptr to new property 
if provided */ - if (new_prop) { - /* Single property */ - if (!strncmp(new_prop->name, "status", new_prop->length)) { - status = new_prop; - - } else if (!strncmp(new_prop->name, "ibm,max-sg-len", - new_prop->length)) { - maxsglen = new_prop; - - } else if (!strncmp(new_prop->name, "ibm,max-sync-cop", - new_prop->length)) { - maxsyncop = new_prop; - - } else { - /* - * Skip the update, the property being updated - * has no impact. - */ - goto out; - } - } + /* + * If this is a property update, there are only certain properties that + * we care about. Bail out if it isn't in the list below. + */ + if (new_prop && (strncmp(new_prop->name, "status", new_prop->length) && + strncmp(new_prop->name, "ibm,max-sg-len", new_prop->length) && + strncmp(new_prop->name, "ibm,max-sync-cop", new_prop->length))) + goto out; /* Perform property updates */ ret = nx842_OF_upd_status(new_devdata, status); diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c index 374b57fc596..a12c8552f6a 100644 --- a/drivers/edac/cell_edac.c +++ b/drivers/edac/cell_edac.c @@ -134,8 +134,7 @@ static void cell_edac_init_csrows(struct mem_ctl_info *mci) int j; u32 nr_pages; - for (np = NULL; - (np = of_find_node_by_name(np, "memory")) != NULL;) { + for_each_node_by_name(np, "memory") { struct resource r; /* We "know" that the Cell firmware only creates one entry diff --git a/drivers/hwmon/adm1025.c b/drivers/hwmon/adm1025.c index d3d0e8cf27b..d6c767ace91 100644 --- a/drivers/hwmon/adm1025.c +++ b/drivers/hwmon/adm1025.c @@ -382,6 +382,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr, if (err) return err; + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwmon/adm1026.c b/drivers/hwmon/adm1026.c index ca8430f9256..e67b9a50ac7 100644 --- a/drivers/hwmon/adm1026.c +++ b/drivers/hwmon/adm1026.c @@ -1085,6 +1085,9 @@ static ssize_t store_vrm_reg(struct device *dev, struct device_attribute *attr, if (err) return err; + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwmon/ads1015.c b/drivers/hwmon/ads1015.c index 22e0c926989..126516414c1 100644 --- a/drivers/hwmon/ads1015.c +++ b/drivers/hwmon/ads1015.c @@ -212,6 +212,7 @@ static int ads1015_get_channels_config_of(struct i2c_client *client) dev_err(&client->dev, "invalid gain on %s\n", node->full_name); + return -EINVAL; } } @@ -222,6 +223,7 @@ static int ads1015_get_channels_config_of(struct i2c_client *client) dev_err(&client->dev, "invalid data_rate on %s\n", node->full_name); + return -EINVAL; } } diff --git a/drivers/hwmon/asb100.c b/drivers/hwmon/asb100.c index f96063680e5..272fcc837ec 100644 --- a/drivers/hwmon/asb100.c +++ b/drivers/hwmon/asb100.c @@ -510,6 +510,10 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr, err = kstrtoul(buf, 10, &val); if (err) return err; + + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwmon/dme1737.c b/drivers/hwmon/dme1737.c index 4ae3fff13f4..bea0a344fab 100644 --- a/drivers/hwmon/dme1737.c +++ b/drivers/hwmon/dme1737.c @@ -247,8 +247,8 @@ struct dme1737_data { u8 pwm_acz[3]; u8 pwm_freq[6]; u8 pwm_rr[2]; - u8 zone_low[3]; - u8 zone_abs[3]; + s8 zone_low[3]; + s8 zone_abs[3]; u8 zone_hyst[2]; u32 alarms; }; @@ -277,7 +277,7 @@ static inline int IN_FROM_REG(int reg, int nominal, int res) return (reg * nominal + (3 << (res - 3))) / (3 << (res - 2)); } -static inline int IN_TO_REG(int val, int nominal) +static inline int IN_TO_REG(long val, int nominal) { return
clamp_val((val * 192 + nominal / 2) / nominal, 0, 255); } @@ -293,7 +293,7 @@ static inline int TEMP_FROM_REG(int reg, int res) return (reg * 1000) >> (res - 8); } -static inline int TEMP_TO_REG(int val) +static inline int TEMP_TO_REG(long val) { return clamp_val((val < 0 ? val - 500 : val + 500) / 1000, -128, 127); } @@ -308,7 +308,7 @@ static inline int TEMP_RANGE_FROM_REG(int reg) return TEMP_RANGE[(reg >> 4) & 0x0f]; } -static int TEMP_RANGE_TO_REG(int val, int reg) +static int TEMP_RANGE_TO_REG(long val, int reg) { int i; @@ -331,7 +331,7 @@ static inline int TEMP_HYST_FROM_REG(int reg, int ix) return (((ix == 1) ? reg : reg >> 4) & 0x0f) * 1000; } -static inline int TEMP_HYST_TO_REG(int val, int ix, int reg) +static inline int TEMP_HYST_TO_REG(long val, int ix, int reg) { int hyst = clamp_val((val + 500) / 1000, 0, 15); @@ -347,7 +347,7 @@ static inline int FAN_FROM_REG(int reg, int tpc) return (reg == 0 || reg == 0xffff) ? 0 : 90000 * 60 / reg; } -static inline int FAN_TO_REG(int val, int tpc) +static inline int FAN_TO_REG(long val, int tpc) { if (tpc) { return clamp_val(val / tpc, 0, 0xffff); @@ -379,7 +379,7 @@ static inline int FAN_TYPE_FROM_REG(int reg) return (edge > 0) ? 1 << (edge - 1) : 0; } -static inline int FAN_TYPE_TO_REG(int val, int reg) +static inline int FAN_TYPE_TO_REG(long val, int reg) { int edge = (val == 4) ? 3 : val; @@ -402,7 +402,7 @@ static int FAN_MAX_FROM_REG(int reg) return 1000 + i * 500; } -static int FAN_MAX_TO_REG(int val) +static int FAN_MAX_TO_REG(long val) { int i; @@ -460,7 +460,7 @@ static inline int PWM_ACZ_FROM_REG(int reg) return acz[(reg >> 5) & 0x07]; } -static inline int PWM_ACZ_TO_REG(int val, int reg) +static inline int PWM_ACZ_TO_REG(long val, int reg) { int acz = (val == 4) ? 2 : val - 1; @@ -476,7 +476,7 @@ static inline int PWM_FREQ_FROM_REG(int reg) return PWM_FREQ[reg & 0x0f]; } -static int PWM_FREQ_TO_REG(int val, int reg) +static int PWM_FREQ_TO_REG(long val, int reg) { int i; @@ -510,7 +510,7 @@ static inline int PWM_RR_FROM_REG(int reg, int ix) return (rr & 0x08) ? PWM_RR[rr & 0x07] : 0; } -static int PWM_RR_TO_REG(int val, int ix, int reg) +static int PWM_RR_TO_REG(long val, int ix, int reg) { int i; @@ -528,7 +528,7 @@ static inline int PWM_RR_EN_FROM_REG(int reg, int ix) return PWM_RR_FROM_REG(reg, ix) ? 1 : 0; } -static inline int PWM_RR_EN_TO_REG(int val, int ix, int reg) +static inline int PWM_RR_EN_TO_REG(long val, int ix, int reg) { int en = (ix == 1) ? 0x80 : 0x08; @@ -1481,13 +1481,16 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct dme1737_data *data = dev_get_drvdata(dev); - long val; + unsigned long val; int err; - err = kstrtol(buf, 10, &val); + err = kstrtoul(buf, 10, &val); if (err) return err; + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwmon/emc6w201.c b/drivers/hwmon/emc6w201.c index e87da902f3a..ada90716448 100644 --- a/drivers/hwmon/emc6w201.c +++ b/drivers/hwmon/emc6w201.c @@ -252,12 +252,12 @@ static ssize_t set_temp(struct device *dev, struct device_attribute *devattr, if (err < 0) return err; - val /= 1000; + val = DIV_ROUND_CLOSEST(val, 1000); reg = (sf == min) ? 
EMC6W201_REG_TEMP_LOW(nr) : EMC6W201_REG_TEMP_HIGH(nr); mutex_lock(&data->update_lock); - data->temp[sf][nr] = clamp_val(val, -127, 128); + data->temp[sf][nr] = clamp_val(val, -127, 127); err = emc6w201_write8(client, reg, data->temp[sf][nr]); mutex_unlock(&data->update_lock); diff --git a/drivers/hwmon/hih6130.c b/drivers/hwmon/hih6130.c index 0e01c4e13e3..7b73d2002d3 100644 --- a/drivers/hwmon/hih6130.c +++ b/drivers/hwmon/hih6130.c @@ -238,6 +238,9 @@ static int hih6130_probe(struct i2c_client *client, hih6130->client = client; mutex_init(&hih6130->lock); + if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_QUICK)) + hih6130->write_length = 1; + hwmon_dev = devm_hwmon_device_register_with_groups(dev, client->name, hih6130, hih6130_groups); diff --git a/drivers/hwmon/lm87.c b/drivers/hwmon/lm87.c index ba1d83d4805..a5e295826ae 100644 --- a/drivers/hwmon/lm87.c +++ b/drivers/hwmon/lm87.c @@ -617,6 +617,10 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr, err = kstrtoul(buf, 10, &val); if (err) return err; + + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwmon/lm92.c b/drivers/hwmon/lm92.c index d2060e245ff..cfaf70b9cba 100644 --- a/drivers/hwmon/lm92.c +++ b/drivers/hwmon/lm92.c @@ -74,12 +74,9 @@ static inline int TEMP_FROM_REG(s16 reg) return reg / 8 * 625 / 10; } -static inline s16 TEMP_TO_REG(int val) +static inline s16 TEMP_TO_REG(long val) { - if (val <= -60000) - return -60000 * 10 / 625 * 8; - if (val >= 160000) - return 160000 * 10 / 625 * 8; + val = clamp_val(val, -60000, 160000); return val * 10 / 625 * 8; } @@ -206,10 +203,12 @@ static ssize_t set_temp_hyst(struct device *dev, if (err) return err; + val = clamp_val(val, -120000, 220000); mutex_lock(&data->update_lock); - data->temp[t_hyst] = TEMP_FROM_REG(data->temp[attr->index]) - val; + data->temp[t_hyst] = + TEMP_TO_REG(TEMP_FROM_REG(data->temp[attr->index]) - val); i2c_smbus_write_word_swapped(client, LM92_REG_TEMP_HYST, - TEMP_TO_REG(data->temp[t_hyst])); + data->temp[t_hyst]); mutex_unlock(&data->update_lock); return count; } diff --git a/drivers/hwmon/pc87360.c b/drivers/hwmon/pc87360.c index 988181e4cfc..145f674c1d8 100644 --- a/drivers/hwmon/pc87360.c +++ b/drivers/hwmon/pc87360.c @@ -615,6 +615,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr, if (err) return err; + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwmon/tmp103.c b/drivers/hwmon/tmp103.c index c74d2da389d..e42964f07f6 100644 --- a/drivers/hwmon/tmp103.c +++ b/drivers/hwmon/tmp103.c @@ -131,13 +131,6 @@ static int tmp103_probe(struct i2c_client *client, struct regmap *regmap; int ret; - if (!i2c_check_functionality(client->adapter, - I2C_FUNC_SMBUS_BYTE_DATA)) { - dev_err(&client->dev, - "adapter doesn't support SMBus byte transactions\n"); - return -ENODEV; - } - regmap = devm_regmap_init_i2c(client, &tmp103_regmap_config); if (IS_ERR(regmap)) { dev_err(dev, "failed to allocate register map\n"); diff --git a/drivers/hwmon/vt1211.c b/drivers/hwmon/vt1211.c index 344b22ec255..3ea57c3504e 100644 --- a/drivers/hwmon/vt1211.c +++ b/drivers/hwmon/vt1211.c @@ -879,6 +879,9 @@ static ssize_t set_vrm(struct device *dev, struct device_attribute *attr, if (err) return err; + if (val > 255) + return -EINVAL; + data->vrm = val; return count; diff --git a/drivers/hwmon/w83627hf.c b/drivers/hwmon/w83627hf.c index c1726be3654..2f55973a8c4 100644 --- a/drivers/hwmon/w83627hf.c +++ b/drivers/hwmon/w83627hf.c @@ -820,6 
+820,9 @@ store_vrm_reg(struct device *dev, struct device_attribute *attr, const char *buf err = kstrtoul(buf, 10, &val); if (err) return err; + + if (val > 255) + return -EINVAL; data->vrm = val; return count; diff --git a/drivers/hwmon/w83791d.c b/drivers/hwmon/w83791d.c index cb3765fec98..001df856913 100644 --- a/drivers/hwmon/w83791d.c +++ b/drivers/hwmon/w83791d.c @@ -1181,6 +1181,9 @@ static ssize_t store_vrm_reg(struct device *dev, if (err) return err; + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c index 9d63d71214c..816aa6caf5d 100644 --- a/drivers/hwmon/w83793.c +++ b/drivers/hwmon/w83793.c @@ -353,6 +353,9 @@ store_vrm(struct device *dev, struct device_attribute *attr, if (err) return err; + if (val > 255) + return -EINVAL; + data->vrm = val; return count; } diff --git a/drivers/hwspinlock/Kconfig b/drivers/hwspinlock/Kconfig index 70637d23b1f..3612cb5b30b 100644 --- a/drivers/hwspinlock/Kconfig +++ b/drivers/hwspinlock/Kconfig @@ -10,7 +10,7 @@ menu "Hardware Spinlock drivers" config HWSPINLOCK_OMAP tristate "OMAP Hardware Spinlock device" - depends on ARCH_OMAP4 || SOC_OMAP5 + depends on ARCH_OMAP4 || SOC_OMAP5 || SOC_DRA7XX || SOC_AM33XX || SOC_AM43XX select HWSPINLOCK help Say y here to support the OMAP Hardware Spinlock device (firstly diff --git a/drivers/hwspinlock/omap_hwspinlock.c b/drivers/hwspinlock/omap_hwspinlock.c index 292869cc903..c1e2cd4d85f 100644 --- a/drivers/hwspinlock/omap_hwspinlock.c +++ b/drivers/hwspinlock/omap_hwspinlock.c @@ -98,10 +98,29 @@ static int omap_hwspinlock_probe(struct platform_device *pdev) if (!io_base) return -ENOMEM; + /* + * make sure the module is enabled and clocked before reading + * the module SYSSTATUS register + */ + pm_runtime_enable(&pdev->dev); + ret = pm_runtime_get_sync(&pdev->dev); + if (ret < 0) { + pm_runtime_put_noidle(&pdev->dev); + goto iounmap_base; + } + /* Determine number of locks */ i = readl(io_base + SYSSTATUS_OFFSET); i >>= SPINLOCK_NUMLOCKS_BIT_OFFSET; + /* + * runtime PM will make sure the clock of this module is + * enabled again iff at least one lock is requested + */ + ret = pm_runtime_put(&pdev->dev); + if (ret < 0) + goto iounmap_base; + /* one of the four lsb's must be set, and nothing else */ if (hweight_long(i & 0xf) != 1 || i > 8) { ret = -EINVAL; @@ -121,12 +140,6 @@ static int omap_hwspinlock_probe(struct platform_device *pdev) for (i = 0, hwlock = &bank->lock[0]; i < num_locks; i++, hwlock++) hwlock->priv = io_base + LOCK_BASE_OFFSET + sizeof(u32) * i; - /* - * runtime PM will make sure the clock of this module is - * enabled iff at least one lock is requested - */ - pm_runtime_enable(&pdev->dev); - ret = hwspin_lock_register(bank, &pdev->dev, &omap_hwspinlock_ops, pdata->base_id, num_locks); if (ret) @@ -135,9 +148,9 @@ static int omap_hwspinlock_probe(struct platform_device *pdev) return 0; reg_fail: - pm_runtime_disable(&pdev->dev); kfree(bank); iounmap_base: + pm_runtime_disable(&pdev->dev); iounmap(io_base); return ret; } diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 2bc7f5af64f..f6d29614cb0 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -94,14 +94,14 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, port_priv = ib_get_agent_port(device, port_num); if (!port_priv) { - printk(KERN_ERR SPFX "Unable to find port agent\n"); + dev_err(&device->dev, "Unable to find port agent\n"); return; } agent = port_priv->agent[qpn]; 
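/* port_priv->agent[] is indexed by special QP number: agent[0] is the QP0 (SMI) agent and agent[1] the QP1 (GSI) agent, both registered in ib_agent_port_open() below. */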
ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n", + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", PTR_ERR(ah)); return; } @@ -110,7 +110,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_KERNEL); if (IS_ERR(send_buf)) { - printk(KERN_ERR SPFX "ib_create_send_mad error\n"); + dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } @@ -125,7 +125,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, } if (ib_post_send_mad(send_buf, NULL)) { - printk(KERN_ERR SPFX "ib_post_send_mad error\n"); + dev_err(&device->dev, "ib_post_send_mad error\n"); goto err2; } return; @@ -151,7 +151,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n"); + dev_err(&device->dev, "No memory for ib_agent_port_private\n"); ret = -ENOMEM; goto error1; } @@ -161,7 +161,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) port_priv->agent[0] = ib_register_mad_agent(device, port_num, IB_QPT_SMI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[0])) { ret = PTR_ERR(port_priv->agent[0]); goto error2; @@ -172,7 +172,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) port_priv->agent[1] = ib_register_mad_agent(device, port_num, IB_QPT_GSI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[1])) { ret = PTR_ERR(port_priv->agent[1]); goto error3; @@ -202,7 +202,7 @@ int ib_agent_port_close(struct ib_device *device, int port_num) port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - printk(KERN_ERR SPFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del(&port_priv->port_list); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c3239170d8b..e28a494e2a3 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3753,7 +3753,7 @@ static void cm_add_one(struct ib_device *ib_device) struct cm_port *port; struct ib_mad_reg_req reg_req = { .mgmt_class = IB_MGMT_CLASS_CM, - .mgmt_class_version = IB_CM_CLASS_VERSION + .mgmt_class_version = IB_CM_CLASS_VERSION, }; struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP @@ -3801,7 +3801,8 @@ static void cm_add_one(struct ib_device *ib_device) 0, cm_send_handler, cm_recv_handler, - port); + port, + 0); if (IS_ERR(port->mad_agent)) goto error2; diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 3d2e489ab73..ff9163dc159 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -46,6 +46,7 @@ #include <linux/completion.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/sysctl.h> #include <rdma/iw_cm.h> #include <rdma/ib_addr.h> @@ -65,6 +66,20 @@ struct iwcm_work { struct list_head free_list; }; +static unsigned int default_backlog = 256; + +static struct ctl_table_header *iwcm_ctl_table_hdr; +static struct ctl_table iwcm_ctl_table[] = { + { + .procname = "default_backlog", + .data = &default_backlog, + .maxlen = sizeof(default_backlog), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + /* * The following services provide 
a mechanism for pre-allocating iwcm_work * elements. The design pre-allocates them based on the cm_id type: @@ -425,6 +440,9 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + if (!backlog) + backlog = default_backlog; + ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; @@ -1030,11 +1048,20 @@ static int __init iw_cm_init(void) if (!iwcm_wq) return -ENOMEM; + iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", + iwcm_ctl_table); + if (!iwcm_ctl_table_hdr) { + pr_err("iw_cm: couldn't register sysctl paths\n"); + destroy_workqueue(iwcm_wq); + return -ENOMEM; + } + return 0; } static void __exit iw_cm_cleanup(void) { + unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); } diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ab31f136d04..74c30f4c557 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -33,6 +33,9 @@ * SOFTWARE. * */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/module.h> @@ -195,7 +198,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, - void *context) + void *context, + u32 registration_flags) { struct ib_mad_port_private *port_priv; struct ib_mad_agent *ret = ERR_PTR(-EINVAL); @@ -211,68 +215,109 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate parameters */ qpn = get_spl_qp_index(qp_type); - if (qpn == -1) + if (qpn == -1) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid QP Type %d\n", + qp_type); goto error1; + } - if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid RMPP Version %u\n", + rmpp_version); goto error1; + } /* Validate MAD registration request if supplied */ if (mad_reg_req) { - if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid Class Version %u\n", + mad_reg_req->mgmt_class_version); goto error1; - if (!recv_handler) + } + if (!recv_handler) { + dev_notice(&device->dev, + "ib_register_mad_agent: no recv_handler\n"); goto error1; + } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { /* * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only * one in this range currently allowed */ if (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else if (mad_reg_req->mgmt_class == 0) { /* * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0\n"); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* * If class is in "new" vendor range, * ensure supplied OUI is not zero */ - if (!is_vendor_oui(mad_reg_req->oui)) + if (!is_vendor_oui(mad_reg_req->oui)) { + dev_notice(&device->dev, + "ib_register_mad_agent: No OUI specified for class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { - if (rmpp_version) + 
if (rmpp_version) { + dev_notice(&device->dev, + "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } + /* Make sure class supplied is consistent with QP type */ if (qp_type == IB_QPT_SMI) { if ((mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else { if ((mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } } else { /* No registration request supplied */ if (!send_handler) goto error1; + if (registration_flags & IB_MAD_USER_RMPP) + goto error1; } /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { + dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n"); ret = ERR_PTR(-ENODEV); goto error1; } @@ -280,6 +325,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Verify the QP requested is supported. For example, Ethernet devices * will not have QP0 */ if (!port_priv->qp_info[qpn].qp) { + dev_notice(&device->dev, + "ib_register_mad_agent: QP %d not supported\n", qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } @@ -316,6 +363,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, mad_agent_priv->agent.context = context; mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_agent_priv->agent.port_num = port_num; + mad_agent_priv->agent.flags = registration_flags; spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); @@ -706,7 +754,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, smi_handle_dr_smp_send(smp, device->node_type, port_num) == IB_SMI_DISCARD) { ret = -EINVAL; - printk(KERN_ERR PFX "Invalid directed route\n"); + dev_err(&device->dev, "Invalid directed route\n"); goto out; } @@ -718,7 +766,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, local = kmalloc(sizeof *local, GFP_ATOMIC); if (!local) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for ib_mad_local_private\n"); + dev_err(&device->dev, "No memory for ib_mad_local_private\n"); goto out; } local->mad_priv = NULL; @@ -726,7 +774,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for local response MAD\n"); + dev_err(&device->dev, "No memory for local response MAD\n"); kfree(local); goto out; } @@ -837,9 +885,9 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); if (!seg) { - printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem " - "alloc failed for len %zd, gfp %#x\n", - sizeof (*seg) + seg_size, gfp_mask); + dev_err(&send_buf->mad_agent->device->dev, + "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n", + sizeof (*seg) + seg_size, gfp_mask); free_send_rmpp_list(send_wr); return -ENOMEM; } @@ -862,6 +910,12 @@ static int 
alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, return 0; } +int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent) +{ + return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); +} +EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); + struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, @@ -878,10 +932,12 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, pad = get_pad_size(hdr_len, data_len); message_size = hdr_len + data_len + pad; - if ((!mad_agent->rmpp_version && - (rmpp_active || message_size > sizeof(struct ib_mad))) || - (!rmpp_active && message_size > sizeof(struct ib_mad))) - return ERR_PTR(-EINVAL); + if (ib_mad_kernel_rmpp_agent(mad_agent)) { + if (!rmpp_active && message_size > sizeof(struct ib_mad)) + return ERR_PTR(-EINVAL); + } else + if (rmpp_active || message_size > sizeof(struct ib_mad)) + return ERR_PTR(-EINVAL); size = rmpp_active ? hdr_len : sizeof(struct ib_mad); buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); @@ -1135,7 +1191,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_send_rmpp_mad(mad_send_wr); if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) ret = ib_send_mad(mad_send_wr); @@ -1199,7 +1255,8 @@ EXPORT_SYMBOL(ib_redirect_mad_qp); int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc) { - printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n"); + dev_err(&mad_agent->device->dev, + "ib_process_mad_wc() not implemented yet\n"); return 0; } EXPORT_SYMBOL(ib_process_mad_wc); @@ -1211,7 +1268,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method, for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { - printk(KERN_ERR PFX "Method %d already in use\n", i); + pr_err("Method %d already in use\n", i); return -EINVAL; } } @@ -1223,8 +1280,7 @@ static int allocate_method_table(struct ib_mad_mgmt_method_table **method) /* Allocate management method table */ *method = kzalloc(sizeof **method, GFP_ATOMIC); if (!*method) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_method_table\n"); + pr_err("No memory for ib_mad_mgmt_method_table\n"); return -ENOMEM; } @@ -1319,8 +1375,8 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, /* Allocate management class table for "new" class version */ *class = kzalloc(sizeof **class, GFP_ATOMIC); if (!*class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_class_table\n"); ret = -ENOMEM; goto error1; } @@ -1386,8 +1442,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, /* Allocate mgmt vendor class table for "new" class version */ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); if (!vendor) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class_table\n"); goto error1; } @@ -1397,8 +1453,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, /* Allocate table for this management vendor class */ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); if (!vendor_class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class\n"); + dev_err(&agent_priv->agent.device->dev, + 
"No memory for ib_mad_mgmt_vendor_class\n"); goto error2; } @@ -1429,7 +1485,7 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, goto check_in_use; } } - printk(KERN_ERR PFX "All OUI slots in use\n"); + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); goto error3; check_in_use: @@ -1640,9 +1696,9 @@ find_mad_agent(struct ib_mad_port_private *port_priv, if (mad_agent->agent.recv_handler) atomic_inc(&mad_agent->refcount); else { - printk(KERN_NOTICE PFX "No receive handler for client " - "%p on port %d\n", - &mad_agent->agent, port_priv->port_num); + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); mad_agent = NULL; } } @@ -1658,8 +1714,8 @@ static int validate_mad(struct ib_mad *mad, u32 qp_num) /* Make sure MAD base version is understood */ if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { - printk(KERN_ERR PFX "MAD received with unsupported base " - "version %d\n", mad->mad_hdr.base_version); + pr_err("MAD received with unsupported base version %d\n", + mad->mad_hdr.base_version); goto out; } @@ -1685,6 +1741,7 @@ static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; return !mad_agent_priv->agent.rmpp_version || + !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE) || (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); @@ -1812,7 +1869,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, mad_recv_wc); if (!mad_recv_wc) { @@ -1827,23 +1884,39 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - ib_free_recv_mad(mad_recv_wc); - deref_mad_agent(mad_agent_priv); - return; - } - ib_mark_mad_done(mad_send_wr); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) + && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) + & IB_MGMT_RMPP_FLAG_ACTIVE)) { + /* user rmpp is in effect + * and this is an active RMPP MAD + */ + mad_recv_wc->wc->wr_id = 0; + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + } else { + /* not user rmpp, revert to normal behavior and + * drop the mad */ + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + } else { + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); + /* Defined behavior is to complete response before request */ + mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); - mad_send_wc.status = 
IB_WC_SUCCESS; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); + } } else { mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, mad_recv_wc); @@ -1911,8 +1984,8 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!response) { - printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory " - "for response buffer\n"); + dev_err(&port_priv->device->dev, + "ib_mad_recv_done_handler no memory for response buffer\n"); goto out; } @@ -2083,7 +2156,7 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, mad_agent_priv = mad_send_wr->mad_agent_priv; spin_lock_irqsave(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); if (ret == IB_RMPP_RESULT_CONSUMED) goto done; @@ -2176,7 +2249,8 @@ retry: ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, &bad_send_wr); if (ret) { - printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret); + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); mad_send_wr = queued_send_wr; wc->status = IB_WC_LOC_QP_OP_ERR; goto retry; @@ -2248,8 +2322,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, IB_QP_STATE | IB_QP_CUR_STATE); kfree(attr); if (ret) - printk(KERN_ERR PFX "mad_error_handler - " - "ib_modify_qp to RTS : %d\n", ret); + dev_err(&port_priv->device->dev, + "mad_error_handler - ib_modify_qp to RTS : %d\n", + ret); else mark_sends_for_retry(qp_info); } @@ -2408,7 +2483,8 @@ static void local_completions(struct work_struct *work) if (local->mad_priv) { recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { - printk(KERN_ERR PFX "No receive MAD agent for local completion\n"); + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local completion\n"); free_mad = 1; goto local_send_completion; } @@ -2476,7 +2552,7 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); - if (mad_send_wr->mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); switch (ret) { case IB_RMPP_RESULT_UNHANDLED: @@ -2589,7 +2665,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, } else { mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!mad_priv) { - printk(KERN_ERR PFX "No memory for receive buffer\n"); + dev_err(&qp_info->port_priv->device->dev, + "No memory for receive buffer\n"); ret = -ENOMEM; break; } @@ -2625,7 +2702,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, sizeof mad_priv->header, DMA_FROM_DEVICE); kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); break; } } while (post); @@ -2681,7 +2759,8 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { - printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n"); + dev_err(&port_priv->device->dev, + "Couldn't kmalloc ib_qp_attr\n"); return -ENOMEM; } 
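The logging changes throughout mad.c follow one mechanical pattern: wherever a struct ib_device is in scope, printk(KERN_ERR PFX ...) becomes dev_err(&device->dev, ...), so the log line names the HCA that failed instead of carrying only the static "ib_mad: " prefix; where no device is available, plain pr_err() with the pr_fmt prefix defined at the top of the file is used instead. A minimal sketch of the conversion, assuming only a function with a struct ib_device *device in scope:

	/* before: fixed module prefix, no hint which device failed */
	printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");

	/* after: dev_err() prefixes the message with driver and device name */
	dev_err(&device->dev, "Couldn't kmalloc ib_qp_attr\n");

The same conversion is what lets mad_priv.h drop its #define PFX "ib_mad: " further down.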
@@ -2705,16 +2784,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "INIT: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); goto out; } attr->qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, attr, IB_QP_STATE); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTR: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); goto out; } @@ -2722,16 +2803,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) attr->sq_psn = IB_MAD_SEND_Q_PSN; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTS: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); goto out; } } ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); if (ret) { - printk(KERN_ERR PFX "Failed to request completion " - "notification: %d\n", ret); + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); goto out; } @@ -2741,7 +2824,8 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { - printk(KERN_ERR PFX "Couldn't post receive WRs\n"); + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); goto out; } } @@ -2755,7 +2839,8 @@ static void qp_event_handler(struct ib_event *event, void *qp_context) struct ib_mad_qp_info *qp_info = qp_context; /* It's worse than that! He's dead, Jim! */ - printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n", + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%d)\n", event->event, qp_info->qp->qp_num); } @@ -2801,8 +2886,9 @@ static int create_mad_qp(struct ib_mad_qp_info *qp_info, qp_init_attr.event_handler = qp_event_handler; qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); if (IS_ERR(qp_info->qp)) { - printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n", - get_spl_qp_index(qp_type)); + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); ret = PTR_ERR(qp_info->qp); goto error; } @@ -2840,7 +2926,7 @@ static int ib_mad_port_open(struct ib_device *device, /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR PFX "No memory for ib_mad_port_private\n"); + dev_err(&device->dev, "No memory for ib_mad_port_private\n"); return -ENOMEM; } @@ -2860,21 +2946,21 @@ static int ib_mad_port_open(struct ib_device *device, ib_mad_thread_completion_handler, NULL, port_priv, cq_size, 0); if (IS_ERR(port_priv->cq)) { - printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n"); + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); goto error3; } port_priv->pd = ib_alloc_pd(device); if (IS_ERR(port_priv->pd)) { - printk(KERN_ERR PFX "Couldn't create ib_mad PD\n"); + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); goto error4; } port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(port_priv->mr)) { - printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n"); + dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n"); ret = PTR_ERR(port_priv->mr); goto error5; } @@ -2902,7 +2988,7 @@ static int 
ib_mad_port_open(struct ib_device *device, ret = ib_mad_port_start(port_priv); if (ret) { - printk(KERN_ERR PFX "Couldn't start port\n"); + dev_err(&device->dev, "Couldn't start port\n"); goto error9; } @@ -2946,7 +3032,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - printk(KERN_ERR PFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); @@ -2984,14 +3070,12 @@ static void ib_mad_init_device(struct ib_device *device) for (i = start; i <= end; i++) { if (ib_mad_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } if (ib_agent_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); goto error_agent; } } @@ -2999,20 +3083,17 @@ static void ib_mad_init_device(struct ib_device *device) error_agent: if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); error: i--; while (i >= start) { if (ib_agent_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); i--; } } @@ -3033,12 +3114,12 @@ static void ib_mad_remove_device(struct ib_device *device) } for (i = 0; i < num_ports; i++, cur_port++) { if (ib_agent_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, cur_port); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", + cur_port); if (ib_mad_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, cur_port); + dev_err(&device->dev, "Couldn't close port %d\n", + cur_port); } } @@ -3064,7 +3145,7 @@ static int __init ib_mad_init_module(void) SLAB_HWCACHE_ALIGN, NULL); if (!ib_mad_cache) { - printk(KERN_ERR PFX "Couldn't create ib_mad cache\n"); + pr_err("Couldn't create ib_mad cache\n"); ret = -ENOMEM; goto error1; } @@ -3072,7 +3153,7 @@ static int __init ib_mad_init_module(void) INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { - printk(KERN_ERR PFX "Couldn't register ib_mad client\n"); + pr_err("Couldn't register ib_mad client\n"); ret = -EINVAL; goto error2; } diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 9430ab4969c..d1a0b0ee944 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -42,9 +42,6 @@ #include <rdma/ib_mad.h> #include <rdma/ib_smi.h> - -#define PFX "ib_mad: " - #define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ /* QP and CQ parameters */ diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 233eaf541f5..c38f030f0dc 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1184,7 +1184,7 @@ static void ib_sa_add_one(struct ib_device *device) sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, 
IB_QPT_GSI, NULL, 0, send_handler, - recv_handler, sa_dev); + recv_handler, sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) goto err; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 1acb9910055..928cdd20e2d 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -33,6 +33,8 @@ * SOFTWARE. */ +#define pr_fmt(fmt) "user_mad: " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/device.h> @@ -504,13 +506,15 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); - if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) { - copy_offset = IB_MGMT_MAD_HDR; - rmpp_active = 0; - } else { + + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && ib_mad_kernel_rmpp_agent(agent)) { copy_offset = IB_MGMT_RMPP_HDR; rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & - IB_MGMT_RMPP_FLAG_ACTIVE; + IB_MGMT_RMPP_FLAG_ACTIVE; + } else { + copy_offset = IB_MGMT_MAD_HDR; + rmpp_active = 0; } data_len = count - hdr_size(file) - hdr_len; @@ -556,14 +560,22 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, rmpp_mad->mad_hdr.tid = *tid; } - spin_lock_irq(&file->send_lock); - ret = is_duplicate(file, packet); - if (!ret) + if (!ib_mad_kernel_rmpp_agent(agent) + && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + spin_lock_irq(&file->send_lock); list_add_tail(&packet->list, &file->send_list); - spin_unlock_irq(&file->send_lock); - if (ret) { - ret = -EINVAL; - goto err_msg; + spin_unlock_irq(&file->send_lock); + } else { + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; + } } ret = ib_post_send_mad(packet->msg, NULL); @@ -614,6 +626,8 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, mutex_lock(&file->mutex); if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid device\n"); ret = -EPIPE; goto out; } @@ -624,6 +638,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, } if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid QPN %d specified\n", + ureq.qpn); ret = -EINVAL; goto out; } @@ -632,11 +649,15 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, if (!__get_agent(file, agent_id)) goto found; + dev_notice(file->port->dev, + "ib_umad_reg_agent: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; found: if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; memcpy(req.oui, ureq.oui, sizeof req.oui); @@ -657,7 +678,7 @@ found: ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, ureq.mgmt_class ? 
&req : NULL, ureq.rmpp_version, - send_handler, recv_handler, file); + send_handler, recv_handler, file, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); agent = NULL; @@ -673,10 +694,11 @@ found: if (!file->already_used) { file->already_used = 1; if (!file->use_pkey_index) { - printk(KERN_WARNING "user_mad: process %s did not enable " - "P_Key index support.\n", current->comm); - printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " - "has info on the new ABI.\n"); + dev_warn(file->port->dev, + "process %s did not enable P_Key index support.\n", + current->comm); + dev_warn(file->port->dev, + " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); } } @@ -694,6 +716,119 @@ out: return ret; } +static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) +{ + struct ib_user_mad_reg_req2 ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", + ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + ret = -EINVAL; + + if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, + (u32 __user *) (arg + offsetof(struct + ib_user_mad_reg_req2, flags)))) + ret = -EFAULT; + + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent2: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + if (ureq.oui & 0xff000000) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + ureq.oui); + ret = -EINVAL; + goto out; + } + req.oui[2] = ureq.oui & 0x0000ff; + req.oui[1] = (ureq.oui & 0x00ff00) >> 8; + req.oui[0] = (ureq.oui & 0xff0000) >> 16; + memcpy(req.method_mask, ureq.method_mask, + sizeof(req.method_mask)); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? 
&req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, + ureq.flags); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *)(arg + + offsetof(struct ib_user_mad_reg_req2, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + file->use_pkey_index = 1; + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + + static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) { struct ib_mad_agent *agent = NULL; @@ -749,6 +884,8 @@ static long ib_umad_ioctl(struct file *filp, unsigned int cmd, return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); default: return -ENOIOCTLCMD; } @@ -765,6 +902,8 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); default: return -ENOIOCTLCMD; } @@ -983,7 +1122,7 @@ static CLASS_ATTR_STRING(abi_version, S_IRUGO, static dev_t overflow_maj; static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); -static int find_overflow_devnum(void) +static int find_overflow_devnum(struct ib_device *device) { int ret; @@ -991,7 +1130,8 @@ static int find_overflow_devnum(void) ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register dynamic device number\n"); + dev_err(&device->dev, + "couldn't register dynamic device number\n"); return ret; } } @@ -1014,7 +1154,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); if (devnum >= IB_UMAD_MAX_PORTS) { spin_unlock(&port_lock); - devnum = find_overflow_devnum(); + devnum = find_overflow_devnum(device); if (devnum < 0) return -1; @@ -1200,14 +1340,14 @@ static int __init ib_umad_init(void) ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register device number\n"); + pr_err("couldn't register device number\n"); goto out; } umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); - printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n"); + pr_err("couldn't create class infiniband_mad\n"); goto out_chrdev; } @@ -1215,13 +1355,13 @@ static int __init ib_umad_init(void) ret = class_create_file(umad_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n"); + pr_err("couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&umad_client); if (ret) { - printk(KERN_ERR "user_mad: couldn't register ib_umad client\n"); + pr_err("couldn't register ib_umad client\n"); goto out_class; } diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index a283274a5a0..643c08a025a 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -221,6 +221,7 @@ 
IB_UVERBS_DECLARE_CMD(query_port); IB_UVERBS_DECLARE_CMD(alloc_pd); IB_UVERBS_DECLARE_CMD(dealloc_pd); IB_UVERBS_DECLARE_CMD(reg_mr); +IB_UVERBS_DECLARE_CMD(rereg_mr); IB_UVERBS_DECLARE_CMD(dereg_mr); IB_UVERBS_DECLARE_CMD(alloc_mw); IB_UVERBS_DECLARE_CMD(dealloc_mw); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index ea6203ee7bc..0600c50e621 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1002,6 +1002,99 @@ err_free: return ret; } +ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_udata udata; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if ((cmd.flags & IB_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, + file->ucontext); + + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_PD) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + if (atomic_read(&mr->usecnt)) { + ret = -EBUSY; + goto put_uobj_pd; + } + + old_pd = mr->pd; + ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd, &udata); + if (!ret) { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + else + ret = in_len; + +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + put_pd_read(pd); + +put_uobjs: + + put_uobj_write(mr->uobject); + + return ret; +} + ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 08219fb3338..c73b22a257f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -87,6 +87,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr, [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c index 49e0e8533f7..1b63185b4ad 100644 --- a/drivers/infiniband/hw/amso1100/c2_cq.c +++ b/drivers/infiniband/hw/amso1100/c2_cq.c @@ -260,11 +260,14 @@ static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq 
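/*
 * The c2_alloc_cq_buf() change just below widens the size
 * parameters to size_t and refuses any q_size whose product with
 * msg_size would wrap. The same guard as a standalone helper
 * (msg_size is assumed non-zero, which holds for the fixed message
 * sizes c2 passes in):
 *
 *	static inline int checked_pool_len(size_t q_size, size_t msg_size,
 *					   size_t *len)
 *	{
 *		if (!msg_size || q_size > SIZE_MAX / msg_size)
 *			return -EINVAL;	// multiplication would overflow
 *		*len = q_size * msg_size;
 *		return 0;
 *	}
 */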
*mq) mq->msg_pool.host, dma_unmap_addr(mq, mapping)); } -static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, int q_size, - int msg_size) +static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, + size_t q_size, size_t msg_size) { u8 *pool_start; + if (q_size > SIZE_MAX / msg_size) + return -EINVAL; + pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size, &mq->host_dma, GFP_KERNEL); if (!pool_start) diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index fbe6051af25..c9df0549f51 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -227,6 +227,7 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) chp = get_chp(dev, qid); if (chp) { + t4_clear_cq_armed(&chp->cq); spin_lock_irqsave(&chp->comp_handler_lock, flag); (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); spin_unlock_irqrestore(&chp->comp_handler_lock, flag); diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index c158fcc02bc..41cd6882b64 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1105,7 +1105,7 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, struct c4iw_cq *schp) { int count; - int flushed; + int rq_flushed, sq_flushed; unsigned long flag; PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); @@ -1123,27 +1123,40 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, c4iw_flush_hw_cq(rchp); c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); - flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, flag); - if (flushed) { - spin_lock_irqsave(&rchp->comp_handler_lock, flag); - (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); - spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); - } /* locking hierarchy: cq lock first, then qp lock. 
*/ spin_lock_irqsave(&schp->lock, flag); spin_lock(&qhp->lock); if (schp != rchp) c4iw_flush_hw_cq(schp); - flushed = c4iw_flush_sq(qhp); + sq_flushed = c4iw_flush_sq(qhp); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, flag); - if (flushed) { - spin_lock_irqsave(&schp->comp_handler_lock, flag); - (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); - spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + + if (schp == rchp) { + if (t4_clear_cq_armed(&rchp->cq) && + (rq_flushed || sq_flushed)) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + } else { + if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + if (t4_clear_cq_armed(&schp->cq) && sq_flushed) { + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } } } diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index df5edfa31a8..c04e5134b30 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -524,6 +524,10 @@ static inline int t4_wq_db_enabled(struct t4_wq *wq) return !wq->rq.queue[wq->rq.size].status.db_off; } +enum t4_cq_flags { + CQ_ARMED = 1, +}; + struct t4_cq { struct t4_cqe *queue; dma_addr_t dma_addr; @@ -544,12 +548,19 @@ struct t4_cq { u16 cidx_inc; u8 gen; u8 error; + unsigned long flags; }; +static inline int t4_clear_cq_armed(struct t4_cq *cq) +{ + return test_and_clear_bit(CQ_ARMED, &cq->flags); +} + static inline int t4_arm_cq(struct t4_cq *cq, int se) { u32 val; + set_bit(CQ_ARMED, &cq->flags); while (cq->cidx_inc > CIDXINC_MASK) { val = SEINTARM(0) | CIDXINC(CIDXINC_MASK) | TIMERREG(7) | INGRESSQID(cq->cqid); diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 43f2d0424d4..e890e5ba0e0 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -726,7 +726,7 @@ bail: * @dd: the infinipath device * @pkeys: the PKEY table */ -static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) +static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port) { struct ipath_portdata *pd; int i; @@ -759,6 +759,7 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) } if (changed) { u64 pkey; + struct ib_event event; pkey = (u64) dd->ipath_pkeys[0] | ((u64) dd->ipath_pkeys[1] << 16) | @@ -768,12 +769,17 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) (unsigned long long) pkey); ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, pkey); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); } return 0; } static int recv_subn_set_pkeytable(struct ib_smp *smp, - struct ib_device *ibdev) + struct ib_device *ibdev, u8 port) { u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); __be16 *p = (__be16 *) smp->data; @@ -784,7 +790,7 @@ static int recv_subn_set_pkeytable(struct ib_smp *smp, for (i = 0; i < n; i++) q[i] = be16_to_cpu(p[i]); - if (startpx != 0 || set_pkeys(dev->dd, q) != 0) + if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0) smp->status |= IB_SMP_INVALID_FIELD; return 
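/*
 * The cxgb4 rework above hinges on one rule: t4_arm_cq() records
 * the arming in cq->flags, and any path that wants to deliver a
 * completion upcall must win test_and_clear_bit() first, so a QP
 * flush cannot raise a second notification for the same arming.
 * The claim is a single atomic operation:
 *
 *	if (t4_clear_cq_armed(&chp->cq) && work_was_flushed) {
 *		spin_lock_irqsave(&chp->comp_handler_lock, flag);
 *		(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
 *		spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
 *	}
 *
 * work_was_flushed stands in for the rq_flushed/sq_flushed results
 * computed in __flush_qp().
 */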
recv_subn_get_pkeytable(smp, ibdev); @@ -1342,7 +1348,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, ret = recv_subn_set_portinfo(smp, ibdev, port_num); goto bail; case IB_SMP_ATTR_PKEY_TABLE: - ret = recv_subn_set_pkeytable(smp, ibdev); + ret = recv_subn_set_pkeytable(smp, ibdev, port_num); goto bail; case IB_SMP_ATTR_SM_INFO: if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 287ad0564ac..82a7dd87089 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -891,7 +891,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) agent = ib_register_mad_agent(&dev->ib_dev, p + 1, q ? IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 0f7027e7db1..e1e558a3d69 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -910,8 +910,7 @@ static int __mlx4_ib_default_rules_match(struct ib_qp *qp, const struct default_rules *pdefault_rules = default_table; u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); - for (i = 0; i < sizeof(default_table)/sizeof(default_table[0]); i++, - pdefault_rules++) { + for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) { __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; memset(&field_types, 0, sizeof(field_types)); @@ -965,8 +964,7 @@ static int __mlx4_ib_create_default_rules( int size = 0; int i; - for (i = 0; i < sizeof(pdefault_rules->rules_create_list)/ - sizeof(pdefault_rules->rules_create_list[0]); i++) { + for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) { int ret; union ib_flow_spec ib_spec; switch (pdefault_rules->rules_create_list[i]) { @@ -2007,6 +2005,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | @@ -2059,6 +2058,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; + ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 369da3ca5d6..e8cad3926bf 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -788,5 +788,9 @@ int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach); +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata); #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index cb2a8727f3f..9b0e80e59b0 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -144,8 +144,10 @@ 
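/*
 * The mlx4 loops above switch from open-coded sizeof division to
 * ARRAY_SIZE from <linux/kernel.h>. In its simplest form (the
 * kernel version additionally fails the build when handed a
 * pointer instead of an array):
 *
 *	#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 *
 *	for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++)
 *		...
 */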
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); + /* Force registering the memory as writable. */ + /* Used for memory re-registeration. HCA protects the access */ mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags, 0); + access_flags | IB_ACCESS_LOCAL_WRITE, 0); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -183,6 +185,90 @@ err_free: return ERR_PTR(err); } +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(mr->device); + struct mlx4_ib_mr *mmr = to_mmr(mr); + struct mlx4_mpt_entry *mpt_entry; + struct mlx4_mpt_entry **pmpt_entry = &mpt_entry; + int err; + + /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs, + * we assume that the calls can't run concurrently. Otherwise, a + * race exists. + */ + err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry); + + if (err) + return err; + + if (flags & IB_MR_REREG_PD) { + err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry, + to_mpd(pd)->pdn); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_ACCESS) { + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, + convert_access(mr_access_flags)); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_TRANS) { + int shift; + int err; + int n; + + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + mmr->umem = ib_umem_get(mr->uobject->context, start, length, + mr_access_flags | + IB_ACCESS_LOCAL_WRITE, + 0); + if (IS_ERR(mmr->umem)) { + err = PTR_ERR(mmr->umem); + mmr->umem = NULL; + goto release_mpt_entry; + } + n = ib_umem_page_count(mmr->umem); + shift = ilog2(mmr->umem->page_size); + + mmr->mmr.iova = virt_addr; + mmr->mmr.size = length; + err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, + virt_addr, length, n, shift, + *pmpt_entry); + if (err) { + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + + err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem); + if (err) { + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + } + + /* If we couldn't transfer the MR to the HCA, just remember to + * return a failure. But dereg_mr will free the resources. + */ + err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry); + +release_mpt_entry: + mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry); + + return err; +} + int mlx4_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7efe6e3f354..8c574b63d77 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2501,7 +2501,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qp->sq.lock, flags); for (nreq = 0; wr; nreq++, wr = wr->next) { - if (unlikely(wr->opcode >= sizeof(mlx5_ib_opcode) / sizeof(mlx5_ib_opcode[0]))) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { mlx5_ib_warn(dev, "\n"); err = -EINVAL; *bad_wr = wr; diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index b6f7f457fc5..8881fa376e0 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -294,7 +294,7 @@ int mthca_create_agents(struct mthca_dev *dev) agent = ib_register_mad_agent(&dev->ib_dev, p + 1, q ? 
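/*
 * Skeleton of the mlx4_ib_rereg_user_mr() sequence above: snapshot
 * the MPT entry, patch only what the flags request, write it back,
 * and always release the entry. Error handling and the umem
 * re-pinning details are elided; remap_user_pages() is a
 * placeholder for the IB_MR_REREG_TRANS branch, not a real symbol.
 *
 *	err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
 *	if (!err && (flags & IB_MR_REREG_PD))
 *		err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry, new_pdn);
 *	if (!err && (flags & IB_MR_REREG_ACCESS))
 *		err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, acc);
 *	if (!err && (flags & IB_MR_REREG_TRANS))
 *		err = remap_user_pages(...);	// re-pin + rewrite MTTs
 *	if (!err)
 *		err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
 *	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
 */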
IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 19011dbb930..b43456ae124 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -40,7 +40,7 @@ #include <be_roce.h> #include "ocrdma_sli.h" -#define OCRDMA_ROCE_DRV_VERSION "10.2.145.0u" +#define OCRDMA_ROCE_DRV_VERSION "10.2.287.0u" #define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver" #define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" @@ -137,6 +137,7 @@ struct mqe_ctx { u16 cqe_status; u16 ext_status; bool cmd_done; + bool fw_error_state; }; struct ocrdma_hw_mr { @@ -235,7 +236,10 @@ struct ocrdma_dev { struct list_head entry; struct rcu_head rcu; int id; - u64 stag_arr[OCRDMA_MAX_STAG]; + u64 *stag_arr; + u8 sl; /* service level */ + bool pfc_state; + atomic_t update_sl; u16 pvid; u32 asic_id; @@ -518,4 +522,22 @@ static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; } +static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio) +{ + return *(pfc + prio); +} + +static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio) +{ + return *(app_prio + prio); +} + +static inline u8 ocrdma_is_enabled_and_synced(u32 state) +{ /* May also be used to interpret TC-state, QCN-state + * Appl-state and Logical-link-state in future. + */ + return (state & OCRDMA_STATE_FLAG_ENABLED) && + (state & OCRDMA_STATE_FLAG_SYNC); +} + #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index d4cc01f10c0..40f8536c10b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -35,6 +35,8 @@ #include "ocrdma_ah.h" #include "ocrdma_hw.h" +#define OCRDMA_VID_PCP_SHIFT 0xD + static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, struct ib_ah_attr *attr, int pdid) { @@ -55,7 +57,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, if (vlan_tag && (vlan_tag < 0x1000)) { eth.eth_type = cpu_to_be16(0x8100); eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); - vlan_tag |= (attr->sl & 7) << 13; + vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT; eth.vlan_tag = cpu_to_be16(vlan_tag); eth_sz = sizeof(struct ocrdma_eth_vlan); vlan_enabled = true; @@ -100,6 +102,8 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) if (!(attr->ah_flags & IB_AH_GRH)) return ERR_PTR(-EINVAL); + if (atomic_cmpxchg(&dev->update_sl, 1, 0)) + ocrdma_init_service_level(dev); ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 3bbf2010a82..dd35ae558ae 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -525,7 +525,7 @@ static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev, cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; cmd->eqn = eq->id; - cmd->cqe_count = cq->size / sizeof(struct ocrdma_mcqe); + cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe); ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE, cq->dma, PAGE_SIZE_4K); @@ -661,7 +661,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, { struct ocrdma_qp *qp = NULL; struct ocrdma_cq *cq = NULL; - struct ib_event ib_evt = { 0 }; + struct ib_event ib_evt; int cq_event = 0; int qp_event = 1; int 
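/*
 * The set_av_attr() change above stops taking the priority from
 * attr->sl and uses the DCBX-derived dev->sl instead. The shift
 * constant encodes the 802.1Q tag layout:
 *
 *	bits 15..13  PCP (priority)   <- (dev->sl & 0x07) << 13
 *	bit  12      DEI
 *	bits 11..0   VID
 *
 *	static inline u16 build_tci(u16 vid, u8 prio)
 *	{
 *		return (vid & 0x0fff) |
 *		       ((prio & 0x7) << OCRDMA_VID_PCP_SHIFT);
 *	}
 */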
srq_event = 0; @@ -674,6 +674,8 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID) cq = dev->cq_tbl[cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK]; + memset(&ib_evt, 0, sizeof(ib_evt)); + ib_evt.device = &dev->ibdev; switch (type) { @@ -771,6 +773,10 @@ static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, OCRDMA_AE_PVID_MCQE_TAG_MASK) >> OCRDMA_AE_PVID_MCQE_TAG_SHIFT); break; + + case OCRDMA_ASYNC_EVENT_COS_VALUE: + atomic_set(&dev->update_sl, 1); + break; default: /* Not interested evts. */ break; @@ -962,8 +968,12 @@ static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev) msecs_to_jiffies(30000)); if (status) return 0; - else + else { + dev->mqe_ctx.fw_error_state = true; + pr_err("%s(%d) mailbox timeout: fw not responding\n", + __func__, dev->id); return -1; + } } /* issue a mailbox command on the MQ */ @@ -975,6 +985,8 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) struct ocrdma_mbx_rsp *rsp = NULL; mutex_lock(&dev->mqe_ctx.lock); + if (dev->mqe_ctx.fw_error_state) + goto mbx_err; ocrdma_post_mqe(dev, mqe); status = ocrdma_wait_mqe_cmpl(dev); if (status) @@ -1078,7 +1090,8 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; attr->max_mw = rsp->max_mw; attr->max_mr = rsp->max_mr; - attr->max_mr_size = ~0ull; + attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) | + rsp->max_mr_size_lo; attr->max_fmr = 0; attr->max_pages_per_frmr = rsp->max_pages_per_frmr; attr->max_num_mr_pbl = rsp->max_num_mr_pbl; @@ -1252,7 +1265,9 @@ static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; - dev->hba_port_num = hba_attribs->phy_port; + dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv & + OCRDMA_HBA_ATTRB_PTNUM_MASK) + >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT; strncpy(dev->model_number, hba_attribs->controller_model_number, 31); } @@ -1302,7 +1317,8 @@ int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed) goto mbx_err; rsp = (struct ocrdma_get_link_speed_rsp *)cmd; - *lnk_speed = rsp->phys_port_speed; + *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK) + >> OCRDMA_PHY_PS_SHIFT; mbx_err: kfree(cmd); @@ -1328,11 +1344,16 @@ static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) goto mbx_err; rsp = (struct ocrdma_get_phy_info_rsp *)cmd; - dev->phy.phy_type = le16_to_cpu(rsp->phy_type); + dev->phy.phy_type = + (rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK); + dev->phy.interface_type = + (rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK) + >> OCRDMA_IF_TYPE_SHIFT; dev->phy.auto_speeds_supported = - le16_to_cpu(rsp->auto_speeds_supported); + (rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK); dev->phy.fixed_speeds_supported = - le16_to_cpu(rsp->fixed_speeds_supported); + (rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK) + >> OCRDMA_FSPEED_SUPP_SHIFT; mbx_err: kfree(cmd); return status; @@ -1457,8 +1478,8 @@ static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va; for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) { - pbes[i].pa_lo = (u32) (pa & 0xffffffff); - pbes[i].pa_hi = (u32) upper_32_bits(pa); + pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff); + pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa)); pa += PAGE_SIZE; } cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF); @@ -1501,6 +1522,7 @@ static void ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev) 
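/*
 * ocrdma_get_attr() above now reports the real max_mr_size by
 * joining the two 32-bit halves from the firmware response. The
 * (u64) cast is the load-bearing part: without it the shift is
 * done in 32-bit arithmetic and the high word is lost.
 *
 *	static inline u64 make_u64(u32 hi, u32 lo)
 *	{
 *		return ((u64)hi << 32) | lo;
 *	}
 */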
ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, dev->av_tbl.pa); + dev->av_tbl.va = NULL; dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, dev->av_tbl.pbl.pa); kfree(cmd); @@ -1624,14 +1646,16 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP << OCRDMA_CREATE_CQ_TYPE_SHIFT; cq->phase_change = false; - cmd->cmd.cqe_count = (cq->len / cqe_size); + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size); } else { - cmd->cmd.cqe_count = (cq->len / cqe_size) - 1; + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1; cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID; cq->phase_change = true; } - cmd->cmd.pd_id = pd_id; /* valid only for v3 */ + /* pd_id valid only for v3 */ + cmd->cmd.pdid_cqecnt |= (pd_id << + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT); ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size); status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); if (status) @@ -2206,7 +2230,8 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK; qp->rq_cq = cq; - if (pd->dpp_enabled && pd->num_dpp_qp) { + if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && + (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) { ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq, dpp_cq_id); } @@ -2264,6 +2289,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, if ((ah_attr->ah_flags & IB_AH_GRH) == 0) return -EINVAL; + if (atomic_cmpxchg(&qp->dev->update_sl, 1, 0)) + ocrdma_init_service_level(qp->dev); cmd->params.tclass_sq_psn |= (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT); cmd->params.rnt_rc_sl_fl |= @@ -2297,6 +2324,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, cmd->params.vlan_dmac_b4_to_b5 |= vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID; + cmd->params.rnt_rc_sl_fl |= + (qp->dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT; } return 0; } @@ -2604,6 +2633,168 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) return status; } +static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, + struct ocrdma_dcbx_cfg *dcbxcfg) +{ + int status = 0; + dma_addr_t pa; + struct ocrdma_mqe cmd; + + struct ocrdma_get_dcbx_cfg_req *req = NULL; + struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge; + + memset(&cmd, 0, sizeof(struct ocrdma_mqe)); + cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp), + sizeof(struct ocrdma_get_dcbx_cfg_req)); + req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL); + if (!req) { + status = -ENOMEM; + goto mem_err; + } + + cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL); + mqe_sge->pa_hi = (u32) upper_32_bits(pa); + mqe_sge->len = cmd.hdr.pyld_len; + + memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req)); + ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG, + OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len); + req->param_type = ptype; + + status = ocrdma_mbx_cmd(dev, &cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req; + ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp)); + memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg)); + +mbx_err: + dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, 
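/*
 * The create-CQ command above now carries the PD id and the CQE
 * count in a single word: the count stays in the low half and the
 * PD id is shifted into the high half
 * (OCRDMA_CREATE_CQ_CMD_PDID_SHIFT is 0x10). Schematically:
 *
 *	cmd->cmd.pdid_cqecnt = (cqe_count & 0xffff) |
 *			((u32)pd_id << OCRDMA_CREATE_CQ_CMD_PDID_SHIFT);
 */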
req, pa); +mem_err: + return status; +} + +#define OCRDMA_MAX_SERVICE_LEVEL_INDEX 0x08 +#define OCRDMA_DEFAULT_SERVICE_LEVEL 0x05 + +static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype, + struct ocrdma_dcbx_cfg *dcbxcfg, + u8 *srvc_lvl) +{ + int status = -EINVAL, indx, slindx; + int ventry_cnt; + struct ocrdma_app_parameter *app_param; + u8 valid, proto_sel; + u8 app_prio, pfc_prio; + u16 proto; + + if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) { + pr_info("%s ocrdma%d DCBX is disabled\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + goto out; + } + + if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) { + pr_info("%s ocrdma%d priority flow control(%s) is %s%s\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + (ptype > 0 ? "operational" : "admin"), + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ? + "enabled" : "disabled", + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ? + "" : ", not sync'ed"); + goto out; + } else { + pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + } + + ventry_cnt = (dcbxcfg->tcv_aev_opv_st >> + OCRDMA_DCBX_APP_ENTRY_SHIFT) + & OCRDMA_DCBX_STATE_MASK; + + for (indx = 0; indx < ventry_cnt; indx++) { + app_param = &dcbxcfg->app_param[indx]; + valid = (app_param->valid_proto_app >> + OCRDMA_APP_PARAM_VALID_SHIFT) + & OCRDMA_APP_PARAM_VALID_MASK; + proto_sel = (app_param->valid_proto_app + >> OCRDMA_APP_PARAM_PROTO_SEL_SHIFT) + & OCRDMA_APP_PARAM_PROTO_SEL_MASK; + proto = app_param->valid_proto_app & + OCRDMA_APP_PARAM_APP_PROTO_MASK; + + if ( + valid && proto == OCRDMA_APP_PROTO_ROCE && + proto_sel == OCRDMA_PROTO_SELECT_L2) { + for (slindx = 0; slindx < + OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) { + app_prio = ocrdma_get_app_prio( + (u8 *)app_param->app_prio, + slindx); + pfc_prio = ocrdma_get_pfc_prio( + (u8 *)dcbxcfg->pfc_prio, + slindx); + + if (app_prio && pfc_prio) { + *srvc_lvl = slindx; + status = 0; + goto out; + } + } + if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) { + pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n", + dev_name(&dev->nic_info.pdev->dev), + dev->id, proto); + } + } + } + +out: + return status; +} + +void ocrdma_init_service_level(struct ocrdma_dev *dev) +{ + int status = 0, indx; + struct ocrdma_dcbx_cfg dcbxcfg; + u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL; + int ptype = OCRDMA_PARAMETER_TYPE_OPER; + + for (indx = 0; indx < 2; indx++) { + status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg); + if (status) { + pr_err("%s(): status=%d\n", __func__, status); + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + status = ocrdma_parse_dcbxcfg_rsp(dev, ptype, + &dcbxcfg, &srvc_lvl); + if (status) { + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + break; + } + + if (status) + pr_info("%s ocrdma%d service level default\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + else + pr_info("%s ocrdma%d service level %d\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + srvc_lvl); + + dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state); + dev->sl = srvc_lvl; +} + int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) { int i; @@ -2709,13 +2900,15 @@ int ocrdma_init_hw(struct ocrdma_dev *dev) goto conf_err; status = ocrdma_mbx_get_phy_info(dev); if (status) - goto conf_err; + goto info_attrb_err; status = ocrdma_mbx_get_ctrl_attribs(dev); if (status) - goto conf_err; + goto info_attrb_err; return 0; +info_attrb_err: + ocrdma_mbx_delete_ah_tbl(dev); conf_err: 
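/*
 * Shape of the probe in ocrdma_init_service_level() above: ask for
 * the operational DCBX parameters first, fall back to the admin
 * set, and keep the 0x05 default if neither yields a RoCE
 * priority. get_cfg() and parse() are stand-ins for the two
 * mailbox helpers, not real symbols.
 *
 *	int ptype = OCRDMA_PARAMETER_TYPE_OPER;
 *	u8 sl = OCRDMA_DEFAULT_SERVICE_LEVEL;
 *
 *	for (i = 0; i < 2; i++) {
 *		if (!get_cfg(dev, ptype, &cfg) &&
 *		    !parse(dev, ptype, &cfg, &sl))
 *			break;			// found a usable priority
 *		ptype = OCRDMA_PARAMETER_TYPE_ADMIN;
 *	}
 */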
ocrdma_destroy_mq(dev); mq_err: diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index e513f729314..6eed8f19132 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -135,4 +135,6 @@ int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); char *port_speed_string(struct ocrdma_dev *dev); +void ocrdma_init_service_level(struct ocrdma_dev *); + #endif /* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 7c504e07974..256a06bc0b6 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -324,6 +324,11 @@ static int ocrdma_alloc_resources(struct ocrdma_dev *dev) if (!dev->qp_tbl) goto alloc_err; } + + dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL); + if (dev->stag_arr == NULL) + goto alloc_err; + spin_lock_init(&dev->av_tbl.lock); spin_lock_init(&dev->flush_q_lock); return 0; @@ -334,6 +339,7 @@ alloc_err: static void ocrdma_free_resources(struct ocrdma_dev *dev) { + kfree(dev->stag_arr); kfree(dev->qp_tbl); kfree(dev->cq_tbl); kfree(dev->sgid_tbl); @@ -353,15 +359,25 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct ocrdma_dev *dev = dev_get_drvdata(device); - return scnprintf(buf, PAGE_SIZE, "%s", &dev->attr.fw_ver[0]); + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]); +} + +static ssize_t show_hca_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); static struct device_attribute *ocrdma_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver + &dev_attr_fw_ver, + &dev_attr_hca_type }; static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) @@ -372,6 +388,58 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); } +static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ + struct in_device *in_dev; + union ib_gid gid; + in_dev = in_dev_get(net); + if (in_dev) { + for_ifa(in_dev) { + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + ocrdma_add_sgid(dev, &gid); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +} + +static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; + in6_dev = in6_dev_get(net); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + ocrdma_add_sgid(dev, pgid); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void ocrdma_init_gid_table(struct ocrdma_dev *dev) +{ + struct net_device *net_dev; + + for_each_netdev(&init_net, net_dev) { + struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ? 
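/*
 * ocrdma_init_ipv4_gids() above relies on the RoCE convention that
 * an IPv4 address is represented as the v4-mapped IPv6 address
 * ::ffff:a.b.c.d. That is all ipv6_addr_set_v4mapped() writes into
 * the 16-byte GID:
 *
 *	union ib_gid gid = { };		// bytes 0..9 stay zero
 *	gid.raw[10] = 0xff;
 *	gid.raw[11] = 0xff;
 *	memcpy(&gid.raw[12], &ifa->ifa_address, 4);	// IPv4, network order
 */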
+ rdma_vlan_dev_real_dev(net_dev) : net_dev; + + if (real_dev == dev->nic_info.netdev) { + ocrdma_init_ipv4_gids(dev, net_dev); + ocrdma_init_ipv6_gids(dev, net_dev); + } + } +} + static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) { int status = 0, i; @@ -399,6 +467,8 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) if (status) goto alloc_err; + ocrdma_init_service_level(dev); + ocrdma_init_gid_table(dev); status = ocrdma_register_device(dev); if (status) goto alloc_err; @@ -508,6 +578,12 @@ static int ocrdma_close(struct ocrdma_dev *dev) return 0; } +static void ocrdma_shutdown(struct ocrdma_dev *dev) +{ + ocrdma_close(dev); + ocrdma_remove(dev); +} + /* event handling via NIC driver ensures that all the NIC specific * initialization done before RoCE driver notifies * event to stack. @@ -521,6 +597,9 @@ static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) case BE_DEV_DOWN: ocrdma_close(dev); break; + case BE_DEV_SHUTDOWN: + ocrdma_shutdown(dev); + break; } } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 96c9ee602ba..904989ec5ea 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -44,35 +44,39 @@ enum { #define OCRDMA_SUBSYS_ROCE 10 enum { OCRDMA_CMD_QUERY_CONFIG = 1, - OCRDMA_CMD_ALLOC_PD, - OCRDMA_CMD_DEALLOC_PD, - - OCRDMA_CMD_CREATE_AH_TBL, - OCRDMA_CMD_DELETE_AH_TBL, - - OCRDMA_CMD_CREATE_QP, - OCRDMA_CMD_QUERY_QP, - OCRDMA_CMD_MODIFY_QP, - OCRDMA_CMD_DELETE_QP, - - OCRDMA_CMD_RSVD1, - OCRDMA_CMD_ALLOC_LKEY, - OCRDMA_CMD_DEALLOC_LKEY, - OCRDMA_CMD_REGISTER_NSMR, - OCRDMA_CMD_REREGISTER_NSMR, - OCRDMA_CMD_REGISTER_NSMR_CONT, - OCRDMA_CMD_QUERY_NSMR, - OCRDMA_CMD_ALLOC_MW, - OCRDMA_CMD_QUERY_MW, - - OCRDMA_CMD_CREATE_SRQ, - OCRDMA_CMD_QUERY_SRQ, - OCRDMA_CMD_MODIFY_SRQ, - OCRDMA_CMD_DELETE_SRQ, - - OCRDMA_CMD_ATTACH_MCAST, - OCRDMA_CMD_DETACH_MCAST, - OCRDMA_CMD_GET_RDMA_STATS, + OCRDMA_CMD_ALLOC_PD = 2, + OCRDMA_CMD_DEALLOC_PD = 3, + + OCRDMA_CMD_CREATE_AH_TBL = 4, + OCRDMA_CMD_DELETE_AH_TBL = 5, + + OCRDMA_CMD_CREATE_QP = 6, + OCRDMA_CMD_QUERY_QP = 7, + OCRDMA_CMD_MODIFY_QP = 8 , + OCRDMA_CMD_DELETE_QP = 9, + + OCRDMA_CMD_RSVD1 = 10, + OCRDMA_CMD_ALLOC_LKEY = 11, + OCRDMA_CMD_DEALLOC_LKEY = 12, + OCRDMA_CMD_REGISTER_NSMR = 13, + OCRDMA_CMD_REREGISTER_NSMR = 14, + OCRDMA_CMD_REGISTER_NSMR_CONT = 15, + OCRDMA_CMD_QUERY_NSMR = 16, + OCRDMA_CMD_ALLOC_MW = 17, + OCRDMA_CMD_QUERY_MW = 18, + + OCRDMA_CMD_CREATE_SRQ = 19, + OCRDMA_CMD_QUERY_SRQ = 20, + OCRDMA_CMD_MODIFY_SRQ = 21, + OCRDMA_CMD_DELETE_SRQ = 22, + + OCRDMA_CMD_ATTACH_MCAST = 23, + OCRDMA_CMD_DETACH_MCAST = 24, + + OCRDMA_CMD_CREATE_RBQ = 25, + OCRDMA_CMD_DESTROY_RBQ = 26, + + OCRDMA_CMD_GET_RDMA_STATS = 27, OCRDMA_CMD_MAX }; @@ -103,7 +107,7 @@ enum { #define OCRDMA_MAX_QP 2048 #define OCRDMA_MAX_CQ 2048 -#define OCRDMA_MAX_STAG 8192 +#define OCRDMA_MAX_STAG 16384 enum { OCRDMA_DB_RQ_OFFSET = 0xE0, @@ -422,7 +426,12 @@ struct ocrdma_ae_qp_mcqe { #define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14 #define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5 -#define OCRDMA_ASYNC_EVENT_PVID_STATE 0x3 + +enum ocrdma_async_grp5_events { + OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01, + OCRDMA_ASYNC_EVENT_COS_VALUE = 0x02, + OCRDMA_ASYNC_EVENT_PVID_STATE = 0x03 +}; enum OCRDMA_ASYNC_EVENT_TYPE { OCRDMA_CQ_ERROR = 0x00, @@ -525,8 +534,8 @@ struct ocrdma_mbx_query_config { u32 max_ird_ord_per_qp; u32 max_shared_ird_ord; u32 max_mr; - u32 max_mr_size_lo; u32 max_mr_size_hi; + u32 max_mr_size_lo; u32 max_num_mr_pbl; 
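/*
 * The opcode enum above is a wire ABI shared with firmware, which
 * is why every value is now spelled out: inserting CREATE_RBQ and
 * DESTROY_RBQ moves GET_RDMA_STATS from 25 to 27, and explicit
 * values make that renumbering deliberate and reviewable instead
 * of a silent side effect of implicit enumeration:
 *
 *	OCRDMA_CMD_CREATE_RBQ	  = 25,
 *	OCRDMA_CMD_DESTROY_RBQ	  = 26,
 *	OCRDMA_CMD_GET_RDMA_STATS = 27,
 */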
u32 max_mw; u32 max_fmr; @@ -580,17 +589,26 @@ enum { OCRDMA_FN_MODE_RDMA = 0x4 }; +enum { + OCRDMA_IF_TYPE_MASK = 0xFFFF0000, + OCRDMA_IF_TYPE_SHIFT = 0x10, + OCRDMA_PHY_TYPE_MASK = 0x0000FFFF, + OCRDMA_FUTURE_DETAILS_MASK = 0xFFFF0000, + OCRDMA_FUTURE_DETAILS_SHIFT = 0x10, + OCRDMA_EX_PHY_DETAILS_MASK = 0x0000FFFF, + OCRDMA_FSPEED_SUPP_MASK = 0xFFFF0000, + OCRDMA_FSPEED_SUPP_SHIFT = 0x10, + OCRDMA_ASPEED_SUPP_MASK = 0x0000FFFF +}; + struct ocrdma_get_phy_info_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; - u16 phy_type; - u16 interface_type; + u32 ityp_ptyp; u32 misc_params; - u16 ext_phy_details; - u16 rsvd; - u16 auto_speeds_supported; - u16 fixed_speeds_supported; + u32 ftrdtl_exphydtl; + u32 fspeed_aspeed; u32 future_use[2]; }; @@ -603,19 +621,34 @@ enum { OCRDMA_PHY_SPEED_40GBPS = 0x20 }; +enum { + OCRDMA_PORT_NUM_MASK = 0x3F, + OCRDMA_PT_MASK = 0xC0, + OCRDMA_PT_SHIFT = 0x6, + OCRDMA_LINK_DUP_MASK = 0x0000FF00, + OCRDMA_LINK_DUP_SHIFT = 0x8, + OCRDMA_PHY_PS_MASK = 0x00FF0000, + OCRDMA_PHY_PS_SHIFT = 0x10, + OCRDMA_PHY_PFLT_MASK = 0xFF000000, + OCRDMA_PHY_PFLT_SHIFT = 0x18, + OCRDMA_QOS_LNKSP_MASK = 0xFFFF0000, + OCRDMA_QOS_LNKSP_SHIFT = 0x10, + OCRDMA_LLST_MASK = 0xFF, + OCRDMA_PLFC_MASK = 0x00000400, + OCRDMA_PLFC_SHIFT = 0x8, + OCRDMA_PLRFC_MASK = 0x00000200, + OCRDMA_PLRFC_SHIFT = 0x8, + OCRDMA_PLTFC_MASK = 0x00000100, + OCRDMA_PLTFC_SHIFT = 0x8 +}; struct ocrdma_get_link_speed_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; - u8 pt_port_num; - u8 link_duplex; - u8 phys_port_speed; - u8 phys_port_fault; - u16 rsvd1; - u16 qos_lnk_speed; - u8 logical_lnk_status; - u8 rsvd2[3]; + u32 pflt_pps_ld_pnum; + u32 qos_lsp; + u32 res_lls; }; enum { @@ -666,8 +699,7 @@ struct ocrdma_create_cq_cmd { u32 pgsz_pgcnt; u32 ev_cnt_flags; u32 eqn; - u16 cqe_count; - u16 pd_id; + u32 pdid_cqecnt; u32 rsvd6; struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES]; }; @@ -678,6 +710,10 @@ struct ocrdma_create_cq { }; enum { + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT = 0x10 +}; + +enum { OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF }; @@ -1231,7 +1267,6 @@ struct ocrdma_destroy_srq { enum { OCRDMA_ALLOC_PD_ENABLE_DPP = BIT(16), - OCRDMA_PD_MAX_DPP_ENABLED_QP = 8, OCRDMA_DPP_PAGE_SIZE = 4096 }; @@ -1896,12 +1931,62 @@ struct ocrdma_rdma_stats_resp { struct ocrdma_rx_dbg_stats rx_dbg_stats; } __packed; +enum { + OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK = 0xFF, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_CDBLEN_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ASIC_REV_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_GUID0_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_GUID0_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_GUID13_MASK = 0xFF, + OCRDMA_HBA_ATTRB_GUID14_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_GUID14_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_GUID15_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_GUID15_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCNT_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_PCNT_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_LDTOUT_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ISCSI_VER_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_CV_MASK = 0xFF, + OCRDMA_HBA_ATTRB_HBA_ST_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_HBA_ST_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_MAX_DOMS_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PTNUM_MASK = 0x3F000000, + OCRDMA_HBA_ATTRB_PTNUM_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_PT_MASK = 0xC0000000, + OCRDMA_HBA_ATTRB_PT_SHIFT = 
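/*
 * The response layouts above collapse the old u8/u16 fields into
 * u32 words that are decoded with mask+shift pairs, e.g. the port
 * speed in ocrdma_mbx_get_link_speed():
 *
 *	*lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK)
 *			>> OCRDMA_PHY_PS_SHIFT;	// byte 2 of the word
 */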
0x1E, + OCRDMA_HBA_ATTRB_ISCSI_FET_MASK = 0xFF, + OCRDMA_HBA_ATTRB_ASIC_GEN_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_VID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_DID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_DID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_SVID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_SSID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK = 0xFF, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_IF_TYPE_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_NETFIL_MASK =0xFF +}; struct mgmt_hba_attribs { u8 flashrom_version_string[32]; u8 manufacturer_name[32]; u32 supported_modes; - u32 rsvd0[3]; + u32 rsvd_eprom_verhi_verlo; + u32 mbx_ds_ver; + u32 epfw_ds_ver; u8 ncsi_ver_string[12]; u32 default_extended_timeout; u8 controller_model_number[32]; @@ -1914,34 +1999,26 @@ struct mgmt_hba_attribs { u8 driver_version_string[32]; u8 fw_on_flash_version_string[32]; u32 functionalities_supported; - u16 max_cdblength; - u8 asic_revision; - u8 generational_guid[16]; - u8 hba_port_count; - u16 default_link_down_timeout; - u8 iscsi_ver_min_max; - u8 multifunction_device; - u8 cache_valid; - u8 hba_status; - u8 max_domains_supported; - u8 phy_port; + u32 guid0_asicrev_cdblen; + u8 generational_guid[12]; + u32 portcnt_guid15; + u32 mfuncdev_iscsi_ldtout; + u32 ptpnum_maxdoms_hbast_cv; u32 firmware_post_status; u32 hba_mtu[8]; - u32 rsvd1[4]; + u32 res_asicgen_iscsi_feaures; + u32 rsvd1[3]; }; struct mgmt_controller_attrib { struct mgmt_hba_attribs hba_attribs; - u16 pci_vendor_id; - u16 pci_device_id; - u16 pci_sub_vendor_id; - u16 pci_sub_system_id; - u8 pci_bus_number; - u8 pci_device_number; - u8 pci_function_number; - u8 interface_type; - u64 unique_identifier; - u32 rsvd0[5]; + u32 pci_did_vid; + u32 pci_ssid_svid; + u32 ityp_fnum_devnum_bnum; + u32 uid_hi; + u32 uid_lo; + u32 res_nnetfil; + u32 rsvd0[4]; }; struct ocrdma_get_ctrl_attribs_rsp { @@ -1949,5 +2026,79 @@ struct ocrdma_get_ctrl_attribs_rsp { struct mgmt_controller_attrib ctrl_attribs; }; +#define OCRDMA_SUBSYS_DCBX 0x10 + +enum OCRDMA_DCBX_OPCODE { + OCRDMA_CMD_GET_DCBX_CONFIG = 0x01 +}; + +enum OCRDMA_DCBX_PARAM_TYPE { + OCRDMA_PARAMETER_TYPE_ADMIN = 0x00, + OCRDMA_PARAMETER_TYPE_OPER = 0x01, + OCRDMA_PARAMETER_TYPE_PEER = 0x02 +}; + +enum OCRDMA_DCBX_APP_PROTO { + OCRDMA_APP_PROTO_ROCE = 0x8915 +}; + +enum OCRDMA_DCBX_PROTO { + OCRDMA_PROTO_SELECT_L2 = 0x00, + OCRDMA_PROTO_SELECT_L4 = 0x01 +}; + +enum OCRDMA_DCBX_APP_PARAM { + OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF, + OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF, + OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10, + OCRDMA_APP_PARAM_VALID_MASK = 0xFF, + OCRDMA_APP_PARAM_VALID_SHIFT = 0x18 +}; + +enum OCRDMA_DCBX_STATE_FLAGS { + OCRDMA_STATE_FLAG_ENABLED = 0x01, + OCRDMA_STATE_FLAG_ADDVERTISED = 0x02, + OCRDMA_STATE_FLAG_WILLING = 0x04, + OCRDMA_STATE_FLAG_SYNC = 0x08, + OCRDMA_STATE_FLAG_UNSUPPORTED = 0x40000000, + OCRDMA_STATE_FLAG_NEG_FAILD = 0x80000000 +}; + +enum OCRDMA_TCV_AEV_OPV_ST { + OCRDMA_DCBX_TC_SUPPORT_MASK = 0xFF, + OCRDMA_DCBX_TC_SUPPORT_SHIFT = 0x18, + OCRDMA_DCBX_APP_ENTRY_SHIFT = 0x10, + OCRDMA_DCBX_OP_PARAM_SHIFT = 0x08, + OCRDMA_DCBX_STATE_MASK = 0xFF +}; + +struct ocrdma_app_parameter { + u32 valid_proto_app; + u32 oui; + u32 app_prio[2]; +}; + +struct ocrdma_dcbx_cfg { + u32 tcv_aev_opv_st; + u32 tc_state; + u32 
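/*
 * Decoding one DCBX application entry, as ocrdma_parse_dcbxcfg_rsp()
 * above does: a valid entry whose protocol id is the RoCE ethertype
 * (0x8915) with L2 selection picks the service level.
 *
 *	valid = (app->valid_proto_app >> OCRDMA_APP_PARAM_VALID_SHIFT)
 *			& OCRDMA_APP_PARAM_VALID_MASK;
 *	sel   = (app->valid_proto_app >> OCRDMA_APP_PARAM_PROTO_SEL_SHIFT)
 *			& OCRDMA_APP_PARAM_PROTO_SEL_MASK;
 *	proto = app->valid_proto_app & OCRDMA_APP_PARAM_APP_PROTO_MASK;
 *
 *	roce_entry = valid && sel == OCRDMA_PROTO_SELECT_L2 &&
 *		     proto == OCRDMA_APP_PROTO_ROCE;
 */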
pfc_state; + u32 qcn_state; + u32 appl_state; + u32 ll_state; + u32 tc_bw[2]; + u32 tc_prio[8]; + u32 pfc_prio[2]; + struct ocrdma_app_parameter app_param[15]; +}; + +struct ocrdma_get_dcbx_cfg_req { + struct ocrdma_mbx_hdr hdr; + u32 param_type; +} __packed; + +struct ocrdma_get_dcbx_cfg_rsp { + struct ocrdma_mbx_rsp hdr; + struct ocrdma_dcbx_cfg cfg; +} __packed; #endif /* __OCRDMA_SLI_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index edf6211d84b..acb434d1690 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -69,11 +69,11 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) memcpy(&attr->fw_ver, &dev->attr.fw_ver[0], min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver))); ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid); - attr->max_mr_size = ~0ull; + attr->max_mr_size = dev->attr.max_mr_size; attr->page_size_cap = 0xffff000; attr->vendor_id = dev->nic_info.pdev->vendor; attr->vendor_part_id = dev->nic_info.pdev->device; - attr->hw_ver = 0; + attr->hw_ver = dev->asic_id; attr->max_qp = dev->attr.max_qp; attr->max_ah = OCRDMA_MAX_AH; attr->max_qp_wr = dev->attr.max_wqe; @@ -268,7 +268,8 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, pd->dpp_enabled = ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; pd->num_dpp_qp = - pd->dpp_enabled ? OCRDMA_PD_MAX_DPP_ENABLED_QP : 0; + pd->dpp_enabled ? (dev->nic_info.db_page_size / + dev->attr.wqe_size) : 0; } retry: @@ -328,7 +329,10 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) struct ocrdma_pd *pd = uctx->cntxt_pd; struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); - BUG_ON(uctx->pd_in_use); + if (uctx->pd_in_use) { + pr_err("%s(%d) Freeing in use pdid=0x%x.\n", + __func__, dev->id, pd->id); + } uctx->cntxt_pd = NULL; status = _ocrdma_dealloc_pd(dev, pd); return status; @@ -843,6 +847,13 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr) if (mr->umem) ib_umem_release(mr->umem); kfree(mr); + + /* Don't stop cleanup, in case FW is unresponsive */ + if (dev->mqe_ctx.fw_error_state) { + status = 0; + pr_err("%s(%d) fw not responding.\n", + __func__, dev->id); + } return status; } @@ -2054,6 +2065,13 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } while (wr) { + if (qp->qp_type == IB_QPT_UD && + (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM)) { + *bad_wr = wr; + status = -EINVAL; + break; + } if (ocrdma_hwq_free_cnt(&qp->sq) == 0 || wr->num_sge > qp->sq.max_sges) { *bad_wr = wr; @@ -2488,6 +2506,11 @@ static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp, *stop = true; expand = false; } + } else if (is_hw_sq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; } else { *polled = true; expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); @@ -2593,6 +2616,11 @@ static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, *stop = true; expand = false; } + } else if (is_hw_rq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; } else { *polled = true; expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 22c720e5740..636be117b57 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -2476,7 +2476,7 @@ int qib_create_agents(struct qib_ibdev *dev) ibp = &dd->pport[p].ibport_data; agent = 
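/*
 * UD QPs only support SEND and SEND_WITH_IMM, so the
 * ocrdma_post_send() check above fails the work request early
 * instead of letting the hardware reject the WQE:
 *
 *	if (qp->qp_type == IB_QPT_UD &&
 *	    wr->opcode != IB_WR_SEND &&
 *	    wr->opcode != IB_WR_SEND_WITH_IMM) {
 *		*bad_wr = wr;
 *		return -EINVAL;	// malformed UD work request
 *	}
 */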
ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI, NULL, 0, send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index c639f90cfda..3edce617c31 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -86,7 +86,6 @@ enum { IPOIB_FLAG_INITIALIZED = 1, IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, - IPOIB_PKEY_STOP = 4, IPOIB_FLAG_SUBINTERFACE = 5, IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, @@ -312,7 +311,6 @@ struct ipoib_dev_priv { struct list_head multicast_list; struct rb_root multicast_tree; - struct delayed_work pkey_poll_task; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; @@ -473,10 +471,11 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); -int ipoib_ib_dev_open(struct net_device *dev); +int ipoib_ib_dev_open(struct net_device *dev, int flush); int ipoib_ib_dev_up(struct net_device *dev); int ipoib_ib_dev_down(struct net_device *dev, int flush); int ipoib_ib_dev_stop(struct net_device *dev, int flush); +void ipoib_pkey_dev_check_presence(struct net_device *dev); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct net_device *dev); @@ -532,8 +531,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf); void ipoib_setup(struct net_device *dev); -void ipoib_pkey_poll(struct work_struct *work); -int ipoib_pkey_dev_delay_open(struct net_device *dev); +void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 50061854616..6bd5740e269 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -281,10 +281,8 @@ void ipoib_delete_debug_files(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (priv->mcg_dentry) - debugfs_remove(priv->mcg_dentry); - if (priv->path_dentry) - debugfs_remove(priv->path_dentry); + debugfs_remove(priv->mcg_dentry); + debugfs_remove(priv->path_dentry); } int ipoib_register_debugfs(void) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 6a7003ddb0b..72626c34817 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -664,17 +664,18 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx) drain_tx_cq((struct net_device *)ctx); } -int ipoib_ib_dev_open(struct net_device *dev) +int ipoib_ib_dev_open(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { - ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey, + (!(priv->pkey & 0x7fff) ? 
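/*
 * The P_Key test being built on this line: bits 14..0 are the key's
 * base value and bit 15 only distinguishes full from limited
 * membership, so a zero base value is an invalid P_Key regardless
 * of the membership bit. Hence:
 *
 *	static inline bool pkey_is_valid(u16 pkey)
 *	{
 *		return (pkey & 0x7fff) != 0;	// non-zero base value
 *	}
 */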
"Invalid" : "not found")); return -1; } - set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = ipoib_init_qp(dev); if (ret) { @@ -705,16 +706,17 @@ int ipoib_ib_dev_open(struct net_device *dev) dev_stop: if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) napi_enable(&priv->napi); - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(dev, flush); return -1; } -static void ipoib_pkey_dev_check_presence(struct net_device *dev) +void ipoib_pkey_dev_check_presence(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - u16 pkey_index = 0; - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) + if (!(priv->pkey & 0x7fff) || + ib_find_pkey(priv->ca, priv->port, priv->pkey, + &priv->pkey_index)) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); else set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); @@ -745,14 +747,6 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); netif_carrier_off(dev); - /* Shutdown the P_Key thread if still active */ - if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { - mutex_lock(&pkey_mutex); - set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work_sync(&priv->pkey_poll_task); - mutex_unlock(&pkey_mutex); - } - ipoib_mcast_stop_thread(dev, flush); ipoib_mcast_dev_flush(dev); @@ -924,7 +918,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) (unsigned long) dev); if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev)) { + if (ipoib_ib_dev_open(dev, 1)) { ipoib_transport_dev_cleanup(dev); return -ENODEV; } @@ -966,13 +960,27 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv) return 1; } +/* + * returns 0 if pkey value was found in a different slot. + */ +static inline int update_child_pkey(struct ipoib_dev_priv *priv) +{ + u16 old_index = priv->pkey_index; + + priv->pkey_index = 0; + ipoib_pkey_dev_check_presence(priv->dev); + + if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + (old_index == priv->pkey_index)) + return 1; + return 0; +} static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; - u16 new_index; int result; down_read(&priv->vlan_rwsem); @@ -986,16 +994,20 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, up_read(&priv->vlan_rwsem); - if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { - /* for non-child devices must check/update the pkey value here */ - if (level == IPOIB_FLUSH_HEAVY && - !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) - update_parent_pkey(priv); + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && + level != IPOIB_FLUSH_HEAVY) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + /* interface is down. update pkey and leave. 
*/ + if (level == IPOIB_FLUSH_HEAVY) { + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); + else + update_child_pkey(priv); + } ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } @@ -1005,20 +1017,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, * (parent) devices should always takes what present in pkey index 0 */ if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - ipoib_ib_dev_down(dev, 0); - ipoib_ib_dev_stop(dev, 0); - if (ipoib_pkey_dev_delay_open(dev)) - return; - } - /* restart QP only if P_Key index is changed */ - if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && - new_index == priv->pkey_index) { + result = update_child_pkey(priv); + if (result) { + /* restart QP only if P_Key index is changed */ ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; } - priv->pkey_index = new_index; + } else { result = update_parent_pkey(priv); /* restart QP only if P_Key value changed */ @@ -1038,8 +1043,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, ipoib_ib_dev_down(dev, 0); if (level == IPOIB_FLUSH_HEAVY) { - ipoib_ib_dev_stop(dev, 0); - ipoib_ib_dev_open(dev); + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_ib_dev_stop(dev, 0); + if (ipoib_ib_dev_open(dev, 0) != 0) + return; + if (netif_queue_stopped(dev)) + netif_start_queue(dev); } /* @@ -1094,54 +1103,4 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) ipoib_transport_dev_cleanup(dev); } -/* - * Delayed P_Key Assigment Interim Support - * - * The following is initial implementation of delayed P_Key assigment - * mechanism. It is using the same approach implemented for the multicast - * group join. The single goal of this implementation is to quickly address - * Bug #2507. This implementation will probably be removed when the P_Key - * change async notification is available. 
- */ - -void ipoib_pkey_poll(struct work_struct *work) -{ - struct ipoib_dev_priv *priv = - container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); - struct net_device *dev = priv->dev; - - ipoib_pkey_dev_check_presence(dev); - - if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) - ipoib_open(dev); - else { - mutex_lock(&pkey_mutex); - if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->pkey_poll_task, - HZ); - mutex_unlock(&pkey_mutex); - } -} - -int ipoib_pkey_dev_delay_open(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - - /* Look for the interface pkey value in the IB Port P_Key table and */ - /* set the interface pkey assigment flag */ - ipoib_pkey_dev_check_presence(dev); - /* P_Key value not assigned yet - start polling */ - if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { - mutex_lock(&pkey_mutex); - clear_bit(IPOIB_PKEY_STOP, &priv->flags); - queue_delayed_work(ipoib_workqueue, - &priv->pkey_poll_task, - HZ); - mutex_unlock(&pkey_mutex); - return 1; - } - - return 0; -} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 4e675f4fecc..1310acf6bf9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,11 +108,11 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - if (ipoib_pkey_dev_delay_open(dev)) - return 0; - - if (ipoib_ib_dev_open(dev)) + if (ipoib_ib_dev_open(dev, 1)) { + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return 0; goto err_disable; + } if (ipoib_ib_dev_up(dev)) goto err_stop; @@ -1379,7 +1379,6 @@ void ipoib_setup(struct net_device *dev) INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); - INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index eb7973957a6..61ee91d8838 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -596,20 +596,28 @@ iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, struct iser_conn *ib_conn; struct iscsi_endpoint *ep; - ep = iscsi_create_endpoint(sizeof(*ib_conn)); + ep = iscsi_create_endpoint(0); if (!ep) return ERR_PTR(-ENOMEM); - ib_conn = ep->dd_data; + ib_conn = kzalloc(sizeof(*ib_conn), GFP_KERNEL); + if (!ib_conn) { + err = -ENOMEM; + goto failure; + } + + ep->dd_data = ib_conn; ib_conn->ep = ep; iser_conn_init(ib_conn); - err = iser_connect(ib_conn, NULL, (struct sockaddr_in *)dst_addr, - non_blocking); + err = iser_connect(ib_conn, NULL, dst_addr, non_blocking); if (err) - return ERR_PTR(err); + goto failure; return ep; +failure: + iscsi_destroy_endpoint(ep); + return ERR_PTR(err); } static int @@ -619,15 +627,16 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) int rc; ib_conn = ep->dd_data; - rc = wait_event_interruptible_timeout(ib_conn->wait, - ib_conn->state == ISER_CONN_UP, - msecs_to_jiffies(timeout_ms)); - + rc = wait_for_completion_interruptible_timeout(&ib_conn->up_completion, + msecs_to_jiffies(timeout_ms)); /* if conn establishment failed, return error code to iscsi */ - if (!rc && - (ib_conn->state == ISER_CONN_TERMINATING || - ib_conn->state == ISER_CONN_DOWN)) - rc = -1; + if (rc == 0) { + 
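/*
 * The iscsi_iser_ep_connect() rework above decouples the iser
 * connection from the iscsi endpoint: ib_conn is allocated
 * separately and attached through ep->dd_data, so
 * iscsi_destroy_endpoint() only frees the endpoint shell while the
 * connection is released on its own schedule. The allocation
 * pattern:
 *
 *	ep = iscsi_create_endpoint(0);		// no embedded payload
 *	ib_conn = kzalloc(sizeof(*ib_conn), GFP_KERNEL);
 *	if (!ib_conn) {
 *		iscsi_destroy_endpoint(ep);
 *		return ERR_PTR(-ENOMEM);
 *	}
 *	ep->dd_data = ib_conn;
 *	ib_conn->ep = ep;
 */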
mutex_lock(&ib_conn->state_mutex); + if (ib_conn->state == ISER_CONN_TERMINATING || + ib_conn->state == ISER_CONN_DOWN) + rc = -1; + mutex_unlock(&ib_conn->state_mutex); + } iser_info("ib conn %p rc = %d\n", ib_conn, rc); @@ -646,19 +655,25 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) ib_conn = ep->dd_data; iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state); + mutex_lock(&ib_conn->state_mutex); iser_conn_terminate(ib_conn); /* - * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop - * call and ISER_CONN_DOWN state before freeing the iser resources. - * otherwise we are safe to free resources immediately. + * if iser_conn and iscsi_conn are bound, we must wait for + * iscsi_conn_stop and flush errors completion before freeing + * the iser resources. Otherwise we are safe to free resources + * immediately. */ if (ib_conn->iscsi_conn) { INIT_WORK(&ib_conn->release_work, iser_release_work); queue_work(release_wq, &ib_conn->release_work); + mutex_unlock(&ib_conn->state_mutex); } else { + ib_conn->state = ISER_CONN_DOWN; + mutex_unlock(&ib_conn->state_mutex); iser_conn_release(ib_conn); } + iscsi_destroy_endpoint(ep); } static umode_t iser_attr_is_visible(int param_type, int param) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 97cd385bf7f..c877dad381c 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -326,7 +326,6 @@ struct iser_conn { struct iser_device *device; /* device context */ struct rdma_cm_id *cma_id; /* CMA ID */ struct ib_qp *qp; /* QP */ - wait_queue_head_t wait; /* waitq for conn/disconn */ unsigned qp_max_recv_dtos; /* num of rx buffers */ unsigned qp_max_recv_dtos_mask; /* above minus 1 */ unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ @@ -335,6 +334,9 @@ struct iser_conn { char name[ISER_OBJECT_NAME_SIZE]; struct work_struct release_work; struct completion stop_completion; + struct mutex state_mutex; + struct completion flush_completion; + struct completion up_completion; struct list_head conn_list; /* entry in ig conn list */ char *login_buf; @@ -448,8 +450,8 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, enum iser_data_dir cmd_dir); int iser_connect(struct iser_conn *ib_conn, - struct sockaddr_in *src_addr, - struct sockaddr_in *dst_addr, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, int non_blocking); int iser_reg_page_vec(struct iser_conn *ib_conn, diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index ea01075f9f9..3ef167f97d6 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -491,10 +491,9 @@ out_err: } /** - * releases the QP objects, returns 0 on success, - * -1 on failure + * releases the QP object */ -static int iser_free_ib_conn_res(struct iser_conn *ib_conn) +static void iser_free_ib_conn_res(struct iser_conn *ib_conn) { int cq_index; BUG_ON(ib_conn == NULL); @@ -513,8 +512,6 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn) } ib_conn->qp = NULL; - - return 0; } /** @@ -568,31 +565,40 @@ static void iser_device_try_release(struct iser_device *device) mutex_unlock(&ig.device_list_mutex); } +/** + * Called with state mutex held + **/ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, enum iser_ib_conn_state comp, enum iser_ib_conn_state exch) { int ret; - spin_lock_bh(&ib_conn->lock); if ((ret = (ib_conn->state == comp))) ib_conn->state = exch; - 
spin_unlock_bh(&ib_conn->lock); return ret; } void iser_release_work(struct work_struct *work) { struct iser_conn *ib_conn; + int rc; ib_conn = container_of(work, struct iser_conn, release_work); /* wait for .conn_stop callback */ - wait_for_completion(&ib_conn->stop_completion); + rc = wait_for_completion_timeout(&ib_conn->stop_completion, 30 * HZ); + WARN_ON(rc == 0); /* wait for the qp's post send and post receive buffers to empty */ - wait_event_interruptible(ib_conn->wait, - ib_conn->state == ISER_CONN_DOWN); + rc = wait_for_completion_timeout(&ib_conn->flush_completion, 30 * HZ); + WARN_ON(rc == 0); + + ib_conn->state = ISER_CONN_DOWN; + + mutex_lock(&ib_conn->state_mutex); + ib_conn->state = ISER_CONN_DOWN; + mutex_unlock(&ib_conn->state_mutex); iser_conn_release(ib_conn); } @@ -604,23 +610,27 @@ void iser_conn_release(struct iser_conn *ib_conn) { struct iser_device *device = ib_conn->device; - BUG_ON(ib_conn->state == ISER_CONN_UP); - mutex_lock(&ig.connlist_mutex); list_del(&ib_conn->conn_list); mutex_unlock(&ig.connlist_mutex); + + mutex_lock(&ib_conn->state_mutex); + BUG_ON(ib_conn->state != ISER_CONN_DOWN); + iser_free_rx_descriptors(ib_conn); iser_free_ib_conn_res(ib_conn); ib_conn->device = NULL; /* on EVENT_ADDR_ERROR there's no device yet for this conn */ if (device != NULL) iser_device_try_release(device); + mutex_unlock(&ib_conn->state_mutex); + /* if cma handler context, the caller actually destroys the id */ if (ib_conn->cma_id != NULL) { rdma_destroy_id(ib_conn->cma_id); ib_conn->cma_id = NULL; } - iscsi_destroy_endpoint(ib_conn->ep); + kfree(ib_conn); } /** @@ -642,22 +652,31 @@ void iser_conn_terminate(struct iser_conn *ib_conn) ib_conn,err); } +/** + * Called with state mutex held + **/ static void iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; ib_conn = (struct iser_conn *)cma_id->context; - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); } +/** + * Called with state mutex held + **/ static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *ib_conn; int ret; + ib_conn = (struct iser_conn *)cma_id->context; + if (ib_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); @@ -665,7 +684,6 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) return; } - ib_conn = (struct iser_conn *)cma_id->context; ib_conn->device = device; /* connection T10-PI support */ @@ -689,18 +707,27 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) } } +/** + * Called with state mutex held + **/ static void iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; int ret; struct iser_cm_hdr req_hdr; + struct iser_conn *ib_conn = (struct iser_conn *)cma_id->context; + struct iser_device *device = ib_conn->device; + + if (ib_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); if (ret) goto failure; memset(&conn_param, 0, sizeof conn_param); - conn_param.responder_resources = 4; + conn_param.responder_resources = device->dev_attr.max_qp_rd_atom; conn_param.initiator_depth = 1; conn_param.retry_count = 7; conn_param.rnr_retry_count = 6; @@ -728,12 +755,16 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) struct ib_qp_attr attr; struct ib_qp_init_attr init_attr; + ib_conn = (struct iser_conn *)cma_id->context; + if (ib_conn->state != ISER_CONN_PENDING) +
/* bailout */ + return; + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); - ib_conn = (struct iser_conn *)cma_id->context; - if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP)) - wake_up_interruptible(&ib_conn->wait); + ib_conn->state = ISER_CONN_UP; + complete(&ib_conn->up_completion); } static void iser_disconnected_handler(struct rdma_cm_id *cma_id) @@ -752,19 +783,25 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id) iser_err("iscsi_iser connection isn't bound\n"); } - /* Complete the termination process if no posts are pending */ + /* Complete the termination process if no posts are pending. This code + * block also exists in iser_handle_comp_error(), but it is needed here + * for cases of no flushes at all, e.g. discovery over rdma. + */ if (ib_conn->post_recv_buf_count == 0 && (atomic_read(&ib_conn->post_send_buf_count) == 0)) { - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); + complete(&ib_conn->flush_completion); } } static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { + struct iser_conn *ib_conn; + + ib_conn = (struct iser_conn *)cma_id->context; iser_info("event %d status %d conn %p id %p\n", event->event, event->status, cma_id->context, cma_id); + mutex_lock(&ib_conn->state_mutex); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: iser_addr_handler(cma_id); @@ -785,24 +822,28 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: iser_disconnected_handler(cma_id); break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); break; } + mutex_unlock(&ib_conn->state_mutex); return 0; } void iser_conn_init(struct iser_conn *ib_conn) { ib_conn->state = ISER_CONN_INIT; - init_waitqueue_head(&ib_conn->wait); ib_conn->post_recv_buf_count = 0; atomic_set(&ib_conn->post_send_buf_count, 0); init_completion(&ib_conn->stop_completion); + init_completion(&ib_conn->flush_completion); + init_completion(&ib_conn->up_completion); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); + mutex_init(&ib_conn->state_mutex); } /** @@ -810,22 +851,21 @@ void iser_conn_init(struct iser_conn *ib_conn) * sleeps until the connection is established or rejected */ int iser_connect(struct iser_conn *ib_conn, - struct sockaddr_in *src_addr, - struct sockaddr_in *dst_addr, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, int non_blocking) { - struct sockaddr *src, *dst; int err = 0; - sprintf(ib_conn->name, "%pI4:%d", - &dst_addr->sin_addr.s_addr, dst_addr->sin_port); + mutex_lock(&ib_conn->state_mutex); + + sprintf(ib_conn->name, "%pISp", dst_addr); + + iser_info("connecting to: %s\n", ib_conn->name); /* the device is known only --after-- address resolution */ ib_conn->device = NULL; - iser_info("connecting to: %pI4, port 0x%x\n", - &dst_addr->sin_addr, dst_addr->sin_port); - ib_conn->state = ISER_CONN_PENDING; ib_conn->cma_id = rdma_create_id(iser_cma_handler, @@ -837,23 +877,21 @@ int iser_connect(struct iser_conn *ib_conn, goto id_failure; } - src = (struct sockaddr *)src_addr; - dst = (struct sockaddr *)dst_addr; - err = rdma_resolve_addr(ib_conn->cma_id, src, dst, 1000); + err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000); if (err) { iser_err("rdma_resolve_addr failed: %d\n", err); goto addr_failure; } 
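	/*
	 * From here the CM event handlers drive the state machine:
	 * iser_connected_handler() sets ISER_CONN_UP and completes
	 * up_completion, so a blocking caller only needs to sleep on that
	 * completion and re-check the state afterwards.
	 */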
if (!non_blocking) { - wait_event_interruptible(ib_conn->wait, - (ib_conn->state != ISER_CONN_PENDING)); + wait_for_completion_interruptible(&ib_conn->up_completion); if (ib_conn->state != ISER_CONN_UP) { err = -EIO; goto connect_failure; } } + mutex_unlock(&ib_conn->state_mutex); mutex_lock(&ig.connlist_mutex); list_add(&ib_conn->conn_list, &ig.connlist); @@ -865,6 +903,7 @@ id_failure: addr_failure: ib_conn->state = ISER_CONN_DOWN; connect_failure: + mutex_unlock(&ib_conn->state_mutex); iser_conn_release(ib_conn); return err; } @@ -1049,18 +1088,19 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc, if (ib_conn->post_recv_buf_count == 0 && atomic_read(&ib_conn->post_send_buf_count) == 0) { - /* getting here when the state is UP means that the conn is * - * being terminated asynchronously from the iSCSI layer's * - * perspective. */ - if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)) + /** + * getting here when the state is UP means that the conn is + * being terminated asynchronously from the iSCSI layer's + * perspective. It is safe to peek at the connection state + * since iscsi_conn_failure is allowed to be called twice. + **/ + if (ib_conn->state == ISER_CONN_UP) iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); /* no more non completed posts to the QP, complete the * termination process w.o worrying on disconnect event */ - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); + complete(&ib_conn->flush_completion); } } diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e3c2c5b4297..62d2a18e1b4 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -130,6 +130,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; +static struct workqueue_struct *srp_remove_wq; static struct ib_client srp_client = { .name = "srp", @@ -731,7 +732,7 @@ static bool srp_queue_remove_work(struct srp_target_port *target) spin_unlock_irq(&target->lock); if (changed) - queue_work(system_long_wq, &target->remove_work); + queue_work(srp_remove_wq, &target->remove_work); return changed; } @@ -1643,10 +1644,14 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) SCSI_SENSE_BUFFERSIZE)); } - if (rsp->flags & (SRP_RSP_FLAG_DOOVER | SRP_RSP_FLAG_DOUNDER)) - scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); - else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER)) + if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER)) scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt)); srp_free_req(target, req, scmnd, be32_to_cpu(rsp->req_lim_delta)); @@ -3261,9 +3266,10 @@ static void srp_remove_one(struct ib_device *device) spin_unlock(&host->target_lock); /* - * Wait for target port removal tasks. + * Wait for tl_err and target port removal tasks. 
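+ * The remove_work for each target is queued on the driver-private
+ * srp_remove_wq (see srp_queue_remove_work() above), while tl_err work
+ * still runs on system_long_wq, so both queues must be drained before
+ * the host structure is freed.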
*/ flush_workqueue(system_long_wq); + flush_workqueue(srp_remove_wq); kfree(host); } @@ -3313,16 +3319,22 @@ static int __init srp_init_module(void) indirect_sg_entries = cmd_sg_entries; } + srp_remove_wq = create_workqueue("srp_remove"); + if (!srp_remove_wq) { + ret = -ENOMEM; + goto out; + } + + ret = -ENOMEM; ib_srp_transport_template = srp_attach_transport(&ib_srp_transport_functions); if (!ib_srp_transport_template) - return -ENOMEM; + goto destroy_wq; ret = class_register(&srp_class); if (ret) { pr_err("couldn't register class infiniband_srp\n"); - srp_release_transport(ib_srp_transport_template); - return ret; + goto release_tr; } ib_sa_register_client(&srp_sa_client); @@ -3330,13 +3342,22 @@ static int __init srp_init_module(void) ret = ib_register_client(&srp_client); if (ret) { pr_err("couldn't register IB client\n"); - srp_release_transport(ib_srp_transport_template); - ib_sa_unregister_client(&srp_sa_client); - class_unregister(&srp_class); - return ret; + goto unreg_sa; } - return 0; +out: + return ret; + +unreg_sa: + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + +release_tr: + srp_release_transport(ib_srp_transport_template); + +destroy_wq: + destroy_workqueue(srp_remove_wq); + goto out; } static void __exit srp_cleanup_module(void) @@ -3345,6 +3366,7 @@ static void __exit srp_cleanup_module(void) ib_sa_unregister_client(&srp_sa_client); class_unregister(&srp_class); srp_release_transport(ib_srp_transport_template); + destroy_workqueue(srp_remove_wq); } module_init(srp_init_module); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index fe09f2788b1..d28a8c284da 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -198,6 +198,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, case IB_EVENT_PKEY_CHANGE: case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: + case IB_EVENT_GID_CHANGE: /* Refresh port data asynchronously. */ if (event->element.port_num <= sdev->device->phys_port_cnt) { sport = &sdev->port[event->element.port_num - 1]; @@ -563,7 +564,7 @@ static int srpt_refresh_port(struct srpt_port *sport) ®_req, 0, srpt_mad_send_handler, srpt_mad_recv_handler, - sport); + sport, 0); if (IS_ERR(sport->mad_agent)) { ret = PTR_ERR(sport->mad_agent); sport->mad_agent = NULL; diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 443d03fbac4..8eeab72b93e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -331,7 +331,7 @@ static int bch_allocator_thread(void *arg) mutex_unlock(&ca->set->bucket_lock); blkdev_issue_discard(ca->bdev, bucket_to_sector(ca->set, bucket), - ca->sb.block_size, GFP_KERNEL, 0); + ca->sb.bucket_size, GFP_KERNEL, 0); mutex_lock(&ca->set->bucket_lock); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d2ebcf32309..04f7bc28ef8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -477,9 +477,13 @@ struct gc_stat { * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. * flushing dirty data). + * + * CACHE_SET_RUNNING means all cache devices have been registered and journal + * replay is complete. 
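+ *
+ * Code that must not run before the set is fully up can gate on this
+ * flag, as bch_flash_dev_create() now does:
+ *
+ *	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
+ *		return -EPERM;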
*/ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 +#define CACHE_SET_RUNNING 2 struct cache_set { struct closure cl; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 54541641530..646fe85261c 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1182,7 +1182,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, { uint64_t start_time; bool used_mempool = false; - struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, + struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); if (!out) { struct page *outp; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 5f6728d5d4d..ae964624efb 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -453,7 +453,7 @@ static inline bool bch_bkey_equal_header(const struct bkey *l, { return (KEY_DIRTY(l) == KEY_DIRTY(r) && KEY_PTRS(l) == KEY_PTRS(r) && - KEY_CSUM(l) == KEY_CSUM(l)); + KEY_CSUM(l) == KEY_CSUM(r)); } /* Keylists */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7347b610096..00cde40db57 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -117,9 +117,9 @@ ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ if (!IS_ERR(_child)) { \ - _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ rw_unlock(_w, _child); \ } else \ @@ -142,7 +142,6 @@ rw_lock(_w, _b, _b->level); \ if (_b == (c)->root && \ _w == insert_lock(op, _b)) { \ - _b->parent = NULL; \ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ @@ -202,7 +201,7 @@ void bch_btree_node_read_done(struct btree *b) struct bset *i = btree_bset_first(b); struct btree_iter *iter; - iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter = mempool_alloc(b->c->fill_iter, GFP_NOIO); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; @@ -421,7 +420,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bio_alloc_pages(b->bio, GFP_NOIO)) { + if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -967,7 +966,8 @@ err: * level and op->lock. 
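 * The new @parent argument lets every lookup refresh b->parent (and
 * btree_node_prefetch() link prefetched nodes), replacing the old
 * scheme where the btree() recursion macro patched _child->parent
 * after the fact.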
*/ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, - struct bkey *k, int level, bool write) + struct bkey *k, int level, bool write, + struct btree *parent) { int i = 0; struct btree *b; @@ -1002,6 +1002,7 @@ retry: BUG_ON(b->level != level); } + b->parent = parent; b->accessed = 1; for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { @@ -1022,15 +1023,16 @@ retry: return b; } -static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) +static void btree_node_prefetch(struct btree *parent, struct bkey *k) { struct btree *b; - mutex_lock(&c->bucket_lock); - b = mca_alloc(c, NULL, k, level); - mutex_unlock(&c->bucket_lock); + mutex_lock(&parent->c->bucket_lock); + b = mca_alloc(parent->c, NULL, k, parent->level - 1); + mutex_unlock(&parent->c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { + b->parent = parent; bch_btree_node_read(b); rw_unlock(true, b); } @@ -1060,15 +1062,16 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, - int level) +struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level, bool wait, + struct btree *parent) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) goto err; bkey_put(c, &k.key); @@ -1085,6 +1088,7 @@ retry: } b->accessed = 1; + b->parent = parent; bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); mutex_unlock(&c->bucket_lock); @@ -1096,14 +1100,21 @@ err_free: err: mutex_unlock(&c->bucket_lock); - trace_bcache_btree_node_alloc_fail(b); + trace_bcache_btree_node_alloc_fail(c); return b; } +static struct btree *bch_btree_node_alloc(struct cache_set *c, + struct btree_op *op, int level, + struct btree *parent) +{ + return __bch_btree_node_alloc(c, op, level, op != NULL, parent); +} + static struct btree *btree_node_alloc_replacement(struct btree *b, struct btree_op *op) { - struct btree *n = bch_btree_node_alloc(b->c, op, b->level); + struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); if (!IS_ERR_OR_NULL(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); @@ -1403,6 +1414,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, BUG_ON(btree_bset_first(new_nodes[0])->keys); btree_node_free(new_nodes[0]); rw_unlock(true, new_nodes[0]); + new_nodes[0] = NULL; for (i = 0; i < nodes; i++) { if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) @@ -1516,7 +1528,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, - true); + true, b); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); break; @@ -1811,7 +1823,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) - btree_node_prefetch(b->c, k, b->level - 1); + btree_node_prefetch(b, k); if (p) ret = btree(check_recurse, p, b, op); @@ -1976,12 +1988,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); - n2 = bch_btree_node_alloc(b->c, op, b->level); + n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = 
bch_btree_node_alloc(b->c, op, b->level + 1); + n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); if (IS_ERR(n3)) goto err_free2; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 91dfa5e6968..5c391fa01be 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -242,9 +242,10 @@ void __bch_btree_node_write(struct btree *, struct closure *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); +struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *, + int, bool, struct btree *); struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, - struct bkey *, int, bool); + struct bkey *, int, bool, struct btree *); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 3a0de4cf977..243de0bf15c 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -474,9 +474,8 @@ out: return false; } -static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k) { - struct btree *b = container_of(bk, struct btree, keys); char buf[80]; if (!KEY_SIZE(k)) @@ -485,16 +484,22 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) if (KEY_SIZE(k) > KEY_OFFSET(k)) goto bad; - if (__ptr_invalid(b->c, k)) + if (__ptr_invalid(c, k)) goto bad; return false; bad: bch_extent_to_text(buf, sizeof(buf), k); - cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); return true; } +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + return __bch_extent_invalid(b->c, k); +} + static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, unsigned ptr) { diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index e4e23409782..e2ed54054e7 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -9,5 +9,6 @@ struct cache_set; void bch_extent_to_text(char *, size_t, const struct bkey *); bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); +bool __bch_extent_invalid(struct cache_set *, const struct bkey *); #endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 59e82021b5b..fe080ad0e55 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -7,6 +7,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include <trace/events/bcache.h> @@ -189,11 +190,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) if (read_bucket(l)) goto bsearch; - if (list_empty(list)) + /* no journal entries on this device? 
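+ * read_bucket() never found an entry, so the scan ran all the way to
+ * ca->sb.njournal_buckets. The old list_empty() test was not
+ * equivalent: the list may already hold entries replayed from another
+ * cache device.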
*/ + if (l == ca->sb.njournal_buckets) continue; bsearch: + BUG_ON(list_empty(list)); + /* Binary search */ - m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + m = l; + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { @@ -291,15 +296,16 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) for (k = i->j.start; k < bset_bkey_last(&i->j); - k = bkey_next(k)) { - unsigned j; + k = bkey_next(k)) + if (!__bch_extent_invalid(c, k)) { + unsigned j; - for (j = 0; j < KEY_PTRS(k); j++) - if (ptr_available(c, k, j)) - atomic_inc(&PTR_BUCKET(c, k, j)->pin); + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); - bch_initial_mark_key(c, 0, k); - } + bch_initial_mark_key(c, 0, k); + } } } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 15fff4f68a7..62e6e98186b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -311,7 +311,8 @@ void bch_data_insert(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - trace_bcache_write(op->bio, op->writeback, op->bypass); + trace_bcache_write(op->c, op->inode, op->bio, + op->writeback, op->bypass); bch_keylist_init(&op->insert_keys); bio_get(op->bio); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 926ded8ccbf..d4713d098a3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -733,8 +733,6 @@ static void bcache_device_detach(struct bcache_device *d) static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, unsigned id) { - BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags)); - d->id = id; d->c = c; c->devices[id] = d; @@ -927,6 +925,7 @@ static void cached_dev_detach_finish(struct work_struct *w) list_move(&dc->list, &uncached_devices); clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); mutex_unlock(&bch_register_lock); @@ -1041,6 +1040,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) */ atomic_set(&dc->count, 1); + if (bch_cached_dev_writeback_start(dc)) + return -ENOMEM; + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(dc); atomic_set(&dc->has_dirty, 1); @@ -1070,7 +1072,8 @@ static void cached_dev_free(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); cancel_delayed_work_sync(&dc->writeback_rate_update); - kthread_stop(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + kthread_stop(dc->writeback_thread); mutex_lock(&bch_register_lock); @@ -1081,12 +1084,8 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); - if (!IS_ERR_OR_NULL(dc->bdev)) { - if (dc->bdev->bd_disk) - blk_sync_queue(bdev_get_queue(dc->bdev)); - + if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - } wake_up(&unregister_wait); @@ -1213,7 +1212,9 @@ void bch_flash_dev_release(struct kobject *kobj) static void flash_dev_free(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); + mutex_lock(&bch_register_lock); bcache_device_free(d); + mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); } @@ -1221,7 +1222,9 @@ static void flash_dev_flush(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); + mutex_lock(&bch_register_lock); bcache_device_unlink(d); + 
mutex_unlock(&bch_register_lock); kobject_del(&d->kobj); continue_at(cl, flash_dev_free, system_wq); } @@ -1277,6 +1280,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) if (test_bit(CACHE_SET_STOPPING, &c->flags)) return -EINTR; + if (!test_bit(CACHE_SET_RUNNING, &c->flags)) + return -EPERM; + u = uuid_find_empty(c); if (!u) { pr_err("Can't create volume, no room for UUID"); @@ -1346,8 +1352,11 @@ static void cache_set_free(struct closure *cl) bch_journal_free(c); for_each_cache(ca, c, i) - if (ca) + if (ca) { + ca->set = NULL; + c->cache[ca->sb.nr_this_dev] = NULL; kobject_put(&ca->kobj); + } bch_bset_sort_state_free(&c->sort); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); @@ -1405,9 +1414,11 @@ static void cache_set_flush(struct closure *cl) if (ca->alloc_thread) kthread_stop(ca->alloc_thread); - cancel_delayed_work_sync(&c->journal.work); - /* flush last journal entry if needed */ - c->journal.work.work.func(&c->journal.work.work); + if (c->journal.cur) { + cancel_delayed_work_sync(&c->journal.work); + /* flush last journal entry if needed */ + c->journal.work.work.func(&c->journal.work.work); + } closure_return(cl); } @@ -1586,7 +1597,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); + c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1661,7 +1672,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, NULL, 0); + c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1697,6 +1708,7 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); + set_bit(CACHE_SET_RUNNING, &c->flags); return; err: closure_sync(&cl); @@ -1760,6 +1772,7 @@ found: pr_debug("set version = %llu", c->sb.version); } + kobject_get(&ca->kobj); ca->set = c; ca->set->cache[ca->sb.nr_this_dev] = ca; c->cache_by_alloc[c->caches_loaded++] = ca; @@ -1780,8 +1793,10 @@ void bch_cache_release(struct kobject *kobj) struct cache *ca = container_of(kobj, struct cache, kobj); unsigned i; - if (ca->set) + if (ca->set) { + BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); ca->set->cache[ca->sb.nr_this_dev] = NULL; + } bio_split_pool_free(&ca->bio_split_hook); @@ -1798,10 +1813,8 @@ void bch_cache_release(struct kobject *kobj) if (ca->sb_bio.bi_inline_vecs[0].bv_page) put_page(ca->sb_bio.bi_io_vec[0].bv_page); - if (!IS_ERR_OR_NULL(ca->bdev)) { - blk_sync_queue(bdev_get_queue(ca->bdev)); + if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - } kfree(ca); module_put(THIS_MODULE); @@ -1844,7 +1857,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) } static void register_cache(struct cache_sb *sb, struct page *sb_page, - struct block_device *bdev, struct cache *ca) + struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; @@ -1877,10 +1890,12 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, goto err; pr_info("registered cache device %s", bdevname(bdev, name)); +out: + kobject_put(&ca->kobj); return; err: pr_notice("error opening %s: %s", bdevname(bdev, name), err); - kobject_put(&ca->kobj); + goto out; } /* Global interfaces/init */ @@ -1945,10 +1960,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (IS_ERR(bdev)) { if 
(bdev == ERR_PTR(-EBUSY)) { bdev = lookup_bdev(strim(path)); + mutex_lock(&bch_register_lock); if (!IS_ERR(bdev) && bch_is_open(bdev)) err = "device already registered"; else err = "device busy"; + mutex_unlock(&bch_register_lock); } goto err; } diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ac7d0d1f70d..98df7572b5f 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -416,8 +416,8 @@ do { \ average_frequency, frequency_units); \ __print_time_stat(stats, name, \ average_duration, duration_units); \ - __print_time_stat(stats, name, \ - max_duration, duration_units); \ + sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ + div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\ \ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ ? div_s64(local_clock() - (stats)->last, \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index f4300e4c011..f1986bcd1bf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -239,7 +239,7 @@ static void read_dirty(struct cached_dev *dc) if (KEY_START(&w->key) != dc->last_read || jiffies_to_msecs(delay) > 50) while (!kthread_should_stop() && delay) - delay = schedule_timeout_uninterruptible(delay); + delay = schedule_timeout_interruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -436,7 +436,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_uninterruptible(delay); + delay = schedule_timeout_interruptible(delay); } } @@ -478,7 +478,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc) dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); } -int bch_cached_dev_writeback_init(struct cached_dev *dc) +void bch_cached_dev_writeback_init(struct cached_dev *dc) { sema_init(&dc->in_flight, 64); init_rwsem(&dc->writeback_lock); @@ -494,14 +494,20 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_d_term = 30; dc->writeback_rate_p_term_inverse = 6000; + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); +} + +int bch_cached_dev_writeback_start(struct cached_dev *dc) +{ dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) return PTR_ERR(dc->writeback_thread); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); + bch_writeback_queue(dc); + return 0; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e2f8598937a..0a9dab187b7 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -85,6 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); void bch_sectors_dirty_init(struct cached_dev *dc); -int bch_cached_dev_writeback_init(struct cached_dev *); +void bch_cached_dev_writeback_init(struct cached_dev *); +int bch_cached_dev_writeback_start(struct cached_dev *); #endif diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index d2899e7eb3a..06709257add 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = 
cpu_to_le64(cmd->discard_block_size); disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); - disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); @@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, bool may_format_device) { int r; - cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE, + cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_METADATA_CACHE_SIZE, CACHE_MAX_CONCURRENT_LOCKS); if (IS_ERR(cmd->bm)) { diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index cd70a78623a..7383c90ccdb 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -9,19 +9,17 @@ #include "dm-cache-block-types.h" #include "dm-cache-policy-internal.h" +#include "persistent-data/dm-space-map-metadata.h" /*----------------------------------------------------------------*/ -#define DM_CACHE_METADATA_BLOCK_SIZE 4096 +#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE /* FIXME: remove this restriction */ /* * The metadata device is currently limited in size. - * - * We have one block of index, which can hold 255 index entries. Each - * index entry contains allocation info about 16k metadata blocks. */ -#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) +#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS /* * A metadata device larger than 16GB triggers a warning. diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2c63326638b..1af40ee209e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio) return bio->bi_rw & (REQ_FLUSH | REQ_FUA); } +/* + * You must increment the deferred set whilst the prison cell is held. To + * encourage this, we ask for 'cell' to be passed in. + */ +static void inc_ds(struct cache *cache, struct bio *bio, + struct dm_bio_prison_cell *cell) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + BUG_ON(!cell); + BUG_ON(pb->all_io_entry); + + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); +} + static void issue(struct cache *cache, struct bio *bio) { unsigned long flags; @@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) +{ + inc_ds(cache, bio, cell); + issue(cache, bio); +} + static void defer_writethrough_bio(struct cache *cache, struct bio *bio) { unsigned long flags; @@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); + + /* + * No need to inc_ds() here, since the cell will be held for the + * duration of the io. 
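+ * inc_ds() is only needed on paths that issue a bio and then drop the
+ * cell, which is what inc_and_issue() bundles together.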
+ */ generic_make_request(bio); } @@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache, return; INIT_LIST_HEAD(&work); - if (pb->all_io_entry) - dm_deferred_entry_dec(pb->all_io_entry, &work); + dm_deferred_entry_dec(pb->all_io_entry, &work); if (!list_empty(&work)) queue_quiesced_migrations(cache, &work); @@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) else remap_to_cache(cache, bio, 0); + /* + * REQ_FLUSH is not directed at any particular block so we don't + * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH + * by dm-core. + */ issue(cache, bio); } @@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) &cache->stats.read_miss : &cache->stats.write_miss); } -static void issue_cache_bio(struct cache *cache, struct bio *bio, - struct per_bio_data *pb, - dm_oblock_t oblock, dm_cblock_t cblock) -{ - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - remap_to_cache_dirty(cache, bio, oblock, cblock); - issue(cache, bio); -} - static void process_bio(struct cache *cache, struct prealloc *structs, struct bio *bio) { @@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs, dm_oblock_t block = get_bio_block(cache, bio); struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; struct policy_result lookup_result; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); bool passthrough = passthrough_mode(&cache->features); bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); @@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, } else { /* FIXME: factor out issue_origin() */ - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); + inc_and_issue(cache, bio, new_ocell); } } else { inc_hit_counter(cache, bio); @@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs, if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) { - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - issue(cache, bio); - } else - issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); + inc_and_issue(cache, bio, new_ocell); + + } else { + remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + inc_and_issue(cache, bio, new_ocell); + } } break; case POLICY_MISS: inc_miss_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); + inc_and_issue(cache, bio, new_ocell); break; case POLICY_NEW: @@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) bio_list_init(&cache->deferred_flush_bios); spin_unlock_irqrestore(&cache->lock, flags); + /* + * These bios have already been through inc_ds() + */ while ((bio = bio_list_pop(&bios))) submit_bios ? 
generic_make_request(bio) : bio_io_error(bio); } @@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache) bio_list_init(&cache->deferred_writethrough_bios); spin_unlock_irqrestore(&cache->lock, flags); + /* + * These bios have already been through inc_ds() + */ while ((bio = bio_list_pop(&bios))) generic_make_request(bio); } @@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws) if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); + process_migrations(cache, &cache->need_commit_migrations, migration_failure); /* * FIXME: rollback metadata or just go into a @@ -2406,16 +2433,13 @@ out: return r; } -static int cache_map(struct dm_target *ti, struct bio *bio) +static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell) { - struct cache *cache = ti->private; - int r; dm_oblock_t block = get_bio_block(cache, bio); size_t pb_data_size = get_per_bio_data_size(cache); bool can_migrate = false; bool discarded_block; - struct dm_bio_prison_cell *cell; struct policy_result lookup_result; struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); @@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio) /* * Check to see if that block is currently migrating. */ - cell = alloc_prison_cell(cache); - if (!cell) { + *cell = alloc_prison_cell(cache); + if (!*cell) { defer_bio(cache, bio); return DM_MAPIO_SUBMITTED; } - r = bio_detain(cache, block, bio, cell, + r = bio_detain(cache, block, bio, *cell, (cell_free_fn) free_prison_cell, - cache, &cell); + cache, cell); if (r) { if (r < 0) defer_bio(cache, bio); @@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) r = policy_map(cache->policy, block, false, can_migrate, discarded_block, bio, &lookup_result); if (r == -EWOULDBLOCK) { - cell_defer(cache, cell, true); + cell_defer(cache, *cell, true); return DM_MAPIO_SUBMITTED; } else if (r) { DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); + cell_defer(cache, *cell, false); bio_io_error(bio); return DM_MAPIO_SUBMITTED; } @@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio) * We need to invalidate this block, so * defer for the worker thread. */ - cell_defer(cache, cell, true); + cell_defer(cache, *cell, true); r = DM_MAPIO_SUBMITTED; } else { - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); inc_miss_counter(cache, bio); remap_to_origin_clear_discard(cache, bio, block); - - cell_defer(cache, cell, false); } } else { inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - - cell_defer(cache, cell, false); } break; case POLICY_MISS: inc_miss_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (pb->req_nr != 0) { /* * This is a duplicate writethrough io that is no * longer needed because the block has been demoted. 
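 * (pb->req_nr != 0 identifies one of the extra bios dm core clones for
 * this target; only bio 0 is remapped to the origin in the branch
 * below.)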
*/ bio_endio(bio, 0); - cell_defer(cache, cell, false); - return DM_MAPIO_SUBMITTED; - } else { + cell_defer(cache, *cell, false); + r = DM_MAPIO_SUBMITTED; + + } else remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } + break; default: DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, (unsigned) lookup_result.op); + cell_defer(cache, *cell, false); bio_io_error(bio); r = DM_MAPIO_SUBMITTED; } @@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } +static int cache_map(struct dm_target *ti, struct bio *bio) +{ + int r; + struct dm_bio_prison_cell *cell; + struct cache *cache = ti->private; + + r = __cache_map(cache, bio, &cell); + if (r == DM_MAPIO_REMAPPED) { + inc_ds(cache, bio, cell); + cell_defer(cache, cell, false); + } + + return r; +} + static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) { struct cache *cache = ti->private; @@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", - (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), + (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, cache->sectors_per_block, @@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < cache->sectors_per_block || do_div(io_opt_sectors, cache->sectors_per_block)) { - blk_limits_io_min(limits, 0); + blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); } set_discard_limits(cache, limits); @@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4cba2d808af..2785007e0e4 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -59,7 +59,7 @@ struct dm_crypt_io { int error; sector_t sector; struct dm_crypt_io *base_io; -}; +} CRYPTO_MINALIGN_ATTR; struct dm_crypt_request { struct convert_context *ctx; @@ -162,6 +162,8 @@ struct crypt_config { */ unsigned int dmreq_start; + unsigned int per_bio_data_size; + unsigned long flags; unsigned int key_size; unsigned int key_parts; /* independent parts in key buffer */ @@ -895,6 +897,15 @@ static void crypt_alloc_req(struct crypt_config *cc, kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } +static void crypt_free_req(struct crypt_config *cc, + struct ablkcipher_request *req, struct bio *base_bio) +{ + struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); + + if ((struct ablkcipher_request *)(io + 1) != req) + mempool_free(req, cc->req_pool); +} + /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ @@ -1008,12 +1019,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) } } -static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, - struct bio *bio, sector_t sector) +static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, + struct bio *bio, sector_t sector) { - struct dm_crypt_io *io; - - io = mempool_alloc(cc->io_pool, GFP_NOIO); io->cc = cc; io->base_bio = bio; io->sector = sector; 
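	/*
	 * Callers now place struct dm_crypt_io in the bio's per-bio data;
	 * crypt_ctr() sizes that area as
	 *
	 *	sizeof(struct dm_crypt_io) + cc->dmreq_start +
	 *	sizeof(struct dm_crypt_request) + cc->iv_size
	 *
	 * so the ablkcipher request lives directly behind the io at
	 * (io + 1) and the common path needs no mempool allocation; see
	 * crypt_map() and crypt_free_req().
	 */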
@@ -1021,8 +1029,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, io->base_io = NULL; io->ctx.req = NULL; atomic_set(&io->io_pending, 0); - - return io; } static void crypt_inc_pending(struct dm_crypt_io *io) @@ -1046,8 +1052,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) return; if (io->ctx.req) - mempool_free(io->ctx.req, cc->req_pool); - mempool_free(io, cc->io_pool); + crypt_free_req(cc, io->ctx.req, base_bio); + if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size)) + mempool_free(io, cc->io_pool); if (likely(!base_io)) bio_endio(base_bio, error); @@ -1255,8 +1262,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) * between fragments, so switch to a new dm_crypt_io structure. */ if (unlikely(!crypt_finished && remaining)) { - new_io = crypt_io_alloc(io->cc, io->base_bio, - sector); + new_io = mempool_alloc(cc->io_pool, GFP_NOIO); + crypt_io_init(new_io, io->cc, io->base_bio, sector); crypt_inc_pending(new_io); crypt_convert_init(cc, &new_io->ctx, NULL, io->base_bio, sector); @@ -1325,7 +1332,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (error < 0) io->error = -EIO; - mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); + crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); if (!atomic_dec_and_test(&ctx->cc_pending)) return; @@ -1728,6 +1735,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + cc->per_bio_data_size = ti->per_bio_data_size = + sizeof(struct dm_crypt_io) + cc->dmreq_start + + sizeof(struct dm_crypt_request) + cc->iv_size; + cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; @@ -1824,7 +1835,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } - io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + io = dm_per_bio_data(bio, cc->per_bio_data_size); + crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + io->ctx.req = (struct ablkcipher_request *)(io + 1); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index db404a0f7e2..c09359db3a9 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -33,7 +33,6 @@ struct dm_io_client { struct io { unsigned long error_bits; atomic_t count; - struct completion *wait; struct dm_io_client *client; io_notify_fn callback; void *context; @@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, * We need an io object to keep track of the number of bios that * have been dispatched for a particular io. 
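 * dec_count() drops one reference per completed region; the final drop
 * goes through complete_io(), which returns the io to the client pool
 * and only then invokes the callback. sync_io() now shares this path
 * by passing sync_io_complete() with an on-stack struct sync_io as the
 * callback context.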
*---------------------------------------------------------------*/ -static void dec_count(struct io *io, unsigned int region, int error) +static void complete_io(struct io *io) { - if (error) - set_bit(region, &io->error_bits); + unsigned long error_bits = io->error_bits; + io_notify_fn fn = io->callback; + void *context = io->context; - if (atomic_dec_and_test(&io->count)) { - if (io->vma_invalidate_size) - invalidate_kernel_vmap_range(io->vma_invalidate_address, - io->vma_invalidate_size); + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); - if (io->wait) - complete(io->wait); + mempool_free(io, io->client->pool); + fn(error_bits, context); +} - else { - unsigned long r = io->error_bits; - io_notify_fn fn = io->callback; - void *context = io->context; +static void dec_count(struct io *io, unsigned int region, int error) +{ + if (error) + set_bit(region, &io->error_bits); - mempool_free(io, io->client->pool); - fn(r, context); - } - } + if (atomic_dec_and_test(&io->count)) + complete_io(io); } static void endio(struct bio *bio, int error) @@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } +struct sync_io { + unsigned long error_bits; + struct completion wait; +}; + +static void sync_io_complete(unsigned long error, void *context) +{ + struct sync_io *sio = context; + + sio->error_bits = error; + complete(&sio->wait); +} + static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, int rw, struct dpages *dp, unsigned long *error_bits) { - /* - * gcc <= 4.3 can't do the alignment for stack variables, so we must - * align it on our own. - * volatile prevents the optimizer from removing or reusing - * "io_" field from the stack frame (allowed in ANSI C). - */ - volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; - struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); - DECLARE_COMPLETION_ONSTACK(wait); + struct io *io; + struct sync_io sio; if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } + init_completion(&sio.wait); + + io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->wait = &wait; io->client = client; + io->callback = sync_io_complete; + io->context = &sio; io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; dispatch_io(rw, num_regions, where, dp, io, 1); - wait_for_completion_io(&wait); + wait_for_completion_io(&sio.wait); if (error_bits) - *error_bits = io->error_bits; + *error_bits = sio.error_bits; - return io->error_bits ? -EIO : 0; + return sio.error_bits ? -EIO : 0; } static int async_io(struct dm_io_client *client, unsigned int num_regions, @@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->wait = NULL; io->client = client; io->callback = fn; io->context = context; @@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set REQ_SYNC in -io_req->bi_rw. 
If you fail to do one of these, the IO will be submitted to - * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. + * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw. + * If you fail to do one of these, the IO will be submitted to the disk after + * q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, struct dm_io_region *where, unsigned long *sync_error_bits) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f4167b013d9..833d7e752f0 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -373,8 +373,6 @@ static int __must_push_back(struct multipath *m) dm_noflush_suspending(m->ti))); } -#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required) - /* * Map cloned requests */ @@ -402,11 +400,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone, if (!__must_push_back(m)) r = -EIO; /* Failed */ goto out_unlock; - } - if (!pg_ready(m)) { + } else if (m->queue_io || m->pg_init_required) { __pg_init_all_paths(m); goto out_unlock; } + if (set_mapinfo(m, map_context) < 0) /* ENOMEM, requeue */ goto out_unlock; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 09a688b3d48..50fca469caf 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr *bit *= sctx->region_table_entry_bits; } +static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr) +{ + unsigned long region_index; + unsigned bit; + + switch_get_position(sctx, region_nr, ®ion_index, &bit); + + return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + ((1 << sctx->region_table_entry_bits) - 1); +} + /* * Find which path to use at given offset. */ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) { - unsigned long region_index; - unsigned bit, path_nr; + unsigned path_nr; sector_t p; p = offset; @@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) else sector_div(p, sctx->region_size); - switch_get_position(sctx, p, ®ion_index, &bit); - path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & - ((1 << sctx->region_table_entry_bits) - 1); + path_nr = switch_region_table_read(sctx, p); /* This can only happen if the processor uses non-atomic stores. 
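 * switch_region_table_read() masks the ACCESS_ONCE() load down to
 * region_table_entry_bits, so a torn store can at worst yield an
 * out-of-range path number, which is exactly what this check rejects.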
*/ if (unlikely(path_nr >= sctx->nr_paths)) @@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string) } static int process_set_region_mappings(struct switch_ctx *sctx, - unsigned argc, char **argv) + unsigned argc, char **argv) { unsigned i; unsigned long region_index = 0; @@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx, unsigned long path_nr; const char *string = argv[i]; + if ((*string & 0xdf) == 'R') { + unsigned long cycle_length, num_write; + + string++; + if (unlikely(*string == ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + cycle_length = parse_hex(&string); + if (unlikely(*string != ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + string++; + if (unlikely(!*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + num_write = parse_hex(&string); + if (unlikely(*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + + if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) { + DMWARN("invalid set_region_mappings cycle length: %lu > %lu", + cycle_length - 1, region_index); + return -EINVAL; + } + if (unlikely(region_index + num_write < region_index) || + unlikely(region_index + num_write >= sctx->nr_regions)) { + DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu", + region_index, num_write, sctx->nr_regions); + return -EINVAL; + } + + while (num_write--) { + region_index++; + path_nr = switch_region_table_read(sctx, region_index - cycle_length); + switch_region_table_write(sctx, region_index, path_nr); + } + + continue; + } + if (*string == ':') region_index++; else { @@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti, static struct target_type switch_target = { .name = "switch", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = switch_ctr, .dtr = switch_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5f59f1e3e5b..f9c6cb8dbcf 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1386,6 +1386,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } +static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); +} + static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { @@ -1430,6 +1438,43 @@ static bool dm_table_supports_write_same(struct dm_table *t) return true; } +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +static bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Unless any target used by the table set discards_supported, + * require at least one underlying device to support discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard selectively must provide. 
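The new "R<cycle_length>,<num_write>" argument parsed above run-length-expands the mapping table: each of the next num_write regions copies the path of the region cycle_length slots earlier. Just that expansion step, as a standalone sketch (expand_repeat is an invented name; the driver does the same walk through switch_region_table_read() and switch_region_table_write(), after the bounds checks shown above):

#include <stdio.h>

static void expand_repeat(unsigned *table, unsigned long *region_index,
                          unsigned long cycle_length, unsigned long num_write)
{
    while (num_write--) {
        (*region_index)++;
        table[*region_index] = table[*region_index - cycle_length];
    }
}

int main(void)
{
    unsigned table[8] = { 0, 1, 2 };    /* paths for regions 0..2 */
    unsigned long region_index = 2;

    /* "R3,5": repeat the 3-entry cycle over the next 5 regions */
    expand_repeat(table, &region_index, 3, 5);

    for (unsigned i = 0; i < 8; i++)
        printf("%u ", table[i]);        /* 0 1 2 0 1 2 0 1 */
    printf("\n");
    return 0;
}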
+ */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_discard_bios) + continue; + + if (ti->discards_supported) + return 1; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1464,6 +1509,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; + if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) + queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + dm_table_set_integrity(t); /* @@ -1636,39 +1686,3 @@ void dm_table_run_md_queue_async(struct dm_table *t) } EXPORT_SYMBOL(dm_table_run_md_queue_async); -static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && blk_queue_discard(q); -} - -bool dm_table_supports_discards(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* - * Unless any target used by the table set discards_supported, - * require at least one underlying device to support discards. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting discard selectively must provide. - */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (!ti->num_discard_bios) - continue; - - if (ti->discards_supported) - return 1; - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_discard_capable, NULL)) - return 1; - } - - return 0; -} diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fc9c848a60c..4843801173f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -227,6 +227,7 @@ struct thin_c { struct list_head list; struct dm_dev *pool_dev; struct dm_dev *origin_dev; + sector_t origin_size; dm_thin_id dev_id; struct pool *pool; @@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, struct dm_thin_new_mapping { struct list_head list; - bool quiesced:1; - bool prepared:1; bool pass_discard:1; bool definitely_not_shared:1; + /* + * Track quiescing, copying and zeroing preparation actions. When this + * counter hits zero the block is prepared and can be inserted into the + * btree. + */ + atomic_t prepare_actions; + int err; struct thin_c *tc; dm_block_t virt_block; @@ -575,43 +581,41 @@ struct dm_thin_new_mapping { bio_end_io_t *saved_bi_end_io; }; -static void __maybe_add_mapping(struct dm_thin_new_mapping *m) +static void __complete_mapping_preparation(struct dm_thin_new_mapping *m) { struct pool *pool = m->tc->pool; - if (m->quiesced && m->prepared) { + if (atomic_dec_and_test(&m->prepare_actions)) { list_add_tail(&m->list, &pool->prepared_mappings); wake_worker(pool); } } -static void copy_complete(int read_err, unsigned long write_err, void *context) +static void complete_mapping_preparation(struct dm_thin_new_mapping *m) { unsigned long flags; - struct dm_thin_new_mapping *m = context; struct pool *pool = m->tc->pool; - m->err = read_err || write_err ? 
-EIO : 0; - spin_lock_irqsave(&pool->lock, flags); - m->prepared = true; - __maybe_add_mapping(m); + __complete_mapping_preparation(m); spin_unlock_irqrestore(&pool->lock, flags); } +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + struct dm_thin_new_mapping *m = context; + + m->err = read_err || write_err ? -EIO : 0; + complete_mapping_preparation(m); +} + static void overwrite_endio(struct bio *bio, int err) { - unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct dm_thin_new_mapping *m = h->overwrite_mapping; - struct pool *pool = m->tc->pool; m->err = err; - - spin_lock_irqsave(&pool->lock, flags); - m->prepared = true; - __maybe_add_mapping(m); - spin_unlock_irqrestore(&pool->lock, flags); + complete_mapping_preparation(m); } /*----------------------------------------------------------------*/ @@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) return m; } +static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, + sector_t begin, sector_t end) +{ + int r; + struct dm_io_region to; + + to.bdev = tc->pool_dev->bdev; + to.sector = begin; + to.count = end - begin; + + r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); + if (r < 0) { + DMERR_LIMIT("dm_kcopyd_zero() failed"); + copy_complete(1, 1, m); + } +} + +/* + * A partial copy also needs to zero the uncopied region. + */ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_dev *origin, dm_block_t data_origin, dm_block_t data_dest, - struct dm_bio_prison_cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio, + sector_t len) { int r; struct pool *pool = tc->pool; @@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, m->data_block = data_dest; m->cell = cell; + /* + * quiesce action + copy action + an extra reference held for the + * duration of this function (we may need to inc later for a + * partial zero). + */ + atomic_set(&m->prepare_actions, 3); + if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) - m->quiesced = true; + complete_mapping_preparation(m); /* already quiesced */ /* * IO to pool_dev remaps to the pool target's data_dev. @@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, from.bdev = origin->bdev; from.sector = data_origin * pool->sectors_per_block; - from.count = pool->sectors_per_block; + from.count = len; to.bdev = tc->pool_dev->bdev; to.sector = data_dest * pool->sectors_per_block; - to.count = pool->sectors_per_block; + to.count = len; r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 0, copy_complete, m); if (r < 0) { - mempool_free(m, pool->mapping_pool); DMERR_LIMIT("dm_kcopyd_copy() failed"); - cell_error(pool, cell); + copy_complete(1, 1, m); + + /* + * We allow the zero to be issued, to simplify the + * error path. Otherwise we'd need to start + * worrying about decrementing the prepare_actions + * counter. + */ + } + + /* + * Do we need to zero a tail region? 
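The atomic prepare_actions counter introduced above replaces the quiesced/prepared flag pair with a countdown: quiescing, copying and zeroing each drop one reference, and whoever drops the last one queues the mapping. The pattern in runnable C11 form (struct mapping here is a stand-in, not the driver's structure):

#include <stdatomic.h>
#include <stdio.h>

struct mapping {
    atomic_int prepare_actions;
    int err;
};

static void mapping_prepared(struct mapping *m)
{
    /* list_add_tail() + wake_worker() in the driver */
    printf("mapping prepared, err=%d\n", m->err);
}

static void complete_one_action(struct mapping *m)
{
    /* atomic_dec_and_test(): the drop that reaches zero completes */
    if (atomic_fetch_sub(&m->prepare_actions, 1) == 1)
        mapping_prepared(m);
}

int main(void)
{
    struct mapping m = { .err = 0 };

    /* quiesce + copy + the caller's own reference, as in schedule_copy() */
    atomic_init(&m.prepare_actions, 3);

    complete_one_action(&m);    /* quiesce finished */
    complete_one_action(&m);    /* copy finished */
    complete_one_action(&m);    /* caller drops its ref */
    return 0;
}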
+ */ + if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) { + atomic_inc(&m->prepare_actions); + ll_zero(tc, m, + data_dest * pool->sectors_per_block + len, + (data_dest + 1) * pool->sectors_per_block); } } + + complete_mapping_preparation(m); /* drop our ref */ } static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, @@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_bio_prison_cell *cell, struct bio *bio) { schedule_copy(tc, virt_block, tc->pool_dev, - data_origin, data_dest, cell, bio); -} - -static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_dest, - struct dm_bio_prison_cell *cell, struct bio *bio) -{ - schedule_copy(tc, virt_block, tc->origin_dev, - virt_block, data_dest, cell, bio); + data_origin, data_dest, cell, bio, + tc->pool->sectors_per_block); } static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, @@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - m->quiesced = true; - m->prepared = false; + atomic_set(&m->prepare_actions, 1); /* no need to quiesce */ m->tc = tc; m->virt_block = virt_block; m->data_block = data_block; @@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); inc_all_io_entry(pool, bio); remap_and_issue(tc, bio, data_block); - } else { - int r; - struct dm_io_region to; - to.bdev = tc->pool_dev->bdev; - to.sector = data_block * pool->sectors_per_block; - to.count = pool->sectors_per_block; + } else + ll_zero(tc, m, + data_block * pool->sectors_per_block, + (data_block + 1) * pool->sectors_per_block); +} - r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); - if (r < 0) { - mempool_free(m, pool->mapping_pool); - DMERR_LIMIT("dm_kcopyd_zero() failed"); - cell_error(pool, cell); - } - } +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + struct pool *pool = tc->pool; + sector_t virt_block_begin = virt_block * pool->sectors_per_block; + sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block; + + if (virt_block_end <= tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + pool->sectors_per_block); + + else if (virt_block_begin < tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + tc->origin_size - virt_block_begin); + + else + schedule_zero(tc, virt_block, data_dest, cell, bio); } /* @@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio) inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); - remap_to_origin_and_issue(tc, bio); + if (bio_end_sector(bio) <= tc->origin_size) + remap_to_origin_and_issue(tc, bio); + + else if (bio->bi_iter.bi_sector < tc->origin_size) { + zero_fill_bio(bio); + bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT; + remap_to_origin_and_issue(tc, bio); + + } else { + zero_fill_bio(bio); + bio_endio(bio, 0); + } } else provision_block(tc, bio, block, cell); break; @@ -3112,7 +3177,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < pool->sectors_per_block || do_div(io_opt_sectors, pool->sectors_per_block)) { - 
blk_limits_io_min(limits, 0); + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } @@ -3141,7 +3206,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3361,8 +3426,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) { list_del(&m->list); - m->quiesced = true; - __maybe_add_mapping(m); + __complete_mapping_preparation(m); } spin_unlock_irqrestore(&pool->lock, flags); } @@ -3401,6 +3465,16 @@ static void thin_postsuspend(struct dm_target *ti) noflush_work(tc, do_noflush_stop); } +static int thin_preresume(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + if (tc->origin_dev) + tc->origin_size = get_dev_size(tc->origin_dev->bdev); + + return 0; +} + /* * <nr mapped sectors> <highest mapped sector> */ @@ -3483,12 +3557,13 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, .map = thin_map, .end_io = thin_endio, + .preresume = thin_preresume, .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, diff --git a/drivers/md/dm.h b/drivers/md/dm.h index ed76126aac5..e81d2152fa6 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); -bool dm_table_supports_discards(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); diff --git a/drivers/mfd/rtsx_usb.c b/drivers/mfd/rtsx_usb.c index 6352bec8419..71f387ce8cb 100644 --- a/drivers/mfd/rtsx_usb.c +++ b/drivers/mfd/rtsx_usb.c @@ -744,6 +744,7 @@ static struct usb_device_id rtsx_usb_usb_ids[] = { { USB_DEVICE(0x0BDA, 0x0140) }, { } }; +MODULE_DEVICE_TABLE(usb, rtsx_usb_usb_ids); static struct usb_driver rtsx_usb_driver = { .name = "rtsx_usb", diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 452782bffeb..ede41f05c39 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -2028,8 +2028,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) /* complete ongoing async transfer before issuing discard */ if (card->host->areq) mmc_blk_issue_rw_rq(mq, NULL); - if (req->cmd_flags & REQ_SECURE && - !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN)) + if (req->cmd_flags & REQ_SECURE) ret = mmc_blk_issue_secdiscard_rq(mq, req); else ret = mmc_blk_issue_discard_rq(mq, req); @@ -2432,6 +2431,8 @@ static int mmc_blk_probe(struct mmc_card *card) if (!(card->csd.cmdclass & CCC_BLOCK_READ)) return -ENODEV; + mmc_fixup_device(card, blk_fixups); + md = mmc_blk_alloc(card); if (IS_ERR(md)) return PTR_ERR(md); @@ -2446,7 +2447,6 @@ static int mmc_blk_probe(struct mmc_card *card) goto out; mmc_set_drvdata(card, md); - mmc_fixup_device(card, blk_fixups); if (mmc_add_disk(md)) goto out; diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index d2dbf02022b..8a1f1240e05 100644 --- 
a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -180,7 +180,6 @@ static int mmc_bus_resume(struct device *dev) #endif #ifdef CONFIG_PM_RUNTIME - static int mmc_runtime_suspend(struct device *dev) { struct mmc_card *card = mmc_dev_to_card(dev); @@ -196,17 +195,10 @@ static int mmc_runtime_resume(struct device *dev) return host->bus_ops->runtime_resume(host); } - -static int mmc_runtime_idle(struct device *dev) -{ - return 0; -} - #endif /* !CONFIG_PM_RUNTIME */ static const struct dev_pm_ops mmc_bus_pm_ops = { - SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume, - mmc_runtime_idle) + SET_RUNTIME_PM_OPS(mmc_runtime_suspend, mmc_runtime_resume, NULL) SET_SYSTEM_SLEEP_PM_OPS(mmc_bus_suspend, mmc_bus_resume) }; diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 7dc0c85fdb6..d03a080fb9c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2102,7 +2102,8 @@ EXPORT_SYMBOL(mmc_can_sanitize); int mmc_can_secure_erase_trim(struct mmc_card *card) { - if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN) + if ((card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN) && + !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN)) return 1; return 0; } diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 793c6f7ddb0..1eda8dd8c86 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -324,13 +324,12 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd) } } + /* + * The EXT_CSD format is meant to be forward compatible. As long + * as CSD_STRUCTURE does not change, all values for EXT_CSD_REV + * are authorized, see JEDEC JESD84-B50 section B.8. + */ card->ext_csd.rev = ext_csd[EXT_CSD_REV]; - if (card->ext_csd.rev > 7) { - pr_err("%s: unrecognised EXT_CSD revision %d\n", - mmc_hostname(card->host), card->ext_csd.rev); - err = -EINVAL; - goto out; - } card->ext_csd.raw_sectors[0] = ext_csd[EXT_CSD_SEC_CNT + 0]; card->ext_csd.raw_sectors[1] = ext_csd[EXT_CSD_SEC_CNT + 1]; diff --git a/drivers/mmc/core/quirks.c b/drivers/mmc/core/quirks.c index 6c36fccaa1e..dd1d1e0fe32 100644 --- a/drivers/mmc/core/quirks.c +++ b/drivers/mmc/core/quirks.c @@ -91,7 +91,7 @@ void mmc_fixup_device(struct mmc_card *card, const struct mmc_fixup *table) (f->cis_device == card->cis.device || f->cis_device == (u16) SDIO_ANY_ID) && rev >= f->rev_start && rev <= f->rev_end) { - dev_dbg(&card->dev, "calling %pF\n", f->vendor_fixup); + dev_dbg(&card->dev, "calling %pf\n", f->vendor_fixup); f->vendor_fixup(card, f->data); } } diff --git a/drivers/mmc/core/sd_ops.c b/drivers/mmc/core/sd_ops.c index 274ef00b446..48d0c93ba25 100644 --- a/drivers/mmc/core/sd_ops.c +++ b/drivers/mmc/core/sd_ops.c @@ -184,6 +184,9 @@ int mmc_send_app_op_cond(struct mmc_host *host, u32 ocr, u32 *rocr) mmc_delay(10); } + if (!i) + pr_err("%s: card never left busy state\n", mmc_hostname(host)); + if (rocr && !mmc_host_is_spi(host)) *rocr = cmd.resp[0]; diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index a5652548230..45113582246 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -290,6 +290,18 @@ config MMC_MOXART be found on some embedded hardware such as UC-7112-LX. If you have a controller with this interface, say Y here. +config MMC_SDHCI_ST + tristate "SDHCI support on STMicroelectronics SoC" + depends on ARCH_STI + depends on MMC_SDHCI_PLTFM + select MMC_SDHCI_IO_ACCESSORS + help + This selects the Secure Digital Host Controller Interface in + STMicroelectronics SoCs. 
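The mmc_send_app_op_cond() change above adds a diagnostic for the case where the bounded retry loop runs out. The shape of that loop as a runnable sketch (wait_not_busy and ready_now are invented names; the point is that i stays nonzero on success, so !i cleanly identifies the timeout):

#include <stdio.h>
#include <unistd.h>

static int wait_not_busy(int (*ready)(void), unsigned tries)
{
    unsigned i;

    for (i = tries; i; i--) {
        if (ready())
            break;
        usleep(10 * 1000);      /* mmc_delay(10) in the driver */
    }

    if (!i) {
        fprintf(stderr, "card never left busy state\n");
        return -1;              /* the driver just logs and carries on */
    }
    return 0;
}

static int ready_now(void)
{
    return 1;
}

int main(void)
{
    return wait_not_busy(ready_now, 100);
}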
+ + If you have a controller with this interface, say Y or M here. + If unsure, say N. + config MMC_OMAP tristate "TI OMAP Multimedia Card Interface support" depends on ARCH_OMAP @@ -303,6 +315,7 @@ config MMC_OMAP config MMC_OMAP_HS tristate "TI OMAP High Speed Multimedia Card Interface support" + depends on HAS_DMA depends on ARCH_OMAP2PLUS || COMPILE_TEST help This selects the TI OMAP High Speed Multimedia card Interface. @@ -343,7 +356,7 @@ config MMC_ATMELMCI config MMC_SDHCI_MSM tristate "Qualcomm SDHCI Controller Support" - depends on ARCH_QCOM + depends on ARCH_QCOM || (ARM && COMPILE_TEST) depends on MMC_SDHCI_PLTFM help This selects the Secure Digital Host Controller Interface (SDHCI) @@ -440,6 +453,7 @@ config MMC_SPI config MMC_S3C tristate "Samsung S3C SD/MMC Card Interface support" depends on ARCH_S3C24XX + depends on S3C24XX_DMAC help This selects a driver for the MCI interface found in Samsung's S3C2410, S3C2412, S3C2440, S3C2442 CPUs. @@ -477,15 +491,6 @@ config MMC_S3C_DMA working properly and needs to be debugged before this option is useful. -config MMC_S3C_PIODMA - bool "Support for both PIO and DMA" - help - Compile both the PIO and DMA transfer routines into the - driver and let the platform select at run-time which one - is best. - - See notes for the DMA option. - endchoice config MMC_SDRICOH_CS @@ -623,7 +628,7 @@ config MMC_DW_PCI config MMC_SH_MMCIF tristate "SuperH Internal MMCIF support" - depends on MMC_BLOCK + depends on MMC_BLOCK && HAS_DMA depends on SUPERH || ARCH_SHMOBILE || COMPILE_TEST help This selects the MMC Host Interface controller (MMCIF). @@ -697,6 +702,7 @@ config MMC_WMT config MMC_USDHI6ROL0 tristate "Renesas USDHI6ROL0 SD/SDIO Host Controller support" + depends on HAS_DMA help This selects support for the Renesas USDHI6ROL0 SD/SDIO Host Controller diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 7f81ddf1dd2..f211eede8db 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_MMC_SDHCI_OF_HLWD) += sdhci-of-hlwd.o obj-$(CONFIG_MMC_SDHCI_BCM_KONA) += sdhci-bcm-kona.o obj-$(CONFIG_MMC_SDHCI_BCM2835) += sdhci-bcm2835.o obj-$(CONFIG_MMC_SDHCI_MSM) += sdhci-msm.o +obj-$(CONFIG_MMC_SDHCI_ST) += sdhci-st.o ifeq ($(CONFIG_CB710_DEBUG),y) CFLAGS-cb710-mmc += -DDEBUG diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 1ac227c603b..8f216edbdf0 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -111,8 +111,7 @@ static const u8 tuning_blk_pattern_8bit[] = { 0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, }; -static inline bool dw_mci_fifo_reset(struct dw_mci *host); -static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host); +static bool dw_mci_reset(struct dw_mci *host); #if defined(CONFIG_DEBUG_FS) static int dw_mci_req_show(struct seq_file *s, void *v) @@ -997,7 +996,8 @@ static int dw_mci_get_ro(struct mmc_host *mmc) int gpio_ro = mmc_gpio_get_ro(mmc); /* Use platform get_ro function, else try on board write protect */ - if (slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) + if ((slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) || + (slot->host->quirks & DW_MCI_QUIRK_NO_WRITE_PROTECT)) read_only = 0; else if (!IS_ERR_VALUE(gpio_ro)) read_only = gpio_ro; @@ -1235,7 +1235,7 @@ static int dw_mci_data_complete(struct dw_mci *host, struct mmc_data *data) * After an error, there may be data lingering * in the FIFO */ - dw_mci_fifo_reset(host); + dw_mci_reset(host); } else { data->bytes_xfered = data->blocks * data->blksz; data->error = 0; 
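dw_mci_get_ro() above now resolves write protection in a fixed order: a slot or host "no write protect" quirk wins, then a wired-up read-only GPIO, then the controller register. A compact sketch of that decision chain (struct slot and its fields are illustrative stand-ins; gpio_ro < 0 plays the role of IS_ERR_VALUE()):

#include <stdbool.h>
#include <stdio.h>

struct slot {
    bool slot_no_wp;    /* DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT */
    bool host_no_wp;    /* new: DW_MCI_QUIRK_NO_WRITE_PROTECT */
    int  gpio_ro;       /* < 0 means no GPIO is wired up */
    bool reg_wp;        /* write-protect bit from the controller */
};

static bool read_only(const struct slot *s)
{
    if (s->slot_no_wp || s->host_no_wp)
        return false;           /* quirk overrides everything */
    if (s->gpio_ro >= 0)
        return s->gpio_ro;      /* board-level GPIO next */
    return s->reg_wp;           /* fall back to the register */
}

int main(void)
{
    struct slot s = { .host_no_wp = true, .gpio_ro = 1, .reg_wp = true };

    printf("%d\n", read_only(&s));      /* 0: the quirk wins */
    return 0;
}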
@@ -1352,7 +1352,7 @@ static void dw_mci_tasklet_func(unsigned long priv) /* CMD error in data command */ if (mrq->cmd->error && mrq->data) - dw_mci_fifo_reset(host); + dw_mci_reset(host); host->cmd = NULL; host->data = NULL; @@ -1963,14 +1963,8 @@ static void dw_mci_work_routine_card(struct work_struct *work) } /* Power down slot */ - if (present == 0) { - /* Clear down the FIFO */ - dw_mci_fifo_reset(host); -#ifdef CONFIG_MMC_DW_IDMAC - dw_mci_idmac_reset(host); -#endif - - } + if (present == 0) + dw_mci_reset(host); spin_unlock_bh(&host->lock); @@ -2021,8 +2015,11 @@ static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot) /* get quirks */ for (idx = 0; idx < ARRAY_SIZE(of_slot_quirks); idx++) - if (of_get_property(np, of_slot_quirks[idx].quirk, NULL)) + if (of_get_property(np, of_slot_quirks[idx].quirk, NULL)) { + dev_warn(dev, "Slot quirk %s is deprecated\n", + of_slot_quirks[idx].quirk); quirks |= of_slot_quirks[idx].id; + } return quirks; } @@ -2208,8 +2205,11 @@ static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset) return false; } -static inline bool dw_mci_fifo_reset(struct dw_mci *host) +static bool dw_mci_reset(struct dw_mci *host) { + u32 flags = SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET; + bool ret = false; + /* * Resetting generates a block interrupt, hence setting * the scatter-gather pointer to NULL. @@ -2219,15 +2219,60 @@ static inline bool dw_mci_fifo_reset(struct dw_mci *host) host->sg = NULL; } - return dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET); -} + if (host->use_dma) + flags |= SDMMC_CTRL_DMA_RESET; -static inline bool dw_mci_ctrl_all_reset(struct dw_mci *host) -{ - return dw_mci_ctrl_reset(host, - SDMMC_CTRL_FIFO_RESET | - SDMMC_CTRL_RESET | - SDMMC_CTRL_DMA_RESET); + if (dw_mci_ctrl_reset(host, flags)) { + /* + * In all cases we clear the RAWINTS register to clear any interrupts.
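The reset path that continues below waits up to 500 ms for the DMA request bit to drop and reports an error instead of spinning forever. The same bounded-poll idiom in runnable userspace form (now_ms and poll_until_clear are invented stand-ins for the jiffies arithmetic and the cpu_relax() loop):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static long long now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

static bool poll_until_clear(bool (*bit_set)(void), unsigned timeout_ms)
{
    long long deadline = now_ms() + timeout_ms;

    while (bit_set()) {
        if (now_ms() >= deadline)
            return false;       /* dev_err(...) and bail out in the driver */
    }
    return true;
}

static bool dma_req_stuck(void)
{
    return true;                /* simulate a bit that never clears */
}

int main(void)
{
    if (!poll_until_clear(dma_req_stuck, 50))
        puts("timeout waiting for busy bit to clear");
    return 0;
}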
+ */ + mci_writel(host, RINTSTS, 0xFFFFFFFF); + + /* if using dma we wait for dma_req to clear */ + if (host->use_dma) { + unsigned long timeout = jiffies + msecs_to_jiffies(500); + u32 status; + do { + status = mci_readl(host, STATUS); + if (!(status & SDMMC_STATUS_DMA_REQ)) + break; + cpu_relax(); + } while (time_before(jiffies, timeout)); + + if (status & SDMMC_STATUS_DMA_REQ) { + dev_err(host->dev, + "%s: Timeout waiting for dma_req to " + "clear during reset\n", __func__); + goto ciu_out; + } + + /* when using DMA next we reset the fifo again */ + if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_FIFO_RESET)) + goto ciu_out; + } + } else { + /* if the controller reset bit did clear, then set clock regs */ + if (!(mci_readl(host, CTRL) & SDMMC_CTRL_RESET)) { + dev_err(host->dev, "%s: fifo/dma reset bits didn't " + "clear but ciu was reset, doing clock update\n", + __func__); + goto ciu_out; + } + } + +#if IS_ENABLED(CONFIG_MMC_DW_IDMAC) + /* It is also recommended that we reset and reprogram idmac */ + dw_mci_idmac_reset(host); +#endif + + ret = true; + +ciu_out: + /* After a CTRL reset we need to have CIU set clock registers */ + mci_send_cmd(host->cur_slot, SDMMC_CMD_UPD_CLK, 0); + + return ret; } #ifdef CONFIG_OF @@ -2238,6 +2283,9 @@ static struct dw_mci_of_quirks { { .quirk = "broken-cd", .id = DW_MCI_QUIRK_BROKEN_CARD_DETECTION, + }, { + .quirk = "disable-wp", + .id = DW_MCI_QUIRK_NO_WRITE_PROTECT, }, }; @@ -2425,7 +2473,7 @@ int dw_mci_probe(struct dw_mci *host) } /* Reset all blocks */ - if (!dw_mci_ctrl_all_reset(host)) + if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS)) return -ENODEV; host->dma_ops = host->pdata->dma_ops; @@ -2612,7 +2660,7 @@ int dw_mci_resume(struct dw_mci *host) } } - if (!dw_mci_ctrl_all_reset(host)) { + if (!dw_mci_ctrl_reset(host, SDMMC_CTRL_ALL_RESET_FLAGS)) { ret = -ENODEV; return ret; } diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h index 738fa241d05..08fd956d81f 100644 --- a/drivers/mmc/host/dw_mmc.h +++ b/drivers/mmc/host/dw_mmc.h @@ -129,6 +129,7 @@ #define SDMMC_CMD_INDX(n) ((n) & 0x1F) /* Status register defines */ #define SDMMC_GET_FCNT(x) (((x)>>17) & 0x1FFF) +#define SDMMC_STATUS_DMA_REQ BIT(31) /* FIFOTH register defines */ #define SDMMC_SET_FIFOTH(m, r, t) (((m) & 0x7) << 28 | \ ((r) & 0xFFF) << 16 | \ @@ -150,6 +151,10 @@ /* Card read threshold */ #define SDMMC_SET_RD_THLD(v, x) (((v) & 0x1FFF) << 16 | (x)) +/* All ctrl reset bits */ +#define SDMMC_CTRL_ALL_RESET_FLAGS \ + (SDMMC_CTRL_RESET | SDMMC_CTRL_FIFO_RESET | SDMMC_CTRL_DMA_RESET) + /* Register access macros */ #define mci_readl(dev, reg) \ __raw_readl((dev)->regs + SDMMC_##reg) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 7ad463e9741..e4d47070415 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -52,34 +52,53 @@ static unsigned int fmax = 515633; * struct variant_data - MMCI variant-specific quirks * @clkreg: default value for MCICLOCK register * @clkreg_enable: enable value for MMCICLOCK register + * @clkreg_8bit_bus_enable: enable value for 8 bit bus + * @clkreg_neg_edge_enable: enable value for inverted data/cmd output * @datalength_bits: number of bits in the MMCIDATALENGTH register * @fifosize: number of bytes that can be written when MMCI_TXFIFOEMPTY * is asserted (likewise for RX) * @fifohalfsize: number of bytes that can be written when MCI_TXFIFOHALFEMPTY * is asserted (likewise for RX) + * @data_cmd_enable: enable value for data commands. 
* @sdio: variant supports SDIO * @st_clkdiv: true if using a ST-specific clock divider algorithm + * @datactrl_mask_ddrmode: ddr mode mask in datactrl register. * @blksz_datactrl16: true if Block size is at b16..b30 position in datactrl register + * @blksz_datactrl4: true if Block size is at b4..b16 position in datactrl + * register * @pwrreg_powerup: power up value for MMCIPOWER register + * @f_max: maximum clk frequency supported by the controller. * @signal_direction: input/out direction of bus signals can be indicated * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock * @busy_detect: true if busy detection on dat0 is supported * @pwrreg_nopower: bits in MMCIPOWER don't control ext. power supply + * @explicit_mclk_control: enable explicit mclk control in driver. + * @qcom_fifo: enables qcom specific fifo pio read logic. + * @reversed_irq_handling: handle data irq before cmd irq. */ struct variant_data { unsigned int clkreg; unsigned int clkreg_enable; + unsigned int clkreg_8bit_bus_enable; + unsigned int clkreg_neg_edge_enable; unsigned int datalength_bits; unsigned int fifosize; unsigned int fifohalfsize; + unsigned int data_cmd_enable; + unsigned int datactrl_mask_ddrmode; bool sdio; bool st_clkdiv; bool blksz_datactrl16; + bool blksz_datactrl4; u32 pwrreg_powerup; + u32 f_max; bool signal_direction; bool pwrreg_clkgate; bool busy_detect; bool pwrreg_nopower; + bool explicit_mclk_control; + bool qcom_fifo; + bool reversed_irq_handling; }; static struct variant_data variant_arm = { @@ -87,6 +106,8 @@ static struct variant_data variant_arm = { .fifohalfsize = 8 * 4, .datalength_bits = 16, .pwrreg_powerup = MCI_PWR_UP, + .f_max = 100000000, + .reversed_irq_handling = true, }; static struct variant_data variant_arm_extended_fifo = { @@ -94,6 +115,7 @@ static struct variant_data variant_arm_extended_fifo = { .fifohalfsize = 64 * 4, .datalength_bits = 16, .pwrreg_powerup = MCI_PWR_UP, + .f_max = 100000000, }; static struct variant_data variant_arm_extended_fifo_hwfc = { @@ -102,15 +124,18 @@ .clkreg_enable = MCI_ARM_HWFCEN, .datalength_bits = 16, .pwrreg_powerup = MCI_PWR_UP, + .f_max = 100000000, }; static struct variant_data variant_u300 = { .fifosize = 16 * 4, .fifohalfsize = 8 * 4, .clkreg_enable = MCI_ST_U300_HWFCEN, + .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS, .datalength_bits = 16, .sdio = true, .pwrreg_powerup = MCI_PWR_ON, + .f_max = 100000000, .signal_direction = true, .pwrreg_clkgate = true, .pwrreg_nopower = true, @@ -124,6 +149,7 @@ static struct variant_data variant_nomadik = { .sdio = true, .st_clkdiv = true, .pwrreg_powerup = MCI_PWR_ON, + .f_max = 100000000, .signal_direction = true, .pwrreg_clkgate = true, .pwrreg_nopower = true, @@ -134,10 +160,13 @@ static struct variant_data variant_ux500 = { .fifohalfsize = 8 * 4, .clkreg = MCI_CLK_ENABLE, .clkreg_enable = MCI_ST_UX500_HWFCEN, + .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS, + .clkreg_neg_edge_enable = MCI_ST_UX500_NEG_EDGE, .datalength_bits = 24, .sdio = true, .st_clkdiv = true, .pwrreg_powerup = MCI_PWR_ON, + .f_max = 100000000, .signal_direction = true, .pwrreg_clkgate = true, .busy_detect = true, @@ -149,17 +178,38 @@ static struct variant_data variant_ux500v2 = { .fifohalfsize = 8 * 4, .clkreg = MCI_CLK_ENABLE, .clkreg_enable = MCI_ST_UX500_HWFCEN, + .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS, + .clkreg_neg_edge_enable = MCI_ST_UX500_NEG_EDGE, + .datactrl_mask_ddrmode = MCI_ST_DPSM_DDRMODE, .datalength_bits = 24, .sdio = true, .st_clkdiv =
true, .blksz_datactrl16 = true, .pwrreg_powerup = MCI_PWR_ON, + .f_max = 100000000, .signal_direction = true, .pwrreg_clkgate = true, .busy_detect = true, .pwrreg_nopower = true, }; +static struct variant_data variant_qcom = { + .fifosize = 16 * 4, + .fifohalfsize = 8 * 4, + .clkreg = MCI_CLK_ENABLE, + .clkreg_enable = MCI_QCOM_CLK_FLOWENA | + MCI_QCOM_CLK_SELECT_IN_FBCLK, + .clkreg_8bit_bus_enable = MCI_QCOM_CLK_WIDEBUS_8, + .datactrl_mask_ddrmode = MCI_QCOM_CLK_SELECT_IN_DDR_MODE, + .data_cmd_enable = MCI_QCOM_CSPM_DATCMD, + .blksz_datactrl4 = true, + .datalength_bits = 24, + .pwrreg_powerup = MCI_PWR_UP, + .f_max = 208000000, + .explicit_mclk_control = true, + .qcom_fifo = true, +}; + static int mmci_card_busy(struct mmc_host *mmc) { struct mmci_host *host = mmc_priv(mmc); @@ -260,7 +310,9 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired) host->cclk = 0; if (desired) { - if (desired >= host->mclk) { + if (variant->explicit_mclk_control) { + host->cclk = host->mclk; + } else if (desired >= host->mclk) { clk = MCI_CLK_BYPASS; if (variant->st_clkdiv) clk |= MCI_ST_UX500_NEG_EDGE; @@ -299,11 +351,11 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired) if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_4) clk |= MCI_4BIT_BUS; if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_8) - clk |= MCI_ST_8BIT_BUS; + clk |= variant->clkreg_8bit_bus_enable; if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 || host->mmc->ios.timing == MMC_TIMING_MMC_DDR52) - clk |= MCI_ST_UX500_NEG_EDGE; + clk |= variant->clkreg_neg_edge_enable; mmci_write_clkreg(host, clk); } @@ -719,7 +771,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data) data->bytes_xfered = 0; clks = (unsigned long long)data->timeout_ns * host->cclk; - do_div(clks, 1000000000UL); + do_div(clks, NSEC_PER_SEC); timeout = data->timeout_clks + (unsigned int)clks; @@ -732,6 +784,8 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data) if (variant->blksz_datactrl16) datactrl = MCI_DPSM_ENABLE | (data->blksz << 16); + else if (variant->blksz_datactrl4) + datactrl = MCI_DPSM_ENABLE | (data->blksz << 4); else datactrl = MCI_DPSM_ENABLE | blksz_bits << 4; @@ -767,7 +821,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data) if (host->mmc->ios.timing == MMC_TIMING_UHS_DDR50 || host->mmc->ios.timing == MMC_TIMING_MMC_DDR52) - datactrl |= MCI_ST_DPSM_DDRMODE; + datactrl |= variant->datactrl_mask_ddrmode; /* * Attempt to use DMA operation mode, if this @@ -812,7 +866,7 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c) if (readl(base + MMCICOMMAND) & MCI_CPSM_ENABLE) { writel(0, base + MMCICOMMAND); - udelay(1); + mmci_reg_delay(host); } c |= cmd->opcode | MCI_CPSM_ENABLE; @@ -824,6 +878,9 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c) if (/*interrupt*/0) c |= MCI_CPSM_INTERRUPT; + if (mmc_cmd_type(cmd) == MMC_CMD_ADTC) + c |= host->variant->data_cmd_enable; + host->cmd = cmd; writel(cmd->arg, base + MMCIARGUMENT); @@ -834,6 +891,10 @@ static void mmci_data_irq(struct mmci_host *host, struct mmc_data *data, unsigned int status) { + /* Make sure we have data to handle */ + if (!data) + return; + /* First check for errors */ if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR| MCI_TXUNDERRUN|MCI_RXOVERRUN)) { @@ -902,9 +963,17 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd, unsigned int status) { void __iomem *base = host->base; - bool sbc = (cmd == host->mrq->sbc); 
- bool busy_resp = host->variant->busy_detect && - (cmd->flags & MMC_RSP_BUSY); + bool sbc, busy_resp; + + if (!cmd) + return; + + sbc = (cmd == host->mrq->sbc); + busy_resp = host->variant->busy_detect && (cmd->flags & MMC_RSP_BUSY); + + if (!((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT| + MCI_CMDSENT|MCI_CMDRESPEND))) + return; /* Check if we need to wait for busy completion. */ if (host->busy_status && (status & MCI_ST_CARDBUSY)) @@ -957,15 +1026,34 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd, } } +static int mmci_get_rx_fifocnt(struct mmci_host *host, u32 status, int remain) +{ + return remain - (readl(host->base + MMCIFIFOCNT) << 2); +} + +static int mmci_qcom_get_rx_fifocnt(struct mmci_host *host, u32 status, int r) +{ + /* + * on qcom SDCC4 only 8 words are used in each burst so only 8 addresses + * from the fifo range should be used + */ + if (status & MCI_RXFIFOHALFFULL) + return host->variant->fifohalfsize; + else if (status & MCI_RXDATAAVLBL) + return 4; + + return 0; +} + static int mmci_pio_read(struct mmci_host *host, char *buffer, unsigned int remain) { void __iomem *base = host->base; char *ptr = buffer; - u32 status; + u32 status = readl(host->base + MMCISTATUS); int host_remain = host->size; do { - int count = host_remain - (readl(base + MMCIFIFOCNT) << 2); + int count = host->get_rx_fifocnt(host, status, host_remain); if (count > remain) count = remain; @@ -1132,9 +1220,6 @@ static irqreturn_t mmci_irq(int irq, void *dev_id) spin_lock(&host->lock); do { - struct mmc_command *cmd; - struct mmc_data *data; - status = readl(host->base + MMCISTATUS); if (host->singleirq) { @@ -1154,16 +1239,13 @@ static irqreturn_t mmci_irq(int irq, void *dev_id) dev_dbg(mmc_dev(host->mmc), "irq0 (data+cmd) %08x\n", status); - cmd = host->cmd; - if ((status|host->busy_status) & (MCI_CMDCRCFAIL|MCI_CMDTIMEOUT| - MCI_CMDSENT|MCI_CMDRESPEND) && cmd) - mmci_cmd_irq(host, cmd, status); - - data = host->data; - if (status & (MCI_DATACRCFAIL|MCI_DATATIMEOUT|MCI_STARTBITERR| - MCI_TXUNDERRUN|MCI_RXOVERRUN|MCI_DATAEND| - MCI_DATABLOCKEND) && data) - mmci_data_irq(host, data, status); + if (host->variant->reversed_irq_handling) { + mmci_data_irq(host, host->data, status); + mmci_cmd_irq(host, host->cmd, status); + } else { + mmci_cmd_irq(host, host->cmd, status); + mmci_data_irq(host, host->data, status); + } /* Don't poll for busy completion in irq context. */ if (host->busy_status) @@ -1296,6 +1378,17 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) if (!ios->clock && variant->pwrreg_clkgate) pwr &= ~MCI_PWR_ON; + if (host->variant->explicit_mclk_control && + ios->clock != host->clock_cache) { + ret = clk_set_rate(host->clk, ios->clock); + if (ret < 0) + dev_err(mmc_dev(host->mmc), + "Error setting clock rate (%d)\n", ret); + else + host->mclk = clk_get_rate(host->clk); + } + host->clock_cache = ios->clock; + spin_lock_irqsave(&host->lock, flags); mmci_set_clkreg(host, ios->clock); @@ -1443,6 +1536,11 @@ static int mmci_probe(struct amba_device *dev, if (ret) goto host_free; + if (variant->qcom_fifo) + host->get_rx_fifocnt = mmci_qcom_get_rx_fifocnt; + else + host->get_rx_fifocnt = mmci_get_rx_fifocnt; + host->plat = plat; host->variant = variant; host->mclk = clk_get_rate(host->clk); @@ -1451,8 +1549,8 @@ static int mmci_probe(struct amba_device *dev, * so we try to adjust the clock down to this, * (if possible). 
*/ - if (host->mclk > 100000000) { - ret = clk_set_rate(host->clk, 100000000); + if (host->mclk > variant->f_max) { + ret = clk_set_rate(host->clk, variant->f_max); if (ret < 0) goto clk_disable; host->mclk = clk_get_rate(host->clk); } @@ -1471,9 +1569,12 @@ static int mmci_probe(struct amba_device *dev, * The ARM and ST versions of the block have slightly different * clock divider equations which means that the minimum divider * differs too. + * On Qualcomm-like controllers, get the nearest minimum clock to 100 kHz */ if (variant->st_clkdiv) mmc->f_min = DIV_ROUND_UP(host->mclk, 257); + else if (variant->explicit_mclk_control) + mmc->f_min = clk_round_rate(host->clk, 100000); else mmc->f_min = DIV_ROUND_UP(host->mclk, 512); /* @@ -1483,9 +1584,14 @@ * the block, of course. */ if (mmc->f_max) - mmc->f_max = min(host->mclk, mmc->f_max); + mmc->f_max = variant->explicit_mclk_control ? + min(variant->f_max, mmc->f_max) : + min(host->mclk, mmc->f_max); else - mmc->f_max = min(host->mclk, fmax); + mmc->f_max = variant->explicit_mclk_control ? + fmax : min(host->mclk, fmax); + + dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max); /* Get regulators and the supported OCR mask */ @@ -1752,6 +1858,12 @@ static struct amba_id mmci_ids[] = { .mask = 0xf0ffffff, .data = &variant_ux500v2, }, + /* Qualcomm variants */ + { + .id = 0x00051180, + .mask = 0x000fffff, + .data = &variant_qcom, + }, { 0, 0 }, }; diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h index 347d942d740..a1f5e4f49e2 100644 --- a/drivers/mmc/host/mmci.h +++ b/drivers/mmc/host/mmci.h @@ -41,6 +41,15 @@ /* Modified PL180 on Versatile Express platform */ #define MCI_ARM_HWFCEN (1 << 12) +/* Modified on Qualcomm Integrations */ +#define MCI_QCOM_CLK_WIDEBUS_8 (BIT(10) | BIT(11)) +#define MCI_QCOM_CLK_FLOWENA BIT(12) +#define MCI_QCOM_CLK_INVERTOUT BIT(13) + +/* select in latch data and command in */ +#define MCI_QCOM_CLK_SELECT_IN_FBCLK BIT(15) +#define MCI_QCOM_CLK_SELECT_IN_DDR_MODE (BIT(14) | BIT(15)) + #define MMCIARGUMENT 0x008 #define MMCICOMMAND 0x00c #define MCI_CPSM_RESPONSE (1 << 6) @@ -54,6 +63,14 @@ #define MCI_ST_NIEN (1 << 13) #define MCI_ST_CE_ATACMD (1 << 14) +/* Modified on Qualcomm Integrations */ +#define MCI_QCOM_CSPM_DATCMD BIT(12) +#define MCI_QCOM_CSPM_MCIABORT BIT(13) +#define MCI_QCOM_CSPM_CCSENABLE BIT(14) +#define MCI_QCOM_CSPM_CCSDISABLE BIT(15) +#define MCI_QCOM_CSPM_AUTO_CMD19 BIT(16) +#define MCI_QCOM_CSPM_AUTO_CMD21 BIT(21) + #define MMCIRESPCMD 0x010 #define MMCIRESPONSE0 0x014 #define MMCIRESPONSE1 0x018 @@ -191,6 +208,8 @@ struct mmci_host { spinlock_t lock; unsigned int mclk; + /* cached value of requested clk in set_ios */ + unsigned int clock_cache; unsigned int cclk; u32 pwr_reg; u32 pwr_reg_add; @@ -210,6 +229,7 @@ struct mmci_host { /* pio stuff */ struct sg_mapping_iter sg_miter; unsigned int size; + int (*get_rx_fifocnt)(struct mmci_host *h, u32 status, int remain); #ifdef CONFIG_DMA_ENGINE /* DMA stuff */ diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index 74924a04026..b4b1efbf6c1 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -13,7 +13,6 @@ * warranty of any kind, whether express or implied.
*/ -#include <linux/version.h> #include <linux/module.h> #include <linux/init.h> #include <linux/platform_device.h> diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index babfea03ba8..140885a5a4e 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -86,7 +86,8 @@ static int mxs_mmc_get_cd(struct mmc_host *mmc) if (ret >= 0) return ret; - present = !(readl(ssp->base + HW_SSP_STATUS(ssp)) & + present = mmc->caps & MMC_CAP_NEEDS_POLL || + !(readl(ssp->base + HW_SSP_STATUS(ssp)) & BM_SSP_STATUS_CARD_DETECT); if (mmc->caps2 & MMC_CAP2_CD_ACTIVE_HIGH) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 6b7b7558592..965672663ef 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -29,6 +29,7 @@ #include <linux/timer.h> #include <linux/clk.h> #include <linux/of.h> +#include <linux/of_irq.h> #include <linux/of_gpio.h> #include <linux/of_device.h> #include <linux/omap-dmaengine.h> @@ -36,6 +37,7 @@ #include <linux/mmc/core.h> #include <linux/mmc/mmc.h> #include <linux/io.h> +#include <linux/irq.h> #include <linux/gpio.h> #include <linux/regulator/consumer.h> #include <linux/pinctrl/consumer.h> @@ -54,6 +56,7 @@ #define OMAP_HSMMC_RSP54 0x0118 #define OMAP_HSMMC_RSP76 0x011C #define OMAP_HSMMC_DATA 0x0120 +#define OMAP_HSMMC_PSTATE 0x0124 #define OMAP_HSMMC_HCTL 0x0128 #define OMAP_HSMMC_SYSCTL 0x012C #define OMAP_HSMMC_STAT 0x0130 @@ -91,7 +94,10 @@ #define BCE (1 << 1) #define FOUR_BIT (1 << 1) #define HSPE (1 << 2) +#define IWE (1 << 24) #define DDR (1 << 19) +#define CLKEXTFREE (1 << 16) +#define CTPL (1 << 11) #define DW8 (1 << 5) #define OD 0x1 #define STAT_CLEAR 0xFFFFFFFF @@ -101,11 +107,15 @@ #define SRD (1 << 26) #define SOFTRESET (1 << 1) +/* PSTATE */ +#define DLEV_DAT(x) (1 << (20 + (x))) + /* Interrupt masks for IE and ISE register */ #define CC_EN (1 << 0) #define TC_EN (1 << 1) #define BWR_EN (1 << 4) #define BRR_EN (1 << 5) +#define CIRQ_EN (1 << 8) #define ERR_EN (1 << 15) #define CTO_EN (1 << 16) #define CCRC_EN (1 << 17) @@ -140,7 +150,6 @@ #define VDD_3V0 3000000 /* 300000 uV */ #define VDD_165_195 (ffs(MMC_VDD_165_195) - 1) -#define AUTO_CMD23 (1 << 1) /* Auto CMD23 support */ /* * One controller can have multiple slots, like on some omap boards using * omap.c controller driver. 
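In the mxs-mmc hunk above, get_cd now short-circuits to "card present" whenever the host is marked for polling, rather than trusting a status register that may not reflect reality on such boards. The check in isolation, with invented stand-in flag names (the SSP card-detect bit is active low, hence the negation):

#include <stdbool.h>
#include <stdio.h>

#define CAP_NEEDS_POLL      (1u << 0)   /* stand-in for MMC_CAP_NEEDS_POLL */
#define STATUS_CARD_DETECT  (1u << 2)   /* active-low: set means "no card" */

static bool card_present(unsigned caps, unsigned status)
{
    return (caps & CAP_NEEDS_POLL) || !(status & STATUS_CARD_DETECT);
}

int main(void)
{
    printf("%d\n", card_present(CAP_NEEDS_POLL, STATUS_CARD_DETECT)); /* 1 */
    printf("%d\n", card_present(0, STATUS_CARD_DETECT));              /* 0 */
    return 0;
}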
Luckily this is not currently done on any known @@ -194,6 +203,7 @@ struct omap_hsmmc_host { u32 sysctl; u32 capa; int irq; + int wake_irq; int use_dma, dma_ch; struct dma_chan *tx_chan; struct dma_chan *rx_chan; @@ -206,6 +216,9 @@ struct omap_hsmmc_host { int req_in_progress; unsigned long clk_rate; unsigned int flags; +#define AUTO_CMD23 (1 << 0) /* Auto CMD23 support */ +#define HSMMC_SDIO_IRQ_ENABLED (1 << 1) /* SDIO irq enabled */ +#define HSMMC_WAKE_IRQ_ENABLED (1 << 2) struct omap_hsmmc_next next_data; struct omap_mmc_platform_data *pdata; }; @@ -510,27 +523,40 @@ static void omap_hsmmc_stop_clock(struct omap_hsmmc_host *host) static void omap_hsmmc_enable_irq(struct omap_hsmmc_host *host, struct mmc_command *cmd) { - unsigned int irq_mask; + u32 irq_mask = INT_EN_MASK; + unsigned long flags; if (host->use_dma) - irq_mask = INT_EN_MASK & ~(BRR_EN | BWR_EN); - else - irq_mask = INT_EN_MASK; + irq_mask &= ~(BRR_EN | BWR_EN); /* Disable timeout for erases */ if (cmd->opcode == MMC_ERASE) irq_mask &= ~DTO_EN; + spin_lock_irqsave(&host->irq_lock, flags); OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR); OMAP_HSMMC_WRITE(host->base, ISE, irq_mask); + + /* latch pending CIRQ, but don't signal MMC core */ + if (host->flags & HSMMC_SDIO_IRQ_ENABLED) + irq_mask |= CIRQ_EN; OMAP_HSMMC_WRITE(host->base, IE, irq_mask); + spin_unlock_irqrestore(&host->irq_lock, flags); } static void omap_hsmmc_disable_irq(struct omap_hsmmc_host *host) { - OMAP_HSMMC_WRITE(host->base, ISE, 0); - OMAP_HSMMC_WRITE(host->base, IE, 0); + u32 irq_mask = 0; + unsigned long flags; + + spin_lock_irqsave(&host->irq_lock, flags); + /* no transfer running but need to keep cirq if enabled */ + if (host->flags & HSMMC_SDIO_IRQ_ENABLED) + irq_mask |= CIRQ_EN; + OMAP_HSMMC_WRITE(host->base, ISE, irq_mask); + OMAP_HSMMC_WRITE(host->base, IE, irq_mask); OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR); + spin_unlock_irqrestore(&host->irq_lock, flags); } /* Calculate divisor for the given clock frequency */ @@ -667,6 +693,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host) capa = VS18; } + if (host->mmc->caps & MMC_CAP_SDIO_IRQ) + hctl |= IWE; + OMAP_HSMMC_WRITE(host->base, HCTL, OMAP_HSMMC_READ(host->base, HCTL) | hctl); @@ -681,7 +710,9 @@ static int omap_hsmmc_context_restore(struct omap_hsmmc_host *host) && time_before(jiffies, timeout)) ; - omap_hsmmc_disable_irq(host); + OMAP_HSMMC_WRITE(host->base, ISE, 0); + OMAP_HSMMC_WRITE(host->base, IE, 0); + OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR); /* Do not initialize card-specific things if the power is off */ if (host->power_mode == MMC_POWER_OFF) @@ -1118,8 +1149,12 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id) int status; status = OMAP_HSMMC_READ(host->base, STAT); - while (status & INT_EN_MASK && host->req_in_progress) { - omap_hsmmc_do_irq(host, status); + while (status & (INT_EN_MASK | CIRQ_EN)) { + if (host->req_in_progress) + omap_hsmmc_do_irq(host, status); + + if (status & CIRQ_EN) + mmc_signal_sdio_irq(host->mmc); /* Flush posted write */ status = OMAP_HSMMC_READ(host->base, STAT); @@ -1128,6 +1163,22 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id) return IRQ_HANDLED; } +static irqreturn_t omap_hsmmc_wake_irq(int irq, void *dev_id) +{ + struct omap_hsmmc_host *host = dev_id; + + /* cirq is level triggered, disable to avoid infinite loop */ + spin_lock(&host->irq_lock); + if (host->flags & HSMMC_WAKE_IRQ_ENABLED) { + disable_irq_nosync(host->wake_irq); + host->flags &= ~HSMMC_WAKE_IRQ_ENABLED; + } + 
spin_unlock(&host->irq_lock); + pm_request_resume(host->dev); /* no use counter */ + + return IRQ_HANDLED; +} + static void set_sd_bus_power(struct omap_hsmmc_host *host) { unsigned long i; @@ -1639,6 +1690,103 @@ static void omap_hsmmc_init_card(struct mmc_host *mmc, struct mmc_card *card) mmc_slot(host).init_card(card); } +static void omap_hsmmc_enable_sdio_irq(struct mmc_host *mmc, int enable) +{ + struct omap_hsmmc_host *host = mmc_priv(mmc); + u32 irq_mask, con; + unsigned long flags; + + spin_lock_irqsave(&host->irq_lock, flags); + + con = OMAP_HSMMC_READ(host->base, CON); + irq_mask = OMAP_HSMMC_READ(host->base, ISE); + if (enable) { + host->flags |= HSMMC_SDIO_IRQ_ENABLED; + irq_mask |= CIRQ_EN; + con |= CTPL | CLKEXTFREE; + } else { + host->flags &= ~HSMMC_SDIO_IRQ_ENABLED; + irq_mask &= ~CIRQ_EN; + con &= ~(CTPL | CLKEXTFREE); + } + OMAP_HSMMC_WRITE(host->base, CON, con); + OMAP_HSMMC_WRITE(host->base, IE, irq_mask); + + /* + * if enable, piggy back detection on current request + * but always disable immediately + */ + if (!host->req_in_progress || !enable) + OMAP_HSMMC_WRITE(host->base, ISE, irq_mask); + + /* flush posted write */ + OMAP_HSMMC_READ(host->base, IE); + + spin_unlock_irqrestore(&host->irq_lock, flags); +} + +static int omap_hsmmc_configure_wake_irq(struct omap_hsmmc_host *host) +{ + struct mmc_host *mmc = host->mmc; + int ret; + + /* + * For omaps with wake-up path, wakeirq will be irq from pinctrl and + * for other omaps, wakeirq will be from GPIO (dat line remuxed to + * gpio). wakeirq is needed to detect sdio irq in runtime suspend state + * with functional clock disabled. + */ + if (!host->dev->of_node || !host->wake_irq) + return -ENODEV; + + /* Prevent auto-enabling of IRQ */ + irq_set_status_flags(host->wake_irq, IRQ_NOAUTOEN); + ret = devm_request_irq(host->dev, host->wake_irq, omap_hsmmc_wake_irq, + IRQF_TRIGGER_LOW | IRQF_ONESHOT, + mmc_hostname(mmc), host); + if (ret) { + dev_err(mmc_dev(host->mmc), "Unable to request wake IRQ\n"); + goto err; + } + + /* + * Some omaps don't have wake-up path from deeper idle states + * and need to remux SDIO DAT1 to GPIO for wake-up from idle. 
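The wake IRQ handling above keeps enable_irq() and disable_irq_nosync() balanced by tracking an "armed" flag under the host lock, since the line is level triggered and an unbalanced enable or disable would skew the IRQ depth count. A tiny model of that bookkeeping (struct wake_irq and the helpers are invented for illustration; locking is omitted):

#include <stdbool.h>
#include <stdio.h>

struct wake_irq {
    bool armed;         /* HSMMC_WAKE_IRQ_ENABLED in the driver */
};

static void arm_wake_irq(struct wake_irq *w)
{
    if (!w->armed) {
        w->armed = true;
        puts("enable_irq(wake_irq)");
    }
}

static void disarm_wake_irq(struct wake_irq *w)
{
    if (w->armed) {
        w->armed = false;
        puts("disable_irq_nosync(wake_irq)");
    }
}

int main(void)
{
    struct wake_irq w = { false };

    arm_wake_irq(&w);       /* runtime suspend arms the wake path */
    disarm_wake_irq(&w);    /* the IRQ fires, or runtime resume runs */
    disarm_wake_irq(&w);    /* harmless: already disarmed */
    return 0;
}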
+ */ + if (host->pdata->controller_flags & OMAP_HSMMC_SWAKEUP_MISSING) { + struct pinctrl *p = devm_pinctrl_get(host->dev); + if (!p) { + ret = -ENODEV; + goto err_free_irq; + } + if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_DEFAULT))) { + dev_info(host->dev, "missing default pinctrl state\n"); + devm_pinctrl_put(p); + ret = -EINVAL; + goto err_free_irq; + } + + if (IS_ERR(pinctrl_lookup_state(p, PINCTRL_STATE_IDLE))) { + dev_info(host->dev, "missing idle pinctrl state\n"); + devm_pinctrl_put(p); + ret = -EINVAL; + goto err_free_irq; + } + devm_pinctrl_put(p); + } + + OMAP_HSMMC_WRITE(host->base, HCTL, + OMAP_HSMMC_READ(host->base, HCTL) | IWE); + return 0; + +err_free_irq: + devm_free_irq(host->dev, host->wake_irq, host); +err: + dev_warn(host->dev, "no SDIO IRQ support, falling back to polling\n"); + host->wake_irq = 0; + return ret; +} + static void omap_hsmmc_conf_bus_power(struct omap_hsmmc_host *host) { u32 hctl, capa, value; @@ -1691,7 +1839,7 @@ static const struct mmc_host_ops omap_hsmmc_ops = { .get_cd = omap_hsmmc_get_cd, .get_ro = omap_hsmmc_get_ro, .init_card = omap_hsmmc_init_card, - /* NYET -- enable_sdio_irq */ + .enable_sdio_irq = omap_hsmmc_enable_sdio_irq, }; #ifdef CONFIG_DEBUG_FS @@ -1701,13 +1849,23 @@ static int omap_hsmmc_regs_show(struct seq_file *s, void *data) struct mmc_host *mmc = s->private; struct omap_hsmmc_host *host = mmc_priv(mmc); - seq_printf(s, "mmc%d:\n ctx_loss:\t%d\n\nregs:\n", - mmc->index, host->context_loss); + seq_printf(s, "mmc%d:\n", mmc->index); + seq_printf(s, "sdio irq mode\t%s\n", + (mmc->caps & MMC_CAP_SDIO_IRQ) ? "interrupt" : "polling"); - pm_runtime_get_sync(host->dev); + if (mmc->caps & MMC_CAP_SDIO_IRQ) { + seq_printf(s, "sdio irq \t%s\n", + (host->flags & HSMMC_SDIO_IRQ_ENABLED) ? "enabled" + : "disabled"); + } + seq_printf(s, "ctx_loss:\t%d\n", host->context_loss); + pm_runtime_get_sync(host->dev); + seq_puts(s, "\nregs:\n"); seq_printf(s, "CON:\t\t0x%08x\n", OMAP_HSMMC_READ(host->base, CON)); + seq_printf(s, "PSTATE:\t\t0x%08x\n", + OMAP_HSMMC_READ(host->base, PSTATE)); seq_printf(s, "HCTL:\t\t0x%08x\n", OMAP_HSMMC_READ(host->base, HCTL)); seq_printf(s, "SYSCTL:\t\t0x%08x\n", @@ -1761,6 +1919,10 @@ static const struct omap_mmc_of_data omap3_pre_es3_mmc_of_data = { static const struct omap_mmc_of_data omap4_mmc_of_data = { .reg_offset = 0x100, }; +static const struct omap_mmc_of_data am33xx_mmc_of_data = { + .reg_offset = 0x100, + .controller_flags = OMAP_HSMMC_SWAKEUP_MISSING, +}; static const struct of_device_id omap_mmc_of_match[] = { { @@ -1777,6 +1939,10 @@ static const struct of_device_id omap_mmc_of_match[] = { .compatible = "ti,omap4-hsmmc", .data = &omap4_mmc_of_data, }, + { + .compatible = "ti,am33xx-hsmmc", + .data = &am33xx_mmc_of_data, + }, {}, }; MODULE_DEVICE_TABLE(of, omap_mmc_of_match); @@ -1850,7 +2016,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev) const struct of_device_id *match; dma_cap_mask_t mask; unsigned tx_req, rx_req; - struct pinctrl *pinctrl; const struct omap_mmc_of_data *data; void __iomem *base; @@ -1913,6 +2078,9 @@ static int omap_hsmmc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, host); + if (pdev->dev.of_node) + host->wake_irq = irq_of_parse_and_map(pdev->dev.of_node, 1); + mmc->ops = &omap_hsmmc_ops; mmc->f_min = OMAP_MMC_MIN_CLOCK; @@ -2061,10 +2229,17 @@ static int omap_hsmmc_probe(struct platform_device *pdev) omap_hsmmc_disable_irq(host); - pinctrl = devm_pinctrl_get_select_default(&pdev->dev); - if (IS_ERR(pinctrl)) - dev_warn(&pdev->dev, - "pins are 
not configured from the driver\n"); + /* + * For now, only support SDIO interrupt if we have a separate + * wake-up interrupt configured from device tree. This is because + * the wake-up interrupt is needed for idle state and some + * platforms need special quirks. And we don't want to add new + * legacy mux platform init code callbacks any longer as we + * are moving to DT based booting anyways. + */ + ret = omap_hsmmc_configure_wake_irq(host); + if (!ret) + mmc->caps |= MMC_CAP_SDIO_IRQ; omap_hsmmc_protect_card(host); @@ -2170,11 +2345,18 @@ static int omap_hsmmc_suspend(struct device *dev) pm_runtime_get_sync(host->dev); if (!(host->mmc->pm_flags & MMC_PM_KEEP_POWER)) { - omap_hsmmc_disable_irq(host); + OMAP_HSMMC_WRITE(host->base, ISE, 0); + OMAP_HSMMC_WRITE(host->base, IE, 0); + OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR); OMAP_HSMMC_WRITE(host->base, HCTL, OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP); } + /* do not wake up due to sdio irq */ + if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) && + !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ)) + disable_irq(host->wake_irq); + if (host->dbclk) clk_disable_unprepare(host->dbclk); @@ -2200,6 +2382,10 @@ static int omap_hsmmc_resume(struct device *dev) omap_hsmmc_protect_card(host); + if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) && + !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ)) + enable_irq(host->wake_irq); + pm_runtime_mark_last_busy(host->dev); pm_runtime_put_autosuspend(host->dev); return 0; @@ -2215,22 +2401,77 @@ static int omap_hsmmc_resume(struct device *dev) static int omap_hsmmc_runtime_suspend(struct device *dev) { struct omap_hsmmc_host *host; + unsigned long flags; + int ret = 0; host = platform_get_drvdata(to_platform_device(dev)); omap_hsmmc_context_save(host); dev_dbg(dev, "disabled\n"); - return 0; + spin_lock_irqsave(&host->irq_lock, flags); + if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) && + (host->flags & HSMMC_SDIO_IRQ_ENABLED)) { + /* disable sdio irq handling to prevent race */ + OMAP_HSMMC_WRITE(host->base, ISE, 0); + OMAP_HSMMC_WRITE(host->base, IE, 0); + + if (!(OMAP_HSMMC_READ(host->base, PSTATE) & DLEV_DAT(1))) { + /* + * dat1 line low, pending sdio irq + * race condition: possible irq handler running on + * multi-core, abort + */ + dev_dbg(dev, "pending sdio irq, abort suspend\n"); + OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR); + OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN); + OMAP_HSMMC_WRITE(host->base, IE, CIRQ_EN); + pm_runtime_mark_last_busy(dev); + ret = -EBUSY; + goto abort; + } + + pinctrl_pm_select_idle_state(dev); + + WARN_ON(host->flags & HSMMC_WAKE_IRQ_ENABLED); + enable_irq(host->wake_irq); + host->flags |= HSMMC_WAKE_IRQ_ENABLED; + } else { + pinctrl_pm_select_idle_state(dev); + } + +abort: + spin_unlock_irqrestore(&host->irq_lock, flags); + return ret; } static int omap_hsmmc_runtime_resume(struct device *dev) { struct omap_hsmmc_host *host; + unsigned long flags; host = platform_get_drvdata(to_platform_device(dev)); omap_hsmmc_context_restore(host); dev_dbg(dev, "enabled\n"); + spin_lock_irqsave(&host->irq_lock, flags); + if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) && + (host->flags & HSMMC_SDIO_IRQ_ENABLED)) { + /* sdio irq flag can't change while in runtime suspend */ + if (host->flags & HSMMC_WAKE_IRQ_ENABLED) { + disable_irq_nosync(host->wake_irq); + host->flags &= ~HSMMC_WAKE_IRQ_ENABLED; + } + + pinctrl_pm_select_default_state(host->dev); + + /* irq lost, if pinmux incorrect */ + OMAP_HSMMC_WRITE(host->base, STAT, STAT_CLEAR); + OMAP_HSMMC_WRITE(host->base, ISE, CIRQ_EN); + OMAP_HSMMC_WRITE(host->base, 
IE, CIRQ_EN); + } else { + pinctrl_pm_select_default_state(host->dev); + } + spin_unlock_irqrestore(&host->irq_lock, flags); return 0; } diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index f23782683a7..e5516a22636 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -12,6 +12,7 @@ */ #include <linux/module.h> +#include <linux/dmaengine.h> #include <linux/dma-mapping.h> #include <linux/clk.h> #include <linux/mmc/host.h> @@ -27,6 +28,7 @@ #include <mach/dma.h> #include <mach/gpio-samsung.h> +#include <linux/platform_data/dma-s3c24xx.h> #include <linux/platform_data/mmc-s3cmci.h> #include "s3cmci.h" @@ -140,10 +142,6 @@ static const int dbgmap_debug = dbg_err | dbg_debug; dev_dbg(&host->pdev->dev, args); \ } while (0) -static struct s3c2410_dma_client s3cmci_dma_client = { - .name = "s3c-mci", -}; - static void finalize_request(struct s3cmci_host *host); static void s3cmci_send_request(struct mmc_host *mmc); static void s3cmci_reset(struct s3cmci_host *host); @@ -256,25 +254,8 @@ static inline bool s3cmci_host_usedma(struct s3cmci_host *host) { #ifdef CONFIG_MMC_S3C_PIO return false; -#elif defined(CONFIG_MMC_S3C_DMA) +#else /* CONFIG_MMC_S3C_DMA */ return true; -#else - return host->dodma; -#endif -} - -/** - * s3cmci_host_canpio - return true if host has pio code available - * - * Return true if the driver has been compiled with the PIO support code - * available. - */ -static inline bool s3cmci_host_canpio(void) -{ -#ifdef CONFIG_MMC_S3C_PIO - return true; -#else - return false; #endif } @@ -841,60 +822,24 @@ static irqreturn_t s3cmci_irq_cd(int irq, void *dev_id) return IRQ_HANDLED; } -static void s3cmci_dma_done_callback(struct s3c2410_dma_chan *dma_ch, - void *buf_id, int size, - enum s3c2410_dma_buffresult result) +static void s3cmci_dma_done_callback(void *arg) { - struct s3cmci_host *host = buf_id; + struct s3cmci_host *host = arg; unsigned long iflags; - u32 mci_csta, mci_dsta, mci_fsta, mci_dcnt; - - mci_csta = readl(host->base + S3C2410_SDICMDSTAT); - mci_dsta = readl(host->base + S3C2410_SDIDSTA); - mci_fsta = readl(host->base + S3C2410_SDIFSTA); - mci_dcnt = readl(host->base + S3C2410_SDIDCNT); BUG_ON(!host->mrq); BUG_ON(!host->mrq->data); - BUG_ON(!host->dmatogo); spin_lock_irqsave(&host->complete_lock, iflags); - if (result != S3C2410_RES_OK) { - dbg(host, dbg_fail, "DMA FAILED: csta=0x%08x dsta=0x%08x " - "fsta=0x%08x dcnt:0x%08x result:0x%08x toGo:%u\n", - mci_csta, mci_dsta, mci_fsta, - mci_dcnt, result, host->dmatogo); - - goto fail_request; - } - - host->dmatogo--; - if (host->dmatogo) { - dbg(host, dbg_dma, "DMA DONE Size:%i DSTA:[%08x] " - "DCNT:[%08x] toGo:%u\n", - size, mci_dsta, mci_dcnt, host->dmatogo); - - goto out; - } - - dbg(host, dbg_dma, "DMA FINISHED Size:%i DSTA:%08x DCNT:%08x\n", - size, mci_dsta, mci_dcnt); + dbg(host, dbg_dma, "DMA FINISHED\n"); host->dma_complete = 1; host->complete_what = COMPLETION_FINALIZE; -out: tasklet_schedule(&host->pio_tasklet); spin_unlock_irqrestore(&host->complete_lock, iflags); - return; -fail_request: - host->mrq->data->error = -EINVAL; - host->complete_what = COMPLETION_FINALIZE; - clear_imask(host); - - goto out; } static void finalize_request(struct s3cmci_host *host) @@ -966,7 +911,7 @@ static void finalize_request(struct s3cmci_host *host) * DMA channel and the fifo to clear out any garbage. 
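
Aside: the rewritten s3cmci_dma_done_callback() above is the shape every dmaengine completion handler takes: the engine may invoke it from IRQ/tasklet context, so it only records state under a lock and defers the real work. A minimal sketch of that pattern (struct my_host and its fields are illustrative, not from the driver):

#include <linux/dmaengine.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>

struct my_host {
        spinlock_t lock;
        bool dma_done;
        struct tasklet_struct finish_tasklet;
};

/* Completion callback: runs in atomic context, so keep it short. */
static void my_dma_done(void *arg)
{
        struct my_host *host = arg;
        unsigned long flags;

        spin_lock_irqsave(&host->lock, flags);
        host->dma_done = true;                   /* record progress only */
        tasklet_schedule(&host->finish_tasklet); /* defer the heavy lifting */
        spin_unlock_irqrestore(&host->lock, flags);
}
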
*/ if (mrq->data->error != 0) { if (s3cmci_host_usedma(host)) - s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH); + dmaengine_terminate_all(host->dma); if (host->is2440) { /* Clear failure register and reset fifo. */ @@ -992,29 +937,6 @@ request_done: mmc_request_done(host->mmc, mrq); } -static void s3cmci_dma_setup(struct s3cmci_host *host, - enum dma_data_direction source) -{ - static enum dma_data_direction last_source = -1; - static int setup_ok; - - if (last_source == source) - return; - - last_source = source; - - s3c2410_dma_devconfig(host->dma, source, - host->mem->start + host->sdidata); - - if (!setup_ok) { - s3c2410_dma_config(host->dma, 4); - s3c2410_dma_set_buffdone_fn(host->dma, - s3cmci_dma_done_callback); - s3c2410_dma_setflags(host->dma, S3C2410_DMAF_AUTOSTART); - setup_ok = 1; - } -} - static void s3cmci_send_command(struct s3cmci_host *host, struct mmc_command *cmd) { @@ -1162,43 +1084,45 @@ static int s3cmci_prepare_pio(struct s3cmci_host *host, struct mmc_data *data) static int s3cmci_prepare_dma(struct s3cmci_host *host, struct mmc_data *data) { - int dma_len, i; int rw = data->flags & MMC_DATA_WRITE; + struct dma_async_tx_descriptor *desc; + struct dma_slave_config conf = { + .src_addr = host->mem->start + host->sdidata, + .dst_addr = host->mem->start + host->sdidata, + .src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES, + .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES, + }; BUG_ON((data->flags & BOTH_DIR) == BOTH_DIR); - s3cmci_dma_setup(host, rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH); - - dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len, - rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - - if (dma_len == 0) - return -ENOMEM; - - host->dma_complete = 0; - host->dmatogo = dma_len; - - for (i = 0; i < dma_len; i++) { - int res; - - dbg(host, dbg_dma, "enqueue %i: %08x@%u\n", i, - sg_dma_address(&data->sg[i]), - sg_dma_len(&data->sg[i])); + /* Restore prescaler value */ + writel(host->prescaler, host->base + S3C2410_SDIPRE); - res = s3c2410_dma_enqueue(host->dma, host, - sg_dma_address(&data->sg[i]), - sg_dma_len(&data->sg[i])); + if (!rw) + conf.direction = DMA_DEV_TO_MEM; + else + conf.direction = DMA_MEM_TO_DEV; - if (res) { - s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_FLUSH); - return -EBUSY; - } - } + dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len, + rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - s3c2410_dma_ctrl(host->dma, S3C2410_DMAOP_START); + dmaengine_slave_config(host->dma, &conf); + desc = dmaengine_prep_slave_sg(host->dma, data->sg, data->sg_len, + conf.direction, + DMA_CTRL_ACK | DMA_PREP_INTERRUPT); + if (!desc) + goto unmap_exit; + desc->callback = s3cmci_dma_done_callback; + desc->callback_param = host; + dmaengine_submit(desc); + dma_async_issue_pending(host->dma); return 0; + +unmap_exit: + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, + rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + return -ENOMEM; } static void s3cmci_send_request(struct mmc_host *mmc) @@ -1676,10 +1600,6 @@ static int s3cmci_probe(struct platform_device *pdev) host->complete_what = COMPLETION_NONE; host->pio_active = XFER_NONE; -#ifdef CONFIG_MMC_S3C_PIODMA - host->dodma = host->pdata->use_dma; -#endif - host->mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!host->mem) { dev_err(&pdev->dev, @@ -1765,17 +1685,17 @@ static int s3cmci_probe(struct platform_device *pdev) /* depending on the dma state, get a dma channel to use. 
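
The new s3cmci_prepare_dma() above follows the canonical dmaengine slave sequence: configure the channel for the device FIFO, map the scatterlist, prepare a descriptor with a completion callback, submit it, and kick the engine. A hedged, generic sketch of that sequence (my_start_dma and the 4-byte FIFO width are illustrative; my_dma_done is the callback sketched earlier):

#include <linux/dmaengine.h>
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int my_start_dma(struct dma_chan *chan, struct device *dev,
                        struct scatterlist *sg, unsigned int sg_len,
                        dma_addr_t fifo, bool write, void *host)
{
        struct dma_slave_config cfg = {
                .direction      = write ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM,
                .src_addr       = fifo,
                .dst_addr       = fifo,
                .src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
                .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
        };
        enum dma_data_direction dir = write ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
        struct dma_async_tx_descriptor *desc;

        if (!dma_map_sg(dev, sg, sg_len, dir))
                return -ENOMEM;

        if (dmaengine_slave_config(chan, &cfg))
                goto unmap;

        desc = dmaengine_prep_slave_sg(chan, sg, sg_len, cfg.direction,
                                       DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc)
                goto unmap;

        desc->callback = my_dma_done;   /* completion handler, see above */
        desc->callback_param = host;
        dmaengine_submit(desc);
        dma_async_issue_pending(chan);  /* nothing moves until this */
        return 0;

unmap:
        dma_unmap_sg(dev, sg, sg_len, dir);
        return -ENOMEM;
}
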
*/ if (s3cmci_host_usedma(host)) { - host->dma = s3c2410_dma_request(DMACH_SDI, &s3cmci_dma_client, - host); - if (host->dma < 0) { + dma_cap_mask_t mask; + + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + + host->dma = dma_request_slave_channel_compat(mask, + s3c24xx_dma_filter, (void *)DMACH_SDI, &pdev->dev, "rx-tx"); + if (!host->dma) { dev_err(&pdev->dev, "cannot get DMA channel.\n"); - if (!s3cmci_host_canpio()) { - ret = -EBUSY; - goto probe_free_gpio_wp; - } else { - dev_warn(&pdev->dev, "falling back to PIO.\n"); - host->dodma = 0; - } + ret = -EBUSY; + goto probe_free_gpio_wp; } } @@ -1787,7 +1707,7 @@ static int s3cmci_probe(struct platform_device *pdev) goto probe_free_dma; } - ret = clk_enable(host->clk); + ret = clk_prepare_enable(host->clk); if (ret) { dev_err(&pdev->dev, "failed to enable clock source.\n"); goto clk_free; @@ -1816,7 +1736,7 @@ static int s3cmci_probe(struct platform_device *pdev) mmc->max_segs = 128; dbg(host, dbg_debug, - "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%u.\n", + "probe: mode:%s mapped mci_base:%p irq:%u irq_cd:%u dma:%p.\n", (host->is2440?"2440":""), host->base, host->irq, host->irq_cd, host->dma); @@ -1845,14 +1765,14 @@ static int s3cmci_probe(struct platform_device *pdev) s3cmci_cpufreq_deregister(host); free_dmabuf: - clk_disable(host->clk); + clk_disable_unprepare(host->clk); clk_free: clk_put(host->clk); probe_free_dma: if (s3cmci_host_usedma(host)) - s3c2410_dma_free(host->dma, &s3cmci_dma_client); + dma_release_channel(host->dma); probe_free_gpio_wp: if (!host->pdata->no_wprotect) @@ -1897,7 +1817,7 @@ static void s3cmci_shutdown(struct platform_device *pdev) s3cmci_debugfs_remove(host); s3cmci_cpufreq_deregister(host); mmc_remove_host(mmc); - clk_disable(host->clk); + clk_disable_unprepare(host->clk); } static int s3cmci_remove(struct platform_device *pdev) @@ -1914,7 +1834,7 @@ static int s3cmci_remove(struct platform_device *pdev) tasklet_disable(&host->pio_tasklet); if (s3cmci_host_usedma(host)) - s3c2410_dma_free(host->dma, &s3cmci_dma_client); + dma_release_channel(host->dma); free_irq(host->irq, host); diff --git a/drivers/mmc/host/s3cmci.h b/drivers/mmc/host/s3cmci.h index c76b53dbeb6..cc2e46cb5c6 100644 --- a/drivers/mmc/host/s3cmci.h +++ b/drivers/mmc/host/s3cmci.h @@ -26,7 +26,7 @@ struct s3cmci_host { void __iomem *base; int irq; int irq_cd; - int dma; + struct dma_chan *dma; unsigned long clk_rate; unsigned long clk_div; @@ -36,8 +36,6 @@ struct s3cmci_host { int is2440; unsigned sdiimsk; unsigned sdidata; - int dodma; - int dmatogo; bool irq_disabled; bool irq_enabled; diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 8ce3c28cb76..8c5337002c5 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -124,9 +124,11 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = { static const struct sdhci_acpi_slot sdhci_acpi_slot_int_emmc = { .chip = &sdhci_acpi_chip_int, - .caps = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE | MMC_CAP_HW_RESET, + .caps = MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE | + MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR, .caps2 = MMC_CAP2_HC_ERASE_SZ, .flags = SDHCI_ACPI_RUNTIME_PM, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, }; static const struct sdhci_acpi_slot sdhci_acpi_slot_int_sdio = { diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 40573a58486..1a6661ed620 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include 
<linux/of_device.h> -#include <linux/regulator/consumer.h> #include <linux/delay.h> #include <linux/mmc/mmc.h> #include <linux/slab.h> diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index 52c42fcc284..c3a1debc928 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -103,6 +103,10 @@ static const struct sdhci_pci_fixes sdhci_cafe = { SDHCI_QUIRK_BROKEN_TIMEOUT_VAL, }; +static const struct sdhci_pci_fixes sdhci_intel_qrk = { + .quirks = SDHCI_QUIRK_NO_HISPD_BIT, +}; + static int mrst_hc_probe_slot(struct sdhci_pci_slot *slot) { slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA; @@ -264,7 +268,7 @@ static void sdhci_pci_int_hw_reset(struct sdhci_host *host) static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot) { slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE | - MMC_CAP_HW_RESET; + MMC_CAP_HW_RESET | MMC_CAP_1_8V_DDR; slot->host->mmc->caps2 |= MMC_CAP2_HC_ERASE_SZ; slot->hw_reset = sdhci_pci_int_hw_reset; return 0; @@ -279,6 +283,7 @@ static int byt_sdio_probe_slot(struct sdhci_pci_slot *slot) static const struct sdhci_pci_fixes sdhci_intel_byt_emmc = { .allow_runtime_pm = true, .probe_slot = byt_emmc_probe_slot, + .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN, }; static const struct sdhci_pci_fixes sdhci_intel_byt_sdio = { @@ -753,6 +758,14 @@ static const struct pci_device_id pci_ids[] = { { .vendor = PCI_VENDOR_ID_INTEL, + .device = PCI_DEVICE_ID_INTEL_QRK_SD, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (kernel_ulong_t)&sdhci_intel_qrk, + }, + + { + .vendor = PCI_VENDOR_ID_INTEL, .device = PCI_DEVICE_ID_INTEL_MRST_SD0, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, @@ -1130,18 +1143,13 @@ static int sdhci_pci_suspend(struct device *dev) goto err_pci_suspend; } - pci_save_state(pdev); if (pm_flags & MMC_PM_KEEP_POWER) { - if (pm_flags & MMC_PM_WAKE_SDIO_IRQ) { - pci_pme_active(pdev, true); - pci_enable_wake(pdev, PCI_D3hot, 1); - } - pci_set_power_state(pdev, PCI_D3hot); - } else { - pci_enable_wake(pdev, PCI_D3hot, 0); - pci_disable_device(pdev); - pci_set_power_state(pdev, PCI_D3hot); - } + if (pm_flags & MMC_PM_WAKE_SDIO_IRQ) + device_init_wakeup(dev, true); + else + device_init_wakeup(dev, false); + } else + device_init_wakeup(dev, false); return 0; @@ -1162,12 +1170,6 @@ static int sdhci_pci_resume(struct device *dev) if (!chip) return 0; - pci_set_power_state(pdev, PCI_D0); - pci_restore_state(pdev); - ret = pci_enable_device(pdev); - if (ret) - return ret; - if (chip->fixes && chip->fixes->resume) { ret = chip->fixes->resume(chip); if (ret) diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h index 6d718719659..c101477ef3b 100644 --- a/drivers/mmc/host/sdhci-pci.h +++ b/drivers/mmc/host/sdhci-pci.h @@ -17,6 +17,7 @@ #define PCI_DEVICE_ID_INTEL_CLV_SDIO2 0x08fb #define PCI_DEVICE_ID_INTEL_CLV_EMMC0 0x08e5 #define PCI_DEVICE_ID_INTEL_CLV_EMMC1 0x08e6 +#define PCI_DEVICE_ID_INTEL_QRK_SD 0x08A7 /* * PCI registers diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index f4f12894756..6f842fb8e6b 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -288,15 +288,13 @@ static int sdhci_pxav3_probe(struct platform_device *pdev) int ret; struct clk *clk; - pxa = kzalloc(sizeof(struct sdhci_pxa), GFP_KERNEL); + pxa = devm_kzalloc(&pdev->dev, sizeof(struct sdhci_pxa), GFP_KERNEL); if (!pxa) return -ENOMEM; host = sdhci_pltfm_init(pdev, &sdhci_pxav3_pdata, 0); - if (IS_ERR(host)) { - kfree(pxa); + if (IS_ERR(host)) return 
PTR_ERR(host); - } if (of_device_is_compatible(np, "marvell,armada-380-sdhci")) { ret = mv_conf_mbus_windows(pdev, mv_mbus_dram_info()); @@ -308,7 +306,7 @@ static int sdhci_pxav3_probe(struct platform_device *pdev) pltfm_host = sdhci_priv(host); pltfm_host->priv = pxa; - clk = clk_get(dev, NULL); + clk = devm_clk_get(dev, NULL); if (IS_ERR(clk)) { dev_err(dev, "failed to get io clock\n"); ret = PTR_ERR(clk); @@ -389,11 +387,9 @@ err_add_host: pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); clk_disable_unprepare(clk); - clk_put(clk); err_clk_get: err_mbus_win: sdhci_pltfm_free(pdev); - kfree(pxa); return ret; } @@ -401,17 +397,14 @@ static int sdhci_pxav3_remove(struct platform_device *pdev) { struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct sdhci_pxa *pxa = pltfm_host->priv; pm_runtime_get_sync(&pdev->dev); sdhci_remove_host(host, 1); pm_runtime_disable(&pdev->dev); clk_disable_unprepare(pltfm_host->clk); - clk_put(pltfm_host->clk); sdhci_pltfm_free(pdev); - kfree(pxa); return 0; } diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c new file mode 100644 index 00000000000..328f348c724 --- /dev/null +++ b/drivers/mmc/host/sdhci-st.c @@ -0,0 +1,176 @@ +/* + * Support for SDHCI on STMicroelectronics SoCs + * + * Copyright (C) 2014 STMicroelectronics Ltd + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * Contributors: Peter Griffin <peter.griffin@linaro.org> + * + * Based on sdhci-cns3xxx.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
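
Worth noting: the sdhci-pxav3 conversion above works because devm_* resources are tied to the device's lifetime, so the explicit kfree()/clk_put() calls can disappear from every error path and from remove(). A small sketch of the pattern (my_probe and struct my_priv are illustrative):

#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/slab.h>

struct my_priv {
        struct clk *clk;
};

static int my_probe(struct device *dev)
{
        struct my_priv *priv;

        /* Freed automatically if probe fails or the device is unbound. */
        priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;

        /* The matching clk_put() likewise becomes unnecessary. */
        priv->clk = devm_clk_get(dev, NULL);
        if (IS_ERR(priv->clk))
                return PTR_ERR(priv->clk);

        dev_set_drvdata(dev, priv);
        return 0;
}
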
+ * + */ + +#include <linux/io.h> +#include <linux/of.h> +#include <linux/module.h> +#include <linux/err.h> +#include <linux/mmc/host.h> + +#include "sdhci-pltfm.h" + +static u32 sdhci_st_readl(struct sdhci_host *host, int reg) +{ + u32 ret; + + switch (reg) { + case SDHCI_CAPABILITIES: + ret = readl_relaxed(host->ioaddr + reg); + /* Support 3.3V and 1.8V */ + ret &= ~SDHCI_CAN_VDD_300; + break; + default: + ret = readl_relaxed(host->ioaddr + reg); + } + return ret; +} + +static const struct sdhci_ops sdhci_st_ops = { + .get_max_clock = sdhci_pltfm_clk_get_max_clock, + .set_clock = sdhci_set_clock, + .set_bus_width = sdhci_set_bus_width, + .read_l = sdhci_st_readl, + .reset = sdhci_reset, +}; + +static const struct sdhci_pltfm_data sdhci_st_pdata = { + .ops = &sdhci_st_ops, + .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC | + SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN, +}; + + +static int sdhci_st_probe(struct platform_device *pdev) +{ + struct sdhci_host *host; + struct sdhci_pltfm_host *pltfm_host; + struct clk *clk; + int ret = 0; + u16 host_version; + + clk = devm_clk_get(&pdev->dev, "mmc"); + if (IS_ERR(clk)) { + dev_err(&pdev->dev, "Peripheral clk not found\n"); + return PTR_ERR(clk); + } + + host = sdhci_pltfm_init(pdev, &sdhci_st_pdata, 0); + if (IS_ERR(host)) { + dev_err(&pdev->dev, "Failed sdhci_pltfm_init\n"); + return PTR_ERR(host); + } + + ret = mmc_of_parse(host->mmc); + + if (ret) { + dev_err(&pdev->dev, "Failed mmc_of_parse\n"); + return ret; + } + + clk_prepare_enable(clk); + + pltfm_host = sdhci_priv(host); + pltfm_host->clk = clk; + + ret = sdhci_add_host(host); + if (ret) { + dev_err(&pdev->dev, "Failed sdhci_add_host\n"); + goto err_out; + } + + platform_set_drvdata(pdev, host); + + host_version = readw_relaxed((host->ioaddr + SDHCI_HOST_VERSION)); + + dev_info(&pdev->dev, "SDHCI ST Initialised: Host Version: 0x%x Vendor Version 0x%x\n", + ((host_version & SDHCI_SPEC_VER_MASK) >> SDHCI_SPEC_VER_SHIFT), + ((host_version & SDHCI_VENDOR_VER_MASK) >> + SDHCI_VENDOR_VER_SHIFT)); + + return 0; + +err_out: + clk_disable_unprepare(clk); + sdhci_pltfm_free(pdev); + + return ret; +} + +static int sdhci_st_remove(struct platform_device *pdev) +{ + struct sdhci_host *host = platform_get_drvdata(pdev); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + + clk_disable_unprepare(pltfm_host->clk); + + return sdhci_pltfm_unregister(pdev); +} + +#ifdef CONFIG_PM_SLEEP +static int sdhci_st_suspend(struct device *dev) +{ + struct sdhci_host *host = dev_get_drvdata(dev); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + int ret = sdhci_suspend_host(host); + + if (ret) + goto out; + + clk_disable_unprepare(pltfm_host->clk); +out: + return ret; +} + +static int sdhci_st_resume(struct device *dev) +{ + struct sdhci_host *host = dev_get_drvdata(dev); + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + + clk_prepare_enable(pltfm_host->clk); + + return sdhci_resume_host(host); +} +#endif + +static SIMPLE_DEV_PM_OPS(sdhci_st_pmops, sdhci_st_suspend, sdhci_st_resume); + +static const struct of_device_id st_sdhci_match[] = { + { .compatible = "st,sdhci" }, + {}, +}; + +MODULE_DEVICE_TABLE(of, st_sdhci_match); + +static struct platform_driver sdhci_st_driver = { + .probe = sdhci_st_probe, + .remove = sdhci_st_remove, + .driver = { + .name = "sdhci-st", + .pm = &sdhci_st_pmops, + .of_match_table = of_match_ptr(st_sdhci_match), + }, +}; + +module_platform_driver(sdhci_st_driver); + +MODULE_DESCRIPTION("SDHCI driver for STMicroelectronics SoCs"); +MODULE_AUTHOR("Giuseppe Cavallaro 
<peppe.cavallaro@st.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:st-sdhci"); diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index d93a063a36f..33100d10d17 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -26,8 +26,6 @@ #include <linux/mmc/host.h> #include <linux/mmc/slot-gpio.h> -#include <asm/gpio.h> - #include "sdhci-pltfm.h" /* Tegra SDHOST controller vendor register definitions */ diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 47055f3f01b..37b2a9ae52e 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1223,8 +1223,16 @@ EXPORT_SYMBOL_GPL(sdhci_set_clock); static void sdhci_set_power(struct sdhci_host *host, unsigned char mode, unsigned short vdd) { + struct mmc_host *mmc = host->mmc; u8 pwr = 0; + if (!IS_ERR(mmc->supply.vmmc)) { + spin_unlock_irq(&host->lock); + mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, vdd); + spin_lock_irq(&host->lock); + return; + } + if (mode != MMC_POWER_OFF) { switch (1 << vdd) { case MMC_VDD_165_195: @@ -1283,12 +1291,6 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned char mode, if (host->quirks & SDHCI_QUIRK_DELAY_AFTER_POWER) mdelay(10); } - - if (host->vmmc) { - spin_unlock_irq(&host->lock); - mmc_regulator_set_ocr(host->mmc, host->vmmc, vdd); - spin_lock_irq(&host->lock); - } } /*****************************************************************************\ @@ -1440,13 +1442,15 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) { unsigned long flags; u8 ctrl; + struct mmc_host *mmc = host->mmc; spin_lock_irqsave(&host->lock, flags); if (host->flags & SDHCI_DEVICE_DEAD) { spin_unlock_irqrestore(&host->lock, flags); - if (host->vmmc && ios->power_mode == MMC_POWER_OFF) - mmc_regulator_set_ocr(host->mmc, host->vmmc, 0); + if (!IS_ERR(mmc->supply.vmmc) && + ios->power_mode == MMC_POWER_OFF) + mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0); return; } @@ -1530,7 +1534,6 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios) host->ops->set_clock(host, host->clock); } - /* Reset SD Clock Enable */ clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); clk &= ~SDHCI_CLOCK_CARD_EN; @@ -1707,6 +1710,7 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host, struct mmc_ios *ios) { + struct mmc_host *mmc = host->mmc; u16 ctrl; int ret; @@ -1725,11 +1729,12 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host, ctrl &= ~SDHCI_CTRL_VDD_180; sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); - if (host->vqmmc) { - ret = regulator_set_voltage(host->vqmmc, 2700000, 3600000); + if (!IS_ERR(mmc->supply.vqmmc)) { + ret = regulator_set_voltage(mmc->supply.vqmmc, 2700000, + 3600000); if (ret) { pr_warning("%s: Switching to 3.3V signalling voltage " - " failed\n", mmc_hostname(host->mmc)); + " failed\n", mmc_hostname(mmc)); return -EIO; } } @@ -1742,16 +1747,16 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host, return 0; pr_warning("%s: 3.3V regulator output did not became stable\n", - mmc_hostname(host->mmc)); + mmc_hostname(mmc)); return -EAGAIN; case MMC_SIGNAL_VOLTAGE_180: - if (host->vqmmc) { - ret = regulator_set_voltage(host->vqmmc, + if (!IS_ERR(mmc->supply.vqmmc)) { + ret = regulator_set_voltage(mmc->supply.vqmmc, 1700000, 1950000); if (ret) { pr_warning("%s: Switching to 1.8V signalling voltage " - " failed\n", mmc_hostname(host->mmc)); + " failed\n", 
mmc_hostname(mmc)); return -EIO; } } @@ -1763,24 +1768,22 @@ static int sdhci_do_start_signal_voltage_switch(struct sdhci_host *host, ctrl |= SDHCI_CTRL_VDD_180; sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); - /* Wait for 5ms */ - usleep_range(5000, 5500); - /* 1.8V regulator output should be stable within 5 ms */ ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); if (ctrl & SDHCI_CTRL_VDD_180) return 0; pr_warning("%s: 1.8V regulator output did not became stable\n", - mmc_hostname(host->mmc)); + mmc_hostname(mmc)); return -EAGAIN; case MMC_SIGNAL_VOLTAGE_120: - if (host->vqmmc) { - ret = regulator_set_voltage(host->vqmmc, 1100000, 1300000); + if (!IS_ERR(mmc->supply.vqmmc)) { + ret = regulator_set_voltage(mmc->supply.vqmmc, 1100000, + 1300000); if (ret) { pr_warning("%s: Switching to 1.2V signalling voltage " - " failed\n", mmc_hostname(host->mmc)); + " failed\n", mmc_hostname(mmc)); return -EIO; } } @@ -2643,7 +2646,6 @@ static void sdhci_runtime_pm_bus_off(struct sdhci_host *host) int sdhci_runtime_suspend_host(struct sdhci_host *host) { unsigned long flags; - int ret = 0; /* Disable tuning since we are suspending */ if (host->flags & SDHCI_USING_RETUNING_TIMER) { @@ -2663,14 +2665,14 @@ int sdhci_runtime_suspend_host(struct sdhci_host *host) host->runtime_suspended = true; spin_unlock_irqrestore(&host->lock, flags); - return ret; + return 0; } EXPORT_SYMBOL_GPL(sdhci_runtime_suspend_host); int sdhci_runtime_resume_host(struct sdhci_host *host) { unsigned long flags; - int ret = 0, host_flags = host->flags; + int host_flags = host->flags; if (host_flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA)) { if (host->ops->enable_dma) @@ -2709,7 +2711,7 @@ int sdhci_runtime_resume_host(struct sdhci_host *host) spin_unlock_irqrestore(&host->lock, flags); - return ret; + return 0; } EXPORT_SYMBOL_GPL(sdhci_runtime_resume_host); @@ -2820,12 +2822,12 @@ int sdhci_add_host(struct sdhci_host *host) * (128) and potentially one alignment transfer for * each of those entries. 
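
Throughout sdhci_do_start_signal_voltage_switch() above, the optional vqmmc supply is now reached through mmc->supply and guarded with IS_ERR(), because absent regulators are stored as error pointers rather than NULL. A reduced sketch of the 1.8 V leg under that convention (the voltage window matches the patch; the controller register programming around it is omitted):

#include <linux/err.h>
#include <linux/mmc/host.h>
#include <linux/regulator/consumer.h>

static int my_switch_to_1v8(struct mmc_host *mmc)
{
        int ret;

        /* An ERR_PTR here means "no vqmmc supply", not a failure. */
        if (IS_ERR(mmc->supply.vqmmc))
                return 0;

        ret = regulator_set_voltage(mmc->supply.vqmmc, 1700000, 1950000);
        if (ret) {
                pr_warn("%s: switching to 1.8V signalling failed\n",
                        mmc_hostname(mmc));
                return -EIO;
        }
        return 0;
}
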
*/ - host->adma_desc = dma_alloc_coherent(mmc_dev(host->mmc), + host->adma_desc = dma_alloc_coherent(mmc_dev(mmc), ADMA_SIZE, &host->adma_addr, GFP_KERNEL); host->align_buffer = kmalloc(128 * 4, GFP_KERNEL); if (!host->adma_desc || !host->align_buffer) { - dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE, + dma_free_coherent(mmc_dev(mmc), ADMA_SIZE, host->adma_desc, host->adma_addr); kfree(host->align_buffer); pr_warning("%s: Unable to allocate ADMA " @@ -2838,7 +2840,7 @@ int sdhci_add_host(struct sdhci_host *host) pr_warning("%s: unable to allocate aligned ADMA descriptor\n", mmc_hostname(mmc)); host->flags &= ~SDHCI_USE_ADMA; - dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE, + dma_free_coherent(mmc_dev(mmc), ADMA_SIZE, host->adma_desc, host->adma_addr); kfree(host->align_buffer); host->adma_desc = NULL; @@ -2853,7 +2855,7 @@ int sdhci_add_host(struct sdhci_host *host) */ if (!(host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA))) { host->dma_mask = DMA_BIT_MASK(64); - mmc_dev(host->mmc)->dma_mask = &host->dma_mask; + mmc_dev(mmc)->dma_mask = &host->dma_mask; } if (host->version >= SDHCI_SPEC_300) @@ -2959,28 +2961,25 @@ int sdhci_add_host(struct sdhci_host *host) mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED; if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) && - !(host->mmc->caps & MMC_CAP_NONREMOVABLE)) + !(mmc->caps & MMC_CAP_NONREMOVABLE)) mmc->caps |= MMC_CAP_NEEDS_POLL; + /* If there are external regulators, get them */ + if (mmc_regulator_get_supply(mmc) == -EPROBE_DEFER) + return -EPROBE_DEFER; + /* If vqmmc regulator and no 1.8V signalling, then there's no UHS */ - host->vqmmc = regulator_get_optional(mmc_dev(mmc), "vqmmc"); - if (IS_ERR_OR_NULL(host->vqmmc)) { - if (PTR_ERR(host->vqmmc) < 0) { - pr_info("%s: no vqmmc regulator found\n", - mmc_hostname(mmc)); - host->vqmmc = NULL; - } - } else { - ret = regulator_enable(host->vqmmc); - if (!regulator_is_supported_voltage(host->vqmmc, 1700000, - 1950000)) + if (!IS_ERR(mmc->supply.vqmmc)) { + ret = regulator_enable(mmc->supply.vqmmc); + if (!regulator_is_supported_voltage(mmc->supply.vqmmc, 1700000, + 1950000)) caps[1] &= ~(SDHCI_SUPPORT_SDR104 | SDHCI_SUPPORT_SDR50 | SDHCI_SUPPORT_DDR50); if (ret) { pr_warn("%s: Failed to enable vqmmc regulator: %d\n", mmc_hostname(mmc), ret); - host->vqmmc = NULL; + mmc->supply.vqmmc = NULL; } } @@ -3041,34 +3040,6 @@ int sdhci_add_host(struct sdhci_host *host) ocr_avail = 0; - host->vmmc = regulator_get_optional(mmc_dev(mmc), "vmmc"); - if (IS_ERR_OR_NULL(host->vmmc)) { - if (PTR_ERR(host->vmmc) < 0) { - pr_info("%s: no vmmc regulator found\n", - mmc_hostname(mmc)); - host->vmmc = NULL; - } - } - -#ifdef CONFIG_REGULATOR - /* - * Voltage range check makes sense only if regulator reports - * any voltage value. - */ - if (host->vmmc && regulator_get_voltage(host->vmmc) > 0) { - ret = regulator_is_supported_voltage(host->vmmc, 2700000, - 3600000); - if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_330))) - caps[0] &= ~SDHCI_CAN_VDD_330; - if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_300))) - caps[0] &= ~SDHCI_CAN_VDD_300; - ret = regulator_is_supported_voltage(host->vmmc, 1700000, - 1950000); - if ((ret <= 0) || (!(caps[0] & SDHCI_CAN_VDD_180))) - caps[0] &= ~SDHCI_CAN_VDD_180; - } -#endif /* CONFIG_REGULATOR */ - /* * According to SD Host Controller spec v3.00, if the Host System * can afford more than 150mA, Host Driver should set XPC to 1. Also @@ -3077,8 +3048,8 @@ int sdhci_add_host(struct sdhci_host *host) * value. 
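
The probe-side counterpart above is mmc_regulator_get_supply(): it looks up vmmc/vqmmc, fills mmc->supply (leaving error pointers for absent regulators), derives mmc->ocr_avail from vmmc where possible, and reports -EPROBE_DEFER when a supply is not ready yet, which is why sdhci_add_host() now returns early in that case. A hedged usage sketch:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/mmc/host.h>

static int my_get_supplies(struct mmc_host *mmc)
{
        int ret = mmc_regulator_get_supply(mmc);

        /* The regulators may not be registered yet: retry probe later. */
        if (ret == -EPROBE_DEFER)
                return ret;

        /* Absent supplies stay as ERR_PTR, so guard every later use. */
        if (!IS_ERR(mmc->supply.vmmc))
                dev_dbg(mmc_dev(mmc), "OCR mask from vmmc: 0x%x\n",
                        mmc->ocr_avail);
        return 0;
}
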
*/ max_current_caps = sdhci_readl(host, SDHCI_MAX_CURRENT); - if (!max_current_caps && host->vmmc) { - u32 curr = regulator_get_current_limit(host->vmmc); + if (!max_current_caps && !IS_ERR(mmc->supply.vmmc)) { + u32 curr = regulator_get_current_limit(mmc->supply.vmmc); if (curr > 0) { /* convert to SDHCI_MAX_CURRENT format */ @@ -3118,8 +3089,12 @@ int sdhci_add_host(struct sdhci_host *host) SDHCI_MAX_CURRENT_MULTIPLIER; } + /* If OCR set by external regulators, use it instead */ + if (mmc->ocr_avail) + ocr_avail = mmc->ocr_avail; + if (host->ocr_mask) - ocr_avail = host->ocr_mask; + ocr_avail &= host->ocr_mask; mmc->ocr_avail = ocr_avail; mmc->ocr_avail_sdio = ocr_avail; @@ -3273,6 +3248,7 @@ EXPORT_SYMBOL_GPL(sdhci_add_host); void sdhci_remove_host(struct sdhci_host *host, int dead) { + struct mmc_host *mmc = host->mmc; unsigned long flags; if (dead) { @@ -3282,7 +3258,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) if (host->mrq) { pr_err("%s: Controller removed during " - " transfer!\n", mmc_hostname(host->mmc)); + " transfer!\n", mmc_hostname(mmc)); host->mrq->cmd->error = -ENOMEDIUM; tasklet_schedule(&host->finish_tasklet); @@ -3293,7 +3269,7 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) sdhci_disable_card_detection(host); - mmc_remove_host(host->mmc); + mmc_remove_host(mmc); #ifdef SDHCI_USE_LEDS_CLASS led_classdev_unregister(&host->led); @@ -3310,18 +3286,14 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) tasklet_kill(&host->finish_tasklet); - if (host->vmmc) { - regulator_disable(host->vmmc); - regulator_put(host->vmmc); - } + if (!IS_ERR(mmc->supply.vmmc)) + regulator_disable(mmc->supply.vmmc); - if (host->vqmmc) { - regulator_disable(host->vqmmc); - regulator_put(host->vqmmc); - } + if (!IS_ERR(mmc->supply.vqmmc)) + regulator_disable(mmc->supply.vqmmc); if (host->adma_desc) - dma_free_coherent(mmc_dev(host->mmc), ADMA_SIZE, + dma_free_coherent(mmc_dev(mmc), ADMA_SIZE, host->adma_desc, host->adma_addr); kfree(host->align_buffer); diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 656fbba4c42..d11708c815d 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -386,7 +386,7 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host, struct sh_mmcif_plat_data *pdata, enum dma_transfer_direction direction) { - struct dma_slave_config cfg; + struct dma_slave_config cfg = { 0, }; struct dma_chan *chan; unsigned int slave_id; struct resource *res; @@ -417,8 +417,15 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host, /* In the OF case the driver will get the slave ID from the DT */ cfg.slave_id = slave_id; cfg.direction = direction; - cfg.dst_addr = res->start + MMCIF_CE_DATA; - cfg.src_addr = 0; + + if (direction == DMA_DEV_TO_MEM) { + cfg.src_addr = res->start + MMCIF_CE_DATA; + cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + } else { + cfg.dst_addr = res->start + MMCIF_CE_DATA; + cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + } + ret = dmaengine_slave_config(chan, &cfg); if (ret < 0) { dma_release_channel(chan); @@ -1378,26 +1385,19 @@ static int sh_mmcif_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Get irq error\n"); return -ENXIO; } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "platform_get_resource error.\n"); - return -ENXIO; - } - reg = ioremap(res->start, resource_size(res)); - if (!reg) { - dev_err(&pdev->dev, "ioremap error.\n"); - return -ENOMEM; - } + reg = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(reg)) + 
return PTR_ERR(reg); mmc = mmc_alloc_host(sizeof(struct sh_mmcif_host), &pdev->dev); - if (!mmc) { - ret = -ENOMEM; - goto ealloch; - } + if (!mmc) + return -ENOMEM; ret = mmc_of_parse(mmc); if (ret < 0) - goto eofparse; + goto err_host; host = mmc_priv(mmc); host->mmc = mmc; @@ -1427,19 +1427,19 @@ static int sh_mmcif_probe(struct platform_device *pdev) pm_runtime_enable(&pdev->dev); host->power = false; - host->hclk = clk_get(&pdev->dev, NULL); + host->hclk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(host->hclk)) { ret = PTR_ERR(host->hclk); dev_err(&pdev->dev, "cannot get clock: %d\n", ret); - goto eclkget; + goto err_pm; } ret = sh_mmcif_clk_update(host); if (ret < 0) - goto eclkupdate; + goto err_pm; ret = pm_runtime_resume(&pdev->dev); if (ret < 0) - goto eresume; + goto err_clk; INIT_DELAYED_WORK(&host->timeout_work, mmcif_timeout_work); @@ -1447,65 +1447,55 @@ static int sh_mmcif_probe(struct platform_device *pdev) sh_mmcif_writel(host->addr, MMCIF_CE_INT_MASK, MASK_ALL); name = irq[1] < 0 ? dev_name(&pdev->dev) : "sh_mmc:error"; - ret = request_threaded_irq(irq[0], sh_mmcif_intr, sh_mmcif_irqt, 0, name, host); + ret = devm_request_threaded_irq(&pdev->dev, irq[0], sh_mmcif_intr, + sh_mmcif_irqt, 0, name, host); if (ret) { dev_err(&pdev->dev, "request_irq error (%s)\n", name); - goto ereqirq0; + goto err_clk; } if (irq[1] >= 0) { - ret = request_threaded_irq(irq[1], sh_mmcif_intr, sh_mmcif_irqt, - 0, "sh_mmc:int", host); + ret = devm_request_threaded_irq(&pdev->dev, irq[1], + sh_mmcif_intr, sh_mmcif_irqt, + 0, "sh_mmc:int", host); if (ret) { dev_err(&pdev->dev, "request_irq error (sh_mmc:int)\n"); - goto ereqirq1; + goto err_clk; } } if (pd && pd->use_cd_gpio) { ret = mmc_gpio_request_cd(mmc, pd->cd_gpio, 0); if (ret < 0) - goto erqcd; + goto err_clk; } mutex_init(&host->thread_lock); - clk_disable_unprepare(host->hclk); ret = mmc_add_host(mmc); if (ret < 0) - goto emmcaddh; + goto err_clk; dev_pm_qos_expose_latency_limit(&pdev->dev, 100); - dev_info(&pdev->dev, "driver version %s\n", DRIVER_VERSION); - dev_dbg(&pdev->dev, "chip ver H'%04x\n", - sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0x0000ffff); + dev_info(&pdev->dev, "Chip version 0x%04x, clock rate %luMHz\n", + sh_mmcif_readl(host->addr, MMCIF_CE_VERSION) & 0xffff, + clk_get_rate(host->hclk) / 1000000UL); + + clk_disable_unprepare(host->hclk); return ret; -emmcaddh: -erqcd: - if (irq[1] >= 0) - free_irq(irq[1], host); -ereqirq1: - free_irq(irq[0], host); -ereqirq0: - pm_runtime_suspend(&pdev->dev); -eresume: +err_clk: clk_disable_unprepare(host->hclk); -eclkupdate: - clk_put(host->hclk); -eclkget: +err_pm: pm_runtime_disable(&pdev->dev); -eofparse: +err_host: mmc_free_host(mmc); -ealloch: - iounmap(reg); return ret; } static int sh_mmcif_remove(struct platform_device *pdev) { struct sh_mmcif_host *host = platform_get_drvdata(pdev); - int irq[2]; host->dying = true; clk_prepare_enable(host->hclk); @@ -1523,16 +1513,6 @@ static int sh_mmcif_remove(struct platform_device *pdev) */ cancel_delayed_work_sync(&host->timeout_work); - if (host->addr) - iounmap(host->addr); - - irq[0] = platform_get_irq(pdev, 0); - irq[1] = platform_get_irq(pdev, 1); - - free_irq(irq[0], host); - if (irq[1] >= 0) - free_irq(irq[1], host); - clk_disable_unprepare(host->hclk); mmc_free_host(host->mmc); pm_runtime_put_sync(&pdev->dev); diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index 03e7b280cb4..eb8f1d5c34b 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -294,6 +294,7 @@ void 
tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat cfg.slave_id = pdata->dma->slave_id_tx; cfg.direction = DMA_MEM_TO_DEV; cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift); + cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.src_addr = 0; ret = dmaengine_slave_config(host->chan_tx, &cfg); if (ret < 0) @@ -312,6 +313,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat cfg.slave_id = pdata->dma->slave_id_rx; cfg.direction = DMA_DEV_TO_MEM; cfg.src_addr = cfg.dst_addr; + cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.dst_addr = 0; ret = dmaengine_slave_config(host->chan_rx, &cfg); if (ret < 0) diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 282891a8e45..54181b4f6e9 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -72,7 +72,6 @@ #define BM_SPI_CS 0x20 #define BM_SD_POWER 0x40 #define BM_SOFT_RESET 0x80 -#define BM_ONEBIT_MASK 0xFD /* SDMMC_BLKLEN bit fields */ #define BLKL_CRCERR_ABORT 0x0800 @@ -120,6 +119,8 @@ #define STS2_DATARSP_BUSY 0x20 #define STS2_DIS_FORCECLK 0x80 +/* SDMMC_EXTCTRL bit fields */ +#define EXT_EIGHTBIT 0x04 /* MMC/SD DMA Controller Registers */ #define SDDMA_GCR 0x100 @@ -672,7 +673,7 @@ static void wmt_mci_request(struct mmc_host *mmc, struct mmc_request *req) static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) { struct wmt_mci_priv *priv; - u32 reg_tmp; + u32 busmode, extctrl; priv = mmc_priv(mmc); @@ -687,28 +688,26 @@ static void wmt_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) if (ios->clock != 0) clk_set_rate(priv->clk_sdmmc, ios->clock); + busmode = readb(priv->sdmmc_base + SDMMC_BUSMODE); + extctrl = readb(priv->sdmmc_base + SDMMC_EXTCTRL); + + busmode &= ~(BM_EIGHTBIT_MODE | BM_FOURBIT_MODE); + extctrl &= ~EXT_EIGHTBIT; + switch (ios->bus_width) { case MMC_BUS_WIDTH_8: - reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL); - writeb(reg_tmp | 0x04, priv->sdmmc_base + SDMMC_EXTCTRL); + busmode |= BM_EIGHTBIT_MODE; + extctrl |= EXT_EIGHTBIT; break; case MMC_BUS_WIDTH_4: - reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE); - writeb(reg_tmp | BM_FOURBIT_MODE, priv->sdmmc_base + - SDMMC_BUSMODE); - - reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL); - writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL); + busmode |= BM_FOURBIT_MODE; break; case MMC_BUS_WIDTH_1: - reg_tmp = readb(priv->sdmmc_base + SDMMC_BUSMODE); - writeb(reg_tmp & BM_ONEBIT_MASK, priv->sdmmc_base + - SDMMC_BUSMODE); - - reg_tmp = readb(priv->sdmmc_base + SDMMC_EXTCTRL); - writeb(reg_tmp & 0xFB, priv->sdmmc_base + SDMMC_EXTCTRL); break; } + + writeb(busmode, priv->sdmmc_base + SDMMC_BUSMODE); + writeb(extctrl, priv->sdmmc_base + SDMMC_EXTCTRL); } static int wmt_mci_get_ro(struct mmc_host *mmc) @@ -830,7 +829,7 @@ static int wmt_mci_probe(struct platform_device *pdev) goto fail3; } - ret = request_irq(dma_irq, wmt_mci_dma_isr, 32, "sdmmc", priv); + ret = request_irq(dma_irq, wmt_mci_dma_isr, 0, "sdmmc", priv); if (ret) { dev_err(&pdev->dev, "Register DMA IRQ fail\n"); goto fail4; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index af7c40ac145..e1a8f4e1998 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -581,7 +581,11 @@ static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring( struct xgene_enet_desc_ring *ring; struct xgene_enet_pdata *pdata = netdev_priv(ndev); struct 
device *dev = ndev_to_dev(ndev); - u32 size; + int size; + + size = xgene_enet_get_ring_size(dev, cfgsize); + if (size < 0) + return NULL; ring = devm_kzalloc(dev, sizeof(struct xgene_enet_desc_ring), GFP_KERNEL); @@ -593,7 +597,6 @@ static struct xgene_enet_desc_ring *xgene_enet_create_desc_ring( ring->cfgsize = cfgsize; ring->id = ring_id; - size = xgene_enet_get_ring_size(dev, cfgsize); ring->desc_addr = dma_zalloc_coherent(dev, size, &ring->dma, GFP_KERNEL); if (!ring->desc_addr) { diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index a3dd5dc64f4..4296b3d26f0 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -14093,8 +14093,9 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev, spin_lock_bh(&tp->lock); if (!tp->hw_stats) { + *stats = tp->net_stats_prev; spin_unlock_bh(&tp->lock); - return &tp->net_stats_prev; + return stats; } tg3_get_nstats(tp, stats); diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 811f1351db7..43e08d0bc3d 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -897,5 +897,6 @@ void be_roce_dev_remove(struct be_adapter *); */ void be_roce_dev_open(struct be_adapter *); void be_roce_dev_close(struct be_adapter *); +void be_roce_dev_shutdown(struct be_adapter *); #endif /* BE_H */ diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index db4ff14ff18..9cdeda54674 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5014,6 +5014,7 @@ static void be_shutdown(struct pci_dev *pdev) if (!adapter) return; + be_roce_dev_shutdown(adapter); cancel_delayed_work_sync(&adapter->work); cancel_delayed_work_sync(&adapter->func_recovery_work); diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c index 5bf16603a3e..ef4672dc735 100644 --- a/drivers/net/ethernet/emulex/benet/be_roce.c +++ b/drivers/net/ethernet/emulex/benet/be_roce.c @@ -120,7 +120,8 @@ static void _be_roce_dev_open(struct be_adapter *adapter) { if (ocrdma_drv && adapter->ocrdma_dev && ocrdma_drv->state_change_handler) - ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 0); + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_UP); } void be_roce_dev_open(struct be_adapter *adapter) @@ -136,7 +137,8 @@ static void _be_roce_dev_close(struct be_adapter *adapter) { if (ocrdma_drv && adapter->ocrdma_dev && ocrdma_drv->state_change_handler) - ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 1); + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_DOWN); } void be_roce_dev_close(struct be_adapter *adapter) @@ -148,6 +150,18 @@ void be_roce_dev_close(struct be_adapter *adapter) } } +void be_roce_dev_shutdown(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + if (ocrdma_drv && adapter->ocrdma_dev && + ocrdma_drv->state_change_handler) + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_SHUTDOWN); + mutex_unlock(&be_adapter_list_lock); + } +} + int be_roce_register_driver(struct ocrdma_driver *drv) { struct be_adapter *dev; diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h index a3d9e96c18e..e6f7eb1a7d8 100644 --- a/drivers/net/ethernet/emulex/benet/be_roce.h +++ b/drivers/net/ethernet/emulex/benet/be_roce.h @@ -62,7 +62,8 @@ 
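
For context, the BE_DEV_SHUTDOWN plumbing above follows a common producer/consumer notification pattern: the NIC driver invokes the RoCE driver's state_change_handler under the same mutex that serializes driver (un)registration, so the callback target cannot vanish mid-call. A generic sketch of that pattern (every name here is illustrative, not from be2net):

#include <linux/mutex.h>

enum my_event { MY_DEV_UP, MY_DEV_DOWN, MY_DEV_SHUTDOWN };

struct my_consumer {
        void (*state_change)(void *ctx, enum my_event event);
        void *ctx;
};

static DEFINE_MUTEX(my_consumer_lock);
static struct my_consumer *my_consumer; /* set by register/unregister */

static void my_notify(enum my_event event)
{
        /* The mutex pins the consumer for the duration of the callback. */
        mutex_lock(&my_consumer_lock);
        if (my_consumer && my_consumer->state_change)
                my_consumer->state_change(my_consumer->ctx, event);
        mutex_unlock(&my_consumer_lock);
}
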
struct ocrdma_driver { enum { BE_DEV_UP = 0, - BE_DEV_DOWN = 1 + BE_DEV_DOWN = 1, + BE_DEV_SHUTDOWN = 2 }; /* APIs for RoCE driver to register callback handlers, diff --git a/drivers/net/ethernet/ibm/ehea/Makefile b/drivers/net/ethernet/ibm/ehea/Makefile index 775d9969b5c..cd473e29524 100644 --- a/drivers/net/ethernet/ibm/ehea/Makefile +++ b/drivers/net/ethernet/ibm/ehea/Makefile @@ -1,6 +1,6 @@ # # Makefile for the eHEA ethernet device driver for IBM eServer System p # -ehea-y = ehea_main.o ehea_phyp.o ehea_qmr.o ehea_ethtool.o ehea_phyp.o +ehea-y = ehea_main.o ehea_phyp.o ehea_qmr.o ehea_ethtool.o obj-$(CONFIG_EHEA) += ehea.o diff --git a/drivers/net/ethernet/intel/e1000e/manage.c b/drivers/net/ethernet/intel/e1000e/manage.c index 58856032298..06edfca1a35 100644 --- a/drivers/net/ethernet/intel/e1000e/manage.c +++ b/drivers/net/ethernet/intel/e1000e/manage.c @@ -47,7 +47,7 @@ static u8 e1000_calculate_checksum(u8 *buffer, u32 length) * e1000_mng_enable_host_if - Checks host interface is enabled * @hw: pointer to the HW structure * - * Returns E1000_success upon success, else E1000_ERR_HOST_INTERFACE_COMMAND + * Returns 0 upon success, else -E1000_ERR_HOST_INTERFACE_COMMAND * * This function checks whether the HOST IF is enabled for command operation * and also checks whether the previous command is completed. It busy waits @@ -78,7 +78,7 @@ static s32 e1000_mng_enable_host_if(struct e1000_hw *hw) } if (i == E1000_MNG_DHCP_COMMAND_TIMEOUT) { - e_dbg("Previous command timeout failed .\n"); + e_dbg("Previous command timeout failed.\n"); return -E1000_ERR_HOST_INTERFACE_COMMAND; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c index 6938fc1ad87..5d01db1d789 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c +++ b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c @@ -33,6 +33,7 @@ #include <scsi/fc/fc_fcoe.h> #include <scsi/libfc.h> #include <scsi/libfcoe.h> +#include <uapi/linux/dcbnl.h> #include "i40e.h" #include "i40e_fcoe.h" diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 51bc03072ed..871474f6fe6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4415,13 +4415,13 @@ static void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) switch (vsi->back->hw.phy.link_info.link_speed) { case I40E_LINK_SPEED_40GB: - strncpy(speed, "40 Gbps", SPEED_SIZE); + strlcpy(speed, "40 Gbps", SPEED_SIZE); break; case I40E_LINK_SPEED_10GB: - strncpy(speed, "10 Gbps", SPEED_SIZE); + strlcpy(speed, "10 Gbps", SPEED_SIZE); break; case I40E_LINK_SPEED_1GB: - strncpy(speed, "1000 Mbps", SPEED_SIZE); + strlcpy(speed, "1000 Mbps", SPEED_SIZE); break; default: break; @@ -4429,16 +4429,16 @@ static void i40e_print_link_message(struct i40e_vsi *vsi, bool isup) switch (vsi->back->hw.fc.current_mode) { case I40E_FC_FULL: - strncpy(fc, "RX/TX", FC_SIZE); + strlcpy(fc, "RX/TX", FC_SIZE); break; case I40E_FC_TX_PAUSE: - strncpy(fc, "TX", FC_SIZE); + strlcpy(fc, "TX", FC_SIZE); break; case I40E_FC_RX_PAUSE: - strncpy(fc, "RX", FC_SIZE); + strlcpy(fc, "RX", FC_SIZE); break; default: - strncpy(fc, "None", FC_SIZE); + strlcpy(fc, "None", FC_SIZE); break; } @@ -5839,7 +5839,7 @@ static void i40e_send_version(struct i40e_pf *pf) dv.minor_version = DRV_VERSION_MINOR; dv.build_version = DRV_VERSION_BUILD; dv.subbuild_version = 0; - strncpy(dv.driver_string, DRV_VERSION, sizeof(dv.driver_string)); + strlcpy(dv.driver_string, DRV_VERSION, 
sizeof(dv.driver_string)); i40e_aq_send_driver_version(&pf->hw, &dv, NULL); } @@ -6293,7 +6293,7 @@ static int i40e_vsi_alloc_arrays(struct i40e_vsi *vsi, bool alloc_qvectors) if (alloc_qvectors) { /* allocate memory for q_vector pointers */ - size = sizeof(struct i40e_q_vectors *) * vsi->num_q_vectors; + size = sizeof(struct i40e_q_vector *) * vsi->num_q_vectors; vsi->q_vectors = kzalloc(size, GFP_KERNEL); if (!vsi->q_vectors) { ret = -ENOMEM; diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c index 97bda3dffd4..25c4f9a3011 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c +++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c @@ -251,9 +251,9 @@ i40e_status i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset, * * Writes a 16 bit words buffer to the Shadow RAM using the admin command. **/ -i40e_status i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer, - u32 offset, u16 words, void *data, - bool last_command) +static i40e_status i40e_write_nvm_aq(struct i40e_hw *hw, u8 module_pointer, + u32 offset, u16 words, void *data, + bool last_command) { i40e_status ret_code = I40E_ERR_NVM; diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 5d940a26055..65a4a0f88ea 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1311,6 +1311,15 @@ static struct mlx4_cmd_info cmd_info[] = { .wrapper = mlx4_MAD_IFC_wrapper }, { + .opcode = MLX4_CMD_MAD_DEMUX, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, + { .opcode = MLX4_CMD_QUERY_IF_STAT, .has_inbox = false, .has_outbox = true, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 688e1eabab2..494753e44ae 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -136,7 +136,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [7] = "FSM (MAC anti-spoofing) support", [8] = "Dynamic QP updates support", [9] = "Device managed flow steering IPoIB support", - [10] = "TCP/IP offloads/flow-steering for VXLAN support" + [10] = "TCP/IP offloads/flow-steering for VXLAN support", + [11] = "MAD DEMUX (Secure-Host) support" }; int i; @@ -571,6 +572,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 #define QUERY_DEV_CAP_FW_REASSIGN_MAC 0x9d #define QUERY_DEV_CAP_VXLAN 0x9e +#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 dev_cap->flags2 = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -748,6 +750,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(dev_cap->max_counters, outbox, QUERY_DEV_CAP_MAX_COUNTERS_OFFSET); + MLX4_GET(field32, outbox, + QUERY_DEV_CAP_MAD_DEMUX_OFFSET); + if (field32 & (1 << 0)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX; + MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); if (field32 & (1 << 16)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; @@ -2016,3 +2023,85 @@ void mlx4_opreq_action(struct work_struct *work) out: mlx4_free_cmd_mailbox(dev, mailbox); } + +static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) +{ +#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET 0x10 +#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET 0x20 +#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET 0x40 +#define 
MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET 0x70 + + u32 set_attr_mask, getresp_attr_mask; + u32 trap_attr_mask, traprepress_attr_mask; + + MLX4_GET(set_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n", + set_attr_mask); + + MLX4_GET(getresp_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n", + getresp_attr_mask); + + MLX4_GET(trap_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n", + trap_attr_mask); + + MLX4_GET(traprepress_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n", + traprepress_attr_mask); + + if (set_attr_mask && getresp_attr_mask && trap_attr_mask && + traprepress_attr_mask) + return 1; + + return 0; +} + +int mlx4_config_mad_demux(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + int secure_host_active; + int err; + + /* Check if mad_demux is supported */ + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX"); + return -ENOMEM; + } + + /* Query mad_demux to find out which MADs are handled by internal sma */ + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n", + err); + goto out; + } + + secure_host_active = mlx4_check_smp_firewall_active(dev, mailbox); + + /* Config mad_demux to handle all MADs returned by the query above */ + err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err); + goto out; + } + + if (secure_host_active) + mlx4_warn(dev, "HCA operating in secure-host mode. 
SMP firewall activated.\n"); +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 80b8c5f30e4..0158689906f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1853,6 +1853,11 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) mlx4_err(dev, "Failed to initialize multicast group table, aborting\n"); goto err_mr_table_free; } + err = mlx4_config_mad_demux(dev); + if (err) { + mlx4_err(dev, "Failed in config_mad_demux, aborting\n"); + goto err_mcg_table_free; + } } err = mlx4_init_eq_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 13fbcd03c3e..b508c7887ef 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -274,6 +274,8 @@ struct mlx4_icm_table { #define MLX4_MPT_FLAG_PHYSICAL (1 << 9) #define MLX4_MPT_FLAG_REGION (1 << 8) +#define MLX4_MPT_PD_MASK (0x1FFFFUL) +#define MLX4_MPT_PD_VF_MASK (0xFE0000UL) #define MLX4_MPT_PD_FLAG_FAST_REG (1 << 27) #define MLX4_MPT_PD_FLAG_RAE (1 << 28) #define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) @@ -1306,5 +1308,6 @@ void mlx4_init_quotas(struct mlx4_dev *dev); int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port); /* Returns the VF index of slave */ int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave); +int mlx4_config_mad_demux(struct mlx4_dev *dev); #endif /* MLX4_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 2839abb878a..7d717eccb7b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -298,6 +298,131 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); } +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry) +{ + int err; + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + struct mlx4_cmd_mailbox *mailbox = NULL; + + /* Make sure that at this point we have single-threaded access only */ + + if (mmr->enabled != MLX4_MPT_EN_HW) + return -EINVAL; + + err = mlx4_HW2SW_MPT(dev, NULL, key); + + if (err) { + mlx4_warn(dev, "HW2SW_MPT failed (%d).", err); + mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n"); + return err; + } + + mmr->enabled = MLX4_MPT_EN_SW; + + if (!mlx4_is_mfunc(dev)) { + **mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + key, NULL); + } else { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR_OR_NULL(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, key, + 0, MLX4_CMD_QUERY_MPT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + if (err) + goto free_mailbox; + + *mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf; + } + + if (!(*mpt_entry) || !(**mpt_entry)) { + err = -ENOMEM; + goto free_mailbox; + } + + return 0; + +free_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt); + +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry) +{ + int err; + + if (!mlx4_is_mfunc(dev)) { + /* Make sure any changes to this entry are flushed */ + wmb(); + + *(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW; + + /* Make sure the new status is written */ + wmb(); + + err = mlx4_SYNC_TPT(dev); + } else { + int key = key_to_hw_index(mmr->key) & 
(dev->caps.num_mpts - 1); + + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + + err = mlx4_SW2HW_MPT(dev, mailbox, key); + } + + mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK; + if (!err) + mmr->enabled = MLX4_MPT_EN_HW; + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt); + +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry) +{ + if (mlx4_is_mfunc(dev)) { + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + mlx4_free_cmd_mailbox(dev, mailbox); + } +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt); + +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn) +{ + u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags); + /* The wrapper function will put the slave's id here */ + if (mlx4_is_mfunc(dev)) + pd_flags &= ~MLX4_MPT_PD_VF_MASK; + mpt_entry->pd_flags = cpu_to_be32((pd_flags & ~MLX4_MPT_PD_MASK) | + (pdn & MLX4_MPT_PD_MASK) + | MLX4_MPT_PD_FLAG_EN_INV); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd); + +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access) +{ + u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) | + (access & MLX4_PERM_MASK); + + mpt_entry->flags = cpu_to_be32(flags); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access); + static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr) @@ -463,6 +588,41 @@ int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr) } EXPORT_SYMBOL_GPL(mlx4_mr_free); +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + mlx4_mtt_cleanup(dev, &mr->mtt); +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup); + +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry) +{ + int err; + + mpt_entry->start = cpu_to_be64(mr->iova); + mpt_entry->length = cpu_to_be64(mr->size); + mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift); + + err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); + if (err) + return err; + + if (mr->mtt.order < 0) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); + mpt_entry->mtt_addr = 0; + } else { + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); + if (mr->mtt.page_shift == 0) + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); + } + mr->enabled = MLX4_MPT_EN_SW; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write); + int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) { struct mlx4_cmd_mailbox *mailbox; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 0efc1368e5a..1089367fed2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2613,12 +2613,34 @@ int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, if (err) return err; - if (mpt->com.from_state != RES_MPT_HW) { + if (mpt->com.from_state == RES_MPT_MAPPED) { + /* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do + * that, the VF must read the MPT. But since the MPT entry memory is not + * in the VF's virtual memory space, it must use QUERY_MPT to obtain the + * entry contents. 
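
Taken together, the new mlx4_mr_hw_*() helpers above implement a read-modify-write protocol for a live memory region: take the MPT out of hardware ownership, edit the software copy, write it back, then release the mailbox. A hedged sketch of a caller, using the signatures introduced by this patch (my_change_mr_pd is illustrative, and mmr is assumed to be currently enabled in HW):

#include <linux/mlx4/device.h>

static int my_change_mr_pd(struct mlx4_dev *dev, struct mlx4_mr *mmr,
                           u32 new_pdn, u32 new_access)
{
        struct mlx4_mpt_entry *ent;
        struct mlx4_mpt_entry **entp = &ent;
        int err;

        err = mlx4_mr_hw_get_mpt(dev, mmr, &entp);  /* HW -> SW ownership */
        if (err)
                return err;

        mlx4_mr_hw_change_pd(dev, *entp, new_pdn);
        mlx4_mr_hw_change_access(dev, *entp, new_access);

        err = mlx4_mr_hw_write_mpt(dev, mmr, entp); /* SW -> HW ownership */
        mlx4_mr_hw_put_mpt(dev, entp);              /* frees mailbox, if any */
        return err;
}
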
To guarantee that the MPT cannot be changed, the driver + * must perform HW2SW_MPT before this query and return the MPT entry to HW + * ownership following the change. The change here allows the VF to + * perform QUERY_MPT also when the entry is in SW ownership. + */ + struct mlx4_mpt_entry *mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + mpt->key, NULL); + + if (NULL == mpt_entry || NULL == outbox->buf) { + err = -EINVAL; + goto out; + } + + memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry)); + + err = 0; + } else if (mpt->com.from_state == RES_MPT_HW) { + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + } else { + err = -EBUSY; + goto out; + } + - err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); out: put_res(dev, slave, id, RES_MPT); diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 69c26f04d8c..679db026f4b 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -873,6 +873,10 @@ static int myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type) return -ENOMEM; dmatest_bus = pci_map_page(mgp->pdev, dmatest_page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); + if (unlikely(pci_dma_mapping_error(mgp->pdev, dmatest_bus))) { + __free_page(dmatest_page); + return -ENOMEM; + } /* Run a small DMA test. * The magic multipliers to the length tell the firmware @@ -1294,6 +1298,7 @@ myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx, int bytes, int watchdog) { struct page *page; + dma_addr_t bus; int idx; #if MYRI10GE_ALLOC_SIZE > 4096 int end_offset; @@ -1318,11 +1323,21 @@ myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx, rx->watchdog_needed = 1; return; } + + bus = pci_map_page(mgp->pdev, page, 0, + MYRI10GE_ALLOC_SIZE, + PCI_DMA_FROMDEVICE); + if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) { + __free_pages(page, MYRI10GE_ALLOC_ORDER); + if (rx->fill_cnt - rx->cnt < 16) + rx->watchdog_needed = 1; + return; + } + rx->page = page; rx->page_offset = 0; - rx->bus = pci_map_page(mgp->pdev, page, 0, - MYRI10GE_ALLOC_SIZE, - PCI_DMA_FROMDEVICE); + rx->bus = bus; + } rx->info[idx].page = rx->page; rx->info[idx].page_offset = rx->page_offset; @@ -2764,6 +2779,35 @@ myri10ge_submit_req(struct myri10ge_tx_buf *tx, struct mcp_kreq_ether_send *src, mb(); } +static void myri10ge_unmap_tx_dma(struct myri10ge_priv *mgp, + struct myri10ge_tx_buf *tx, int idx) +{ + unsigned int len; + int last_idx; + + /* Free any DMA resources we've alloced and clear out the skb slot */ + last_idx = (idx + 1) & tx->mask; + idx = tx->req & tx->mask; + do { + len = dma_unmap_len(&tx->info[idx], len); + if (len) { + if (tx->info[idx].skb != NULL) + pci_unmap_single(mgp->pdev, + dma_unmap_addr(&tx->info[idx], + bus), len, + PCI_DMA_TODEVICE); + else + pci_unmap_page(mgp->pdev, + dma_unmap_addr(&tx->info[idx], + bus), len, + PCI_DMA_TODEVICE); + dma_unmap_len_set(&tx->info[idx], len, 0); + tx->info[idx].skb = NULL; + } + idx = (idx + 1) & tx->mask; + } while (idx != last_idx); +} + /* * Transmit a packet.
We need to split the packet so that a single * segment does not cross myri10ge->tx_boundary, so this makes segment @@ -2787,7 +2831,7 @@ static netdev_tx_t myri10ge_xmit(struct sk_buff *skb, u32 low; __be32 high_swapped; unsigned int len; - int idx, last_idx, avail, frag_cnt, frag_idx, count, mss, max_segments; + int idx, avail, frag_cnt, frag_idx, count, mss, max_segments; u16 pseudo_hdr_offset, cksum_offset, queue; int cum_len, seglen, boundary, rdma_count; u8 flags, odd_flag; @@ -2884,9 +2928,12 @@ again: /* map the skb for DMA */ len = skb_headlen(skb); + bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE); + if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) + goto drop; + idx = tx->req & tx->mask; tx->info[idx].skb = skb; - bus = pci_map_single(mgp->pdev, skb->data, len, PCI_DMA_TODEVICE); dma_unmap_addr_set(&tx->info[idx], bus, bus); dma_unmap_len_set(&tx->info[idx], len, len); @@ -2985,12 +3032,16 @@ again: break; /* map next fragment for DMA */ - idx = (count + tx->req) & tx->mask; frag = &skb_shinfo(skb)->frags[frag_idx]; frag_idx++; len = skb_frag_size(frag); bus = skb_frag_dma_map(&mgp->pdev->dev, frag, 0, len, DMA_TO_DEVICE); + if (unlikely(pci_dma_mapping_error(mgp->pdev, bus))) { + myri10ge_unmap_tx_dma(mgp, tx, idx); + goto drop; + } + idx = (count + tx->req) & tx->mask; dma_unmap_addr_set(&tx->info[idx], bus, bus); dma_unmap_len_set(&tx->info[idx], len, len); } @@ -3021,31 +3072,8 @@ again: return NETDEV_TX_OK; abort_linearize: - /* Free any DMA resources we've alloced and clear out the skb - * slot so as to not trip up assertions, and to avoid a - * double-free if linearizing fails */ + myri10ge_unmap_tx_dma(mgp, tx, idx); - last_idx = (idx + 1) & tx->mask; - idx = tx->req & tx->mask; - tx->info[idx].skb = NULL; - do { - len = dma_unmap_len(&tx->info[idx], len); - if (len) { - if (tx->info[idx].skb != NULL) - pci_unmap_single(mgp->pdev, - dma_unmap_addr(&tx->info[idx], - bus), len, - PCI_DMA_TODEVICE); - else - pci_unmap_page(mgp->pdev, - dma_unmap_addr(&tx->info[idx], - bus), len, - PCI_DMA_TODEVICE); - dma_unmap_len_set(&tx->info[idx], len, 0); - tx->info[idx].skb = NULL; - } - idx = (idx + 1) & tx->mask; - } while (idx != last_idx); if (skb_is_gso(skb)) { netdev_err(mgp->dev, "TSO but wanted to linearize?!?!?\n"); goto drop; diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index d813bfb1a84..23c89ab5a6a 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -32,6 +32,11 @@ MODULE_DESCRIPTION("Sun LDOM virtual network driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_MODULE_VERSION); +/* Heuristic for the number of times to exponentially backoff and + * retry sending an LDC trigger when EAGAIN is encountered + */ +#define VNET_MAX_RETRIES 10 + /* Ordered from largest major to lowest */ static struct vio_version vnet_versions[] = { { .major = 1, .minor = 0 }, @@ -260,6 +265,7 @@ static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr, .state = vio_dring_state, }; int err, delay; + int retries = 0; hdr.seq = dr->snd_nxt; delay = 1; @@ -272,6 +278,13 @@ static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr, udelay(delay); if ((delay <<= 1) > 128) delay = 128; + if (retries++ > VNET_MAX_RETRIES) { + pr_info("ECONNRESET %x:%x:%x:%x:%x:%x\n", + port->raddr[0], port->raddr[1], + port->raddr[2], port->raddr[3], + port->raddr[4], port->raddr[5]); + err = -ECONNRESET; + } } while (err == -EAGAIN); return err; @@ -475,8 +488,9 @@ static int 
handle_mcast(struct vnet_port *port, void *msgbuf) return 0; } -static void maybe_tx_wakeup(struct vnet *vp) +static void maybe_tx_wakeup(unsigned long param) { + struct vnet *vp = (struct vnet *)param; struct net_device *dev = vp->dev; netif_tx_lock(dev); @@ -573,8 +587,13 @@ static void vnet_event(void *arg, int event) break; } spin_unlock(&vio->lock); + /* Kick off a tasklet to wake the queue. We cannot call + * maybe_tx_wakeup directly here because we could deadlock on + * netif_tx_lock() with dev_watchdog() + */ if (unlikely(tx_wakeup && err != -ECONNRESET)) - maybe_tx_wakeup(port->vp); + tasklet_schedule(&port->vp->vnet_tx_wakeup); + local_irq_restore(flags); } @@ -593,6 +612,7 @@ static int __vnet_tx_trigger(struct vnet_port *port) .end_idx = (u32) -1, }; int err, delay; + int retries = 0; hdr.seq = dr->snd_nxt; delay = 1; @@ -605,6 +625,8 @@ static int __vnet_tx_trigger(struct vnet_port *port) udelay(delay); if ((delay <<= 1) > 128) delay = 128; + if (retries++ > VNET_MAX_RETRIES) + break; } while (err == -EAGAIN); return err; @@ -691,7 +713,15 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev) memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len); } - d->hdr.ack = VIO_ACK_ENABLE; + /* We don't rely on the ACKs to free the skb in vnet_start_xmit(), + * thus it is safe to not set VIO_ACK_ENABLE for each transmission: + * the protocol itself does not require it as long as the peer + * sends a VIO_SUBTYPE_ACK for VIO_DRING_STOPPED. + * + * An ACK for every packet in the ring is expensive as the + * sending of LDC messages is slow and affects performance. + */ + d->hdr.ack = VIO_ACK_DISABLE; d->size = len; d->ncookies = port->tx_bufs[dr->prod].ncookies; for (i = 0; i < d->ncookies; i++) @@ -1046,6 +1076,7 @@ static struct vnet *vnet_new(const u64 *local_mac) vp = netdev_priv(dev); spin_lock_init(&vp->lock); + tasklet_init(&vp->vnet_tx_wakeup, maybe_tx_wakeup, (unsigned long)vp); vp->dev = dev; INIT_LIST_HEAD(&vp->port_list); @@ -1105,6 +1136,7 @@ static void vnet_cleanup(void) vp = list_first_entry(&vnet_list, struct vnet, list); list_del(&vp->list); dev = vp->dev; + tasklet_kill(&vp->vnet_tx_wakeup); /* vio_unregister_driver() should have cleaned up port_list */ BUG_ON(!list_empty(&vp->port_list)); unregister_netdev(dev); diff --git a/drivers/net/ethernet/sun/sunvnet.h b/drivers/net/ethernet/sun/sunvnet.h index d347a5bf24b..de5c2c64996 100644 --- a/drivers/net/ethernet/sun/sunvnet.h +++ b/drivers/net/ethernet/sun/sunvnet.h @@ -1,6 +1,8 @@ #ifndef _SUNVNET_H #define _SUNVNET_H +#include <linux/interrupt.h> + #define DESC_NCOOKIES(entry_size) \ ((entry_size) - sizeof(struct vio_net_desc)) @@ -78,6 +80,8 @@ struct vnet { struct list_head list; u64 local_mac; + + struct tasklet_struct vnet_tx_wakeup; }; #endif /* _SUNVNET_H */ diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 36f4459520c..fda5891835d 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1170,7 +1170,6 @@ static struct platform_driver temac_of_driver = { .probe = temac_of_probe, .remove = temac_of_remove, .driver = { - .owner = THIS_MODULE, .name = "xilinx_temac", .of_match_table = temac_of_match, }, diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 30e8608ff05..c8fd94133ec 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1645,7 +1645,6 
@@ static struct platform_driver axienet_of_driver = { .probe = axienet_of_probe, .remove = axienet_of_remove, .driver = { - .owner = THIS_MODULE, .name = "xilinx_axienet", .of_match_table = axienet_of_match, }, diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 782bb9373cd..28dbbdc393e 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1245,7 +1245,6 @@ MODULE_DEVICE_TABLE(of, xemaclite_of_match); static struct platform_driver xemaclite_of_driver = { .driver = { .name = DRIVER_NAME, - .owner = THIS_MODULE, .of_match_table = xemaclite_of_match, }, .probe = xemaclite_of_probe, diff --git a/drivers/net/irda/donauboe.c b/drivers/net/irda/donauboe.c index 768dfe9a931..6d3e2093bf7 100644 --- a/drivers/net/irda/donauboe.c +++ b/drivers/net/irda/donauboe.c @@ -1755,17 +1755,4 @@ static struct pci_driver donauboe_pci_driver = { .resume = toshoboe_wakeup }; -static int __init -donauboe_init (void) -{ - return pci_register_driver(&donauboe_pci_driver); -} - -static void __exit -donauboe_cleanup (void) -{ - pci_unregister_driver(&donauboe_pci_driver); -} - -module_init(donauboe_init); -module_exit(donauboe_cleanup); +module_pci_driver(donauboe_pci_driver); diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index ef8a5c20236..60e4ca01ccb 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -45,10 +45,9 @@ struct macvlan_port { struct sk_buff_head bc_queue; struct work_struct bc_work; bool passthru; + int count; }; -#define MACVLAN_PORT_IS_EMPTY(port) list_empty(&port->vlans) - struct macvlan_skb_cb { const struct macvlan_dev *src; }; @@ -667,7 +666,8 @@ static void macvlan_uninit(struct net_device *dev) free_percpu(vlan->pcpu_stats); - if (MACVLAN_PORT_IS_EMPTY(port)) + port->count -= 1; + if (!port->count) macvlan_port_destroy(port->dev); } @@ -1020,12 +1020,13 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { - if (!MACVLAN_PORT_IS_EMPTY(port)) + if (port->count) return -EINVAL; port->passthru = true; eth_hw_addr_inherit(dev, lowerdev); } + port->count += 1; err = register_netdevice(dev); if (err < 0) goto destroy_port; @@ -1043,7 +1044,8 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, unregister_netdev: unregister_netdevice(dev); destroy_port: - if (MACVLAN_PORT_IS_EMPTY(port)) + port->count -= 1; + if (!port->count) macvlan_port_destroy(lowerdev); return err; diff --git a/drivers/net/wireless/ath/carl9170/carl9170.h b/drivers/net/wireless/ath/carl9170/carl9170.h index 8596aba34f9..237d0cda1bc 100644 --- a/drivers/net/wireless/ath/carl9170/carl9170.h +++ b/drivers/net/wireless/ath/carl9170/carl9170.h @@ -256,6 +256,7 @@ struct ar9170 { atomic_t rx_work_urbs; atomic_t rx_pool_urbs; kernel_ulong_t features; + bool usb_ep_cmd_is_bulk; /* firmware settings */ struct completion fw_load_wait; diff --git a/drivers/net/wireless/ath/carl9170/usb.c b/drivers/net/wireless/ath/carl9170/usb.c index f35c7f30f9a..c9f93310c0d 100644 --- a/drivers/net/wireless/ath/carl9170/usb.c +++ b/drivers/net/wireless/ath/carl9170/usb.c @@ -621,9 +621,16 @@ int __carl9170_exec_cmd(struct ar9170 *ar, struct carl9170_cmd *cmd, goto err_free; } - usb_fill_int_urb(urb, ar->udev, usb_sndintpipe(ar->udev, - AR9170_USB_EP_CMD), cmd, cmd->hdr.len + 4, - carl9170_usb_cmd_complete, ar, 1); + if (ar->usb_ep_cmd_is_bulk) + usb_fill_bulk_urb(urb, 
ar->udev, + usb_sndbulkpipe(ar->udev, AR9170_USB_EP_CMD), + cmd, cmd->hdr.len + 4, + carl9170_usb_cmd_complete, ar); + else + usb_fill_int_urb(urb, ar->udev, + usb_sndintpipe(ar->udev, AR9170_USB_EP_CMD), + cmd, cmd->hdr.len + 4, + carl9170_usb_cmd_complete, ar, 1); if (free_buf) urb->transfer_flags |= URB_FREE_BUFFER; @@ -1032,9 +1039,10 @@ static void carl9170_usb_firmware_step2(const struct firmware *fw, static int carl9170_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { + struct usb_endpoint_descriptor *ep; struct ar9170 *ar; struct usb_device *udev; - int err; + int i, err; err = usb_reset_device(interface_to_usbdev(intf)); if (err) @@ -1050,6 +1058,21 @@ static int carl9170_usb_probe(struct usb_interface *intf, ar->intf = intf; ar->features = id->driver_info; + /* We need to remember the type of endpoint 4 because it differs + * between high- and full-speed configuration. The high-speed + * configuration specifies it as interrupt and the full-speed + * configuration as bulk endpoint. This information is required + * later when sending urbs to that endpoint. + */ + for (i = 0; i < intf->cur_altsetting->desc.bNumEndpoints; ++i) { + ep = &intf->cur_altsetting->endpoint[i].desc; + + if (usb_endpoint_num(ep) == AR9170_USB_EP_CMD && + usb_endpoint_dir_out(ep) && + usb_endpoint_type(ep) == USB_ENDPOINT_XFER_BULK) + ar->usb_ep_cmd_is_bulk = true; + } + usb_set_intfdata(intf, ar); SET_IEEE80211_DEV(ar->hw, &intf->dev); diff --git a/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c b/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c index 535c7eb01b3..8f8b9373de9 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/msgbuf.c @@ -1318,6 +1318,8 @@ int brcmf_proto_msgbuf_attach(struct brcmf_pub *drvr) msgbuf->nrof_flowrings = if_msgbuf->nrof_flowrings; msgbuf->flowring_dma_handle = kzalloc(msgbuf->nrof_flowrings * sizeof(*msgbuf->flowring_dma_handle), GFP_ATOMIC); + if (!msgbuf->flowring_dma_handle) + goto fail; msgbuf->rx_dataoffset = if_msgbuf->rx_dataoffset; msgbuf->max_rxbufpost = if_msgbuf->max_rxbufpost; @@ -1362,6 +1364,7 @@ fail: kfree(msgbuf->flow_map); kfree(msgbuf->txstatus_done_map); brcmf_msgbuf_release_pktids(msgbuf); + kfree(msgbuf->flowring_dma_handle); if (msgbuf->ioctbuf) dma_free_coherent(drvr->bus_if->dev, BRCMF_TX_IOCTL_MAX_MSG_SIZE, @@ -1391,6 +1394,7 @@ void brcmf_proto_msgbuf_detach(struct brcmf_pub *drvr) BRCMF_TX_IOCTL_MAX_MSG_SIZE, msgbuf->ioctbuf, msgbuf->ioctbuf_handle); brcmf_msgbuf_release_pktids(msgbuf); + kfree(msgbuf->flowring_dma_handle); kfree(msgbuf); drvr->proto->pd = NULL; } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/brcm80211/brcmfmac/pcie.c index bc972c0ba5f..e5101b287e4 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/pcie.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/pcie.c @@ -591,12 +591,13 @@ static void brcmf_pcie_handle_mb_data(struct brcmf_pciedev_info *devinfo) } if (dtoh_mb_data & BRCMF_D2H_DEV_DS_EXIT_NOTE) brcmf_dbg(PCIE, "D2H_MB_DATA: DEEP SLEEP EXIT\n"); - if (dtoh_mb_data & BRCMF_D2H_DEV_D3_ACK) + if (dtoh_mb_data & BRCMF_D2H_DEV_D3_ACK) { brcmf_dbg(PCIE, "D2H_MB_DATA: D3 ACK\n"); if (waitqueue_active(&devinfo->mbdata_resp_wait)) { devinfo->mbdata_completed = true; wake_up(&devinfo->mbdata_resp_wait); } + } } diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index c5aa404069f..389656bd1a7 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ 
b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -9853,6 +9853,7 @@ static int ipw_wx_get_wireless_mode(struct net_device *dev, strncpy(extra, "unknown", MAX_WX_STRING); break; } + extra[MAX_WX_STRING - 1] = '\0'; IPW_DEBUG_WX("PRIV GET MODE: %s\n", extra); diff --git a/drivers/net/wireless/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/iwlwifi/mvm/mac80211.c index 0d6a8b768a6..7c8796584c2 100644 --- a/drivers/net/wireless/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/mvm/mac80211.c @@ -396,7 +396,8 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) else hw->wiphy->flags &= ~WIPHY_FLAG_PS_ON_BY_DEFAULT; - hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; + /* TODO: enable that only for firmwares that don't crash */ + /* hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; */ hw->wiphy->max_sched_scan_ssids = PROBE_OPTION_MAX; hw->wiphy->max_match_sets = IWL_SCAN_MAX_PROFILES; /* we create the 802.11 header and zero length SSID IE. */ diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index ef3026f46a3..d4eb8d2e9cb 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -165,6 +165,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */ u16 dealloc_ring[MAX_PENDING_REQS]; struct task_struct *dealloc_task; wait_queue_head_t dealloc_wq; + atomic_t inflight_packets; /* Use kthread for guest RX */ struct task_struct *task; @@ -329,4 +330,8 @@ extern unsigned int xenvif_max_queues; extern struct dentry *xen_netback_dbg_root; #endif +void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue, + struct sk_buff *skb); +void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue); + #endif /* __XEN_NETBACK__COMMON_H__ */ diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index bfd10cb9c8d..e29e15dca86 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -43,6 +43,23 @@ #define XENVIF_QUEUE_LENGTH 32 #define XENVIF_NAPI_WEIGHT 64 +/* This function is used to set SKBTX_DEV_ZEROCOPY as well as + * increasing the inflight counter. We need to increase the inflight + * counter because the core driver calls into xenvif_zerocopy_callback + * which calls xenvif_skb_zerocopy_complete.
+ */ +void xenvif_skb_zerocopy_prepare(struct xenvif_queue *queue, + struct sk_buff *skb) +{ + skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + atomic_inc(&queue->inflight_packets); +} + +void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue) +{ + atomic_dec(&queue->inflight_packets); +} + static inline void xenvif_stop_queue(struct xenvif_queue *queue) { struct net_device *dev = queue->vif->dev; @@ -524,9 +541,6 @@ int xenvif_init_queue(struct xenvif_queue *queue) init_timer(&queue->rx_stalled); - netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll, - XENVIF_NAPI_WEIGHT); - return 0; } @@ -560,6 +574,7 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref, init_waitqueue_head(&queue->wq); init_waitqueue_head(&queue->dealloc_wq); + atomic_set(&queue->inflight_packets, 0); if (tx_evtchn == rx_evtchn) { /* feature-split-event-channels == 0 */ @@ -614,6 +629,9 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref, wake_up_process(queue->task); wake_up_process(queue->dealloc_task); + netif_napi_add(queue->vif->dev, &queue->napi, xenvif_poll, + XENVIF_NAPI_WEIGHT); + return 0; err_rx_unbind: @@ -642,25 +660,6 @@ void xenvif_carrier_off(struct xenvif *vif) rtnl_unlock(); } -static void xenvif_wait_unmap_timeout(struct xenvif_queue *queue, - unsigned int worst_case_skb_lifetime) -{ - int i, unmap_timeout = 0; - - for (i = 0; i < MAX_PENDING_REQS; ++i) { - if (queue->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) { - unmap_timeout++; - schedule_timeout(msecs_to_jiffies(1000)); - if (unmap_timeout > worst_case_skb_lifetime && - net_ratelimit()) - netdev_err(queue->vif->dev, - "Page still granted! Index: %x\n", - i); - i = -1; - } - } -} - void xenvif_disconnect(struct xenvif *vif) { struct xenvif_queue *queue = NULL; @@ -672,6 +671,8 @@ void xenvif_disconnect(struct xenvif *vif) for (queue_index = 0; queue_index < num_queues; ++queue_index) { queue = &vif->queues[queue_index]; + netif_napi_del(&queue->napi); + if (queue->task) { del_timer_sync(&queue->rx_stalled); kthread_stop(queue->task); @@ -704,7 +705,6 @@ void xenvif_disconnect(struct xenvif *vif) void xenvif_deinit_queue(struct xenvif_queue *queue) { free_xenballooned_pages(MAX_PENDING_REQS, queue->mmap_pages); - netif_napi_del(&queue->napi); } void xenvif_free(struct xenvif *vif) @@ -712,21 +712,11 @@ void xenvif_free(struct xenvif *vif) struct xenvif_queue *queue = NULL; unsigned int num_queues = vif->num_queues; unsigned int queue_index; - /* Here we want to avoid timeout messages if an skb can be legitimately - * stuck somewhere else. Realistically this could be an another vif's - * internal or QDisc queue. That another vif also has this - * rx_drain_timeout_msecs timeout, so give it time to drain out. - * Although if that other guest wakes up just before its timeout happens - * and takes only one skb from QDisc, it can hold onto other skbs for a - * longer period. 
- */ - unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000); unregister_netdev(vif->dev); for (queue_index = 0; queue_index < num_queues; ++queue_index) { queue = &vif->queues[queue_index]; - xenvif_wait_unmap_timeout(queue, worst_case_skb_lifetime); xenvif_deinit_queue(queue); } diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 4734472aa62..08f65996534 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1525,10 +1525,12 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s /* remove traces of mapped pages and frag_list */ skb_frag_list_init(skb); uarg = skb_shinfo(skb)->destructor_arg; + /* increase inflight counter to offset decrement in callback */ + atomic_inc(&queue->inflight_packets); uarg->callback(uarg, true); skb_shinfo(skb)->destructor_arg = NULL; - skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + xenvif_skb_zerocopy_prepare(queue, nskb); kfree_skb(nskb); return 0; @@ -1589,7 +1591,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) if (net_ratelimit()) netdev_err(queue->vif->dev, "Not enough memory to consolidate frag_list!\n"); - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + xenvif_skb_zerocopy_prepare(queue, skb); kfree_skb(skb); continue; } @@ -1609,7 +1611,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) "Can't setup checksum in net_tx_action\n"); /* We have to set this flag to trigger the callback */ if (skb_shinfo(skb)->destructor_arg) - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + xenvif_skb_zerocopy_prepare(queue, skb); kfree_skb(skb); continue; } @@ -1641,7 +1643,7 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) * skb. E.g. the __pskb_pull_tail earlier can do such thing. */ if (skb_shinfo(skb)->destructor_arg) { - skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; + xenvif_skb_zerocopy_prepare(queue, skb); queue->stats.tx_zerocopy_sent++; } @@ -1681,6 +1683,7 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success) queue->stats.tx_zerocopy_success++; else queue->stats.tx_zerocopy_fail++; + xenvif_skb_zerocopy_complete(queue); } static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue) @@ -2058,15 +2061,24 @@ int xenvif_kthread_guest_rx(void *data) return 0; } +static bool xenvif_dealloc_kthread_should_stop(struct xenvif_queue *queue) +{ + /* Dealloc thread must remain running until all inflight + * packets complete. 
+ */ + return kthread_should_stop() && + !atomic_read(&queue->inflight_packets); +} + int xenvif_dealloc_kthread(void *data) { struct xenvif_queue *queue = data; - while (!kthread_should_stop()) { + for (;;) { wait_event_interruptible(queue->dealloc_wq, tx_dealloc_work_todo(queue) || - kthread_should_stop()); - if (kthread_should_stop()) + xenvif_dealloc_kthread_should_stop(queue)); + if (xenvif_dealloc_kthread_should_stop(queue)) break; xenvif_tx_dealloc_action(queue); diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 580517d857b..9c47b897b6d 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -116,6 +116,7 @@ static int xenvif_read_io_ring(struct seq_file *m, void *v) } #define XENVIF_KICK_STR "kick" +#define BUFFER_SIZE 32 static ssize_t xenvif_write_io_ring(struct file *filp, const char __user *buf, size_t count, @@ -124,22 +125,24 @@ xenvif_write_io_ring(struct file *filp, const char __user *buf, size_t count, struct xenvif_queue *queue = ((struct seq_file *)filp->private_data)->private; int len; - char write[sizeof(XENVIF_KICK_STR)]; + char write[BUFFER_SIZE]; /* don't allow partial writes and check the length */ if (*ppos != 0) return 0; - if (count < sizeof(XENVIF_KICK_STR) - 1) + if (count >= sizeof(write)) return -ENOSPC; len = simple_write_to_buffer(write, - sizeof(write), + sizeof(write) - 1, ppos, buf, count); if (len < 0) return len; + write[len] = '\0'; + if (!strncmp(write, XENVIF_KICK_STR, sizeof(XENVIF_KICK_STR) - 1)) xenvif_interrupt(0, (void *)queue); else { @@ -171,10 +174,9 @@ static const struct file_operations xenvif_dbg_io_ring_ops_fops = { .write = xenvif_write_io_ring, }; -static void xenvif_debugfs_addif(struct xenvif_queue *queue) +static void xenvif_debugfs_addif(struct xenvif *vif) { struct dentry *pfile; - struct xenvif *vif = queue->vif; int i; if (IS_ERR_OR_NULL(xen_netback_dbg_root)) @@ -733,10 +735,11 @@ static void connect(struct backend_info *be) be->vif->num_queues = queue_index; goto err; } + } + #ifdef CONFIG_DEBUG_FS - xenvif_debugfs_addif(queue); + xenvif_debugfs_addif(be->vif); #endif /* CONFIG_DEBUG_FS */ - } /* Initialisation completed, tell core driver the number of * active queues. 
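The xen-netback changes above all serve one invariant: the dealloc kthread must outlive every zerocopy packet whose pages are still granted to the frontend, because only that thread can unmap them. A minimal standalone sketch of this shutdown pattern follows, using only the generic kthread/atomic/waitqueue APIs; all demo_* names are illustrative and not part of the patch:

#include <linux/atomic.h>
#include <linux/kthread.h>
#include <linux/wait.h>

struct demo_queue {
	atomic_t inflight;		/* packets whose pages are still granted */
	wait_queue_head_t dealloc_wq;
};

static void demo_queue_init(struct demo_queue *q)
{
	atomic_set(&q->inflight, 0);
	init_waitqueue_head(&q->dealloc_wq);
}

static void demo_packet_start(struct demo_queue *q)
{
	atomic_inc(&q->inflight);	/* mirrors xenvif_skb_zerocopy_prepare() */
}

static void demo_packet_done(struct demo_queue *q)
{
	atomic_dec(&q->inflight);	/* mirrors xenvif_skb_zerocopy_complete() */
	wake_up(&q->dealloc_wq);	/* the real driver is woken via the dealloc
					 * ring; an explicit wake keeps this sketch
					 * self-contained */
}

static bool demo_work_todo(struct demo_queue *q)
{
	return false;			/* stand-in for tx_dealloc_work_todo() */
}

static bool demo_should_stop(struct demo_queue *q)
{
	/* Never exit while foreign pages may still be mapped. */
	return kthread_should_stop() && !atomic_read(&q->inflight);
}

static int demo_dealloc_thread(void *data)
{
	struct demo_queue *q = data;

	for (;;) {
		wait_event_interruptible(q->dealloc_wq,
					 demo_work_todo(q) ||
					 demo_should_stop(q));
		if (demo_should_stop(q))
			break;
		/* ... unmap completed grants here ... */
	}
	return 0;
}

The point of the conjunction is that kthread_stop() alone no longer lets the thread exit; this is why xenvif_skb_zerocopy_prepare()/complete() now bracket every skb that can reach xenvif_zerocopy_callback(), and why xenvif_handle_frag_list() takes an extra atomic_inc() to offset the decrement performed by its direct callback invocation.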
diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 2dcb0541012..5160c4eb73c 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -9,7 +9,8 @@ menu "Device Tree and Open Firmware support" config OF_SELFTEST bool "Device Tree Runtime self tests" - depends on OF_IRQ + depends on OF_IRQ && OF_EARLY_FLATTREE + select OF_DYNAMIC help This option builds in test cases for the device tree infrastructure that are executed once at boot time, and the results dumped to the diff --git a/drivers/of/Makefile b/drivers/of/Makefile index 099b1fb00af..2b6a7b129d1 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -1,11 +1,13 @@ obj-y = base.o device.o platform.o +obj-$(CONFIG_OF_DYNAMIC) += dynamic.o obj-$(CONFIG_OF_FLATTREE) += fdt.o obj-$(CONFIG_OF_EARLY_FLATTREE) += fdt_address.o obj-$(CONFIG_OF_PROMTREE) += pdt.o obj-$(CONFIG_OF_ADDRESS) += address.o obj-$(CONFIG_OF_IRQ) += irq.o obj-$(CONFIG_OF_NET) += of_net.o -obj-$(CONFIG_OF_SELFTEST) += selftest.o +obj-$(CONFIG_OF_SELFTEST) += of_selftest.o +of_selftest-objs := selftest.o testcase-data/testcases.dtb.o obj-$(CONFIG_OF_MDIO) += of_mdio.o obj-$(CONFIG_OF_PCI) += of_pci.o obj-$(CONFIG_OF_PCI_IRQ) += of_pci_irq.o diff --git a/drivers/of/base.c b/drivers/of/base.c index b9864806e9b..d8574adf0d6 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -17,6 +17,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include <linux/console.h> #include <linux/ctype.h> #include <linux/cpu.h> #include <linux/module.h> @@ -35,15 +36,17 @@ struct device_node *of_allnodes; EXPORT_SYMBOL(of_allnodes); struct device_node *of_chosen; struct device_node *of_aliases; -static struct device_node *of_stdout; +struct device_node *of_stdout; -static struct kset *of_kset; +struct kset *of_kset; /* - * Used to protect the of_aliases; but also overloaded to hold off addition of - * nodes to sysfs + * Used to protect the of_aliases, to hold off addition of nodes to sysfs. + * This mutex must be held whenever modifications are being made to the + * device tree. The of_{attach,detach}_node() and + * of_{add,remove,update}_property() helpers make sure this happens. */ -DEFINE_MUTEX(of_aliases_mutex); +DEFINE_MUTEX(of_mutex); /* use when traversing tree through the allnext, child, sibling, * or parent members of struct device_node. @@ -89,79 +92,7 @@ int __weak of_node_to_nid(struct device_node *np) } #endif -#if defined(CONFIG_OF_DYNAMIC) -/** - * of_node_get - Increment refcount of a node - * @node: Node to inc refcount, NULL is supported to - * simplify writing of callers - * - * Returns node. - */ -struct device_node *of_node_get(struct device_node *node) -{ - if (node) - kobject_get(&node->kobj); - return node; -} -EXPORT_SYMBOL(of_node_get); - -static inline struct device_node *kobj_to_device_node(struct kobject *kobj) -{ - return container_of(kobj, struct device_node, kobj); -} - -/** - * of_node_release - release a dynamically allocated node - * @kref: kref element of the node to be released - * - * In of_node_put() this function is passed to kref_put() - * as the destructor. - */ -static void of_node_release(struct kobject *kobj) -{ - struct device_node *node = kobj_to_device_node(kobj); - struct property *prop = node->properties; - - /* We should never be releasing nodes that haven't been detached. 
*/ - if (!of_node_check_flag(node, OF_DETACHED)) { - pr_err("ERROR: Bad of_node_put() on %s\n", node->full_name); - dump_stack(); - return; - } - - if (!of_node_check_flag(node, OF_DYNAMIC)) - return; - - while (prop) { - struct property *next = prop->next; - kfree(prop->name); - kfree(prop->value); - kfree(prop); - prop = next; - - if (!prop) { - prop = node->deadprops; - node->deadprops = NULL; - } - } - kfree(node->full_name); - kfree(node->data); - kfree(node); -} - -/** - * of_node_put - Decrement refcount of a node - * @node: Node to dec refcount, NULL is supported to - * simplify writing of callers - * - */ -void of_node_put(struct device_node *node) -{ - if (node) - kobject_put(&node->kobj); -} -EXPORT_SYMBOL(of_node_put); -#else +#ifndef CONFIG_OF_DYNAMIC static void of_node_release(struct kobject *kobj) { /* Without CONFIG_OF_DYNAMIC, no nodes gets freed */ @@ -200,13 +131,16 @@ static const char *safe_name(struct kobject *kobj, const char *orig_name) return name; } -static int __of_add_property_sysfs(struct device_node *np, struct property *pp) +int __of_add_property_sysfs(struct device_node *np, struct property *pp) { int rc; /* Important: Don't leak passwords */ bool secure = strncmp(pp->name, "security-", 9) == 0; + if (!of_kset || !of_node_is_attached(np)) + return 0; + sysfs_bin_attr_init(&pp->attr); pp->attr.attr.name = safe_name(&np->kobj, pp->name); pp->attr.attr.mode = secure ? S_IRUSR : S_IRUGO; @@ -218,12 +152,15 @@ static int __of_add_property_sysfs(struct device_node *np, struct property *pp) return rc; } -static int __of_node_add(struct device_node *np) +int __of_attach_node_sysfs(struct device_node *np) { const char *name; struct property *pp; int rc; + if (!of_kset) + return 0; + np->kobj.kset = of_kset; if (!np->parent) { /* Nodes without parents are new top level trees */ @@ -245,59 +182,20 @@ static int __of_node_add(struct device_node *np) return 0; } -int of_node_add(struct device_node *np) -{ - int rc = 0; - - BUG_ON(!of_node_is_initialized(np)); - - /* - * Grab the mutex here so that in a race condition between of_init() and - * of_node_add(), node addition will still be consistent. - */ - mutex_lock(&of_aliases_mutex); - if (of_kset) - rc = __of_node_add(np); - else - /* This scenario may be perfectly valid, but report it anyway */ - pr_info("of_node_add(%s) before of_init()\n", np->full_name); - mutex_unlock(&of_aliases_mutex); - return rc; -} - -#if defined(CONFIG_OF_DYNAMIC) -static void of_node_remove(struct device_node *np) -{ - struct property *pp; - - BUG_ON(!of_node_is_initialized(np)); - - /* only remove properties if on sysfs */ - if (of_node_is_attached(np)) { - for_each_property_of_node(np, pp) - sysfs_remove_bin_file(&np->kobj, &pp->attr); - kobject_del(&np->kobj); - } - - /* finally remove the kobj_init ref */ - of_node_put(np); -} -#endif - static int __init of_init(void) { struct device_node *np; /* Create the kset, and register existing nodes */ - mutex_lock(&of_aliases_mutex); + mutex_lock(&of_mutex); of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj); if (!of_kset) { - mutex_unlock(&of_aliases_mutex); + mutex_unlock(&of_mutex); return -ENOMEM; } for_each_of_allnodes(np) - __of_node_add(np); - mutex_unlock(&of_aliases_mutex); + __of_attach_node_sysfs(np); + mutex_unlock(&of_mutex); /* Symlink in /proc as required by userspace ABI */ if (of_allnodes) @@ -369,8 +267,8 @@ EXPORT_SYMBOL(of_find_all_nodes); * Find a property with a given name for a given node * and return the value. 
*/ -static const void *__of_get_property(const struct device_node *np, - const char *name, int *lenp) +const void *__of_get_property(const struct device_node *np, + const char *name, int *lenp) { struct property *pp = __of_find_property(np, name, lenp); @@ -1748,32 +1646,10 @@ int of_count_phandle_with_args(const struct device_node *np, const char *list_na } EXPORT_SYMBOL(of_count_phandle_with_args); -#if defined(CONFIG_OF_DYNAMIC) -static int of_property_notify(int action, struct device_node *np, - struct property *prop) -{ - struct of_prop_reconfig pr; - - /* only call notifiers if the node is attached */ - if (!of_node_is_attached(np)) - return 0; - - pr.dn = np; - pr.prop = prop; - return of_reconfig_notify(action, &pr); -} -#else -static int of_property_notify(int action, struct device_node *np, - struct property *prop) -{ - return 0; -} -#endif - /** * __of_add_property - Add a property to a node without lock operations */ -static int __of_add_property(struct device_node *np, struct property *prop) +int __of_add_property(struct device_node *np, struct property *prop) { struct property **next; @@ -1799,22 +1675,49 @@ int of_add_property(struct device_node *np, struct property *prop) unsigned long flags; int rc; - rc = of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop); - if (rc) - return rc; + mutex_lock(&of_mutex); raw_spin_lock_irqsave(&devtree_lock, flags); rc = __of_add_property(np, prop); raw_spin_unlock_irqrestore(&devtree_lock, flags); - if (rc) - return rc; - if (of_node_is_attached(np)) + if (!rc) __of_add_property_sysfs(np, prop); + mutex_unlock(&of_mutex); + + if (!rc) + of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop, NULL); + return rc; } +int __of_remove_property(struct device_node *np, struct property *prop) +{ + struct property **next; + + for (next = &np->properties; *next; next = &(*next)->next) { + if (*next == prop) + break; + } + if (*next == NULL) + return -ENODEV; + + /* found the node */ + *next = prop->next; + prop->next = np->deadprops; + np->deadprops = prop; + + return 0; +} + +void __of_remove_property_sysfs(struct device_node *np, struct property *prop) +{ + /* at early boot, bail here and defer setup to of_init() */ + if (of_kset && of_node_is_attached(np)) + sysfs_remove_bin_file(&np->kobj, &prop->attr); +} + /** * of_remove_property - Remove a property from a node. * @@ -1825,211 +1728,98 @@ int of_add_property(struct device_node *np, struct property *prop) */ int of_remove_property(struct device_node *np, struct property *prop) { - struct property **next; unsigned long flags; - int found = 0; int rc; - rc = of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop); - if (rc) - return rc; + mutex_lock(&of_mutex); raw_spin_lock_irqsave(&devtree_lock, flags); - next = &np->properties; - while (*next) { - if (*next == prop) { - /* found the node */ - *next = prop->next; - prop->next = np->deadprops; - np->deadprops = prop; - found = 1; - break; - } - next = &(*next)->next; - } + rc = __of_remove_property(np, prop); raw_spin_unlock_irqrestore(&devtree_lock, flags); - if (!found) - return -ENODEV; + if (!rc) + __of_remove_property_sysfs(np, prop); - /* at early boot, bail hear and defer setup to of_init() */ - if (!of_kset) - return 0; + mutex_unlock(&of_mutex); - sysfs_remove_bin_file(&np->kobj, &prop->attr); + if (!rc) + of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop, NULL); - return 0; + return rc; } -/* - * of_update_property - Update a property in a node, if the property does - * not exist, add it. 
- * - * Note that we don't actually remove it, since we have given out - * who-knows-how-many pointers to the data using get-property. - * Instead we just move the property to the "dead properties" list, - * and add the new property to the property list - */ -int of_update_property(struct device_node *np, struct property *newprop) +int __of_update_property(struct device_node *np, struct property *newprop, + struct property **oldpropp) { struct property **next, *oldprop; - unsigned long flags; - int rc; - - rc = of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop); - if (rc) - return rc; - if (!newprop->name) - return -EINVAL; + for (next = &np->properties; *next; next = &(*next)->next) { + if (of_prop_cmp((*next)->name, newprop->name) == 0) + break; + } + *oldpropp = oldprop = *next; - raw_spin_lock_irqsave(&devtree_lock, flags); - next = &np->properties; - oldprop = __of_find_property(np, newprop->name, NULL); - if (!oldprop) { - /* add the new node */ - rc = __of_add_property(np, newprop); - } else while (*next) { + if (oldprop) { /* replace the node */ - if (*next == oldprop) { - newprop->next = oldprop->next; - *next = newprop; - oldprop->next = np->deadprops; - np->deadprops = oldprop; - break; - } - next = &(*next)->next; + newprop->next = oldprop->next; + *next = newprop; + oldprop->next = np->deadprops; + np->deadprops = oldprop; + } else { + /* new node */ + newprop->next = NULL; + *next = newprop; } - raw_spin_unlock_irqrestore(&devtree_lock, flags); - if (rc) - return rc; + return 0; +} + +void __of_update_property_sysfs(struct device_node *np, struct property *newprop, + struct property *oldprop) +{ /* At early boot, bail out and defer setup to of_init() */ if (!of_kset) - return 0; + return; - /* Update the sysfs attribute */ if (oldprop) sysfs_remove_bin_file(&np->kobj, &oldprop->attr); __of_add_property_sysfs(np, newprop); - - return 0; } -#if defined(CONFIG_OF_DYNAMIC) /* - * Support for dynamic device trees. + * of_update_property - Update a property in a node, if the property does + * not exist, add it. * - * On some platforms, the device tree can be manipulated at runtime. - * The routines in this section support adding, removing and changing - * device tree nodes. - */ - -static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain); - -int of_reconfig_notifier_register(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&of_reconfig_chain, nb); -} -EXPORT_SYMBOL_GPL(of_reconfig_notifier_register); - -int of_reconfig_notifier_unregister(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&of_reconfig_chain, nb); -} -EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister); - -int of_reconfig_notify(unsigned long action, void *p) -{ - int rc; - - rc = blocking_notifier_call_chain(&of_reconfig_chain, action, p); - return notifier_to_errno(rc); -} - -/** - * of_attach_node - Plug a device node into the tree and global list. + * Note that we don't actually remove it, since we have given out + * who-knows-how-many pointers to the data using get-property. 
+ * Instead we just move the property to the "dead properties" list, + * and add the new property to the property list */ -int of_attach_node(struct device_node *np) +int of_update_property(struct device_node *np, struct property *newprop) { + struct property *oldprop; unsigned long flags; int rc; - rc = of_reconfig_notify(OF_RECONFIG_ATTACH_NODE, np); - if (rc) - return rc; - - raw_spin_lock_irqsave(&devtree_lock, flags); - np->sibling = np->parent->child; - np->allnext = np->parent->allnext; - np->parent->allnext = np; - np->parent->child = np; - of_node_clear_flag(np, OF_DETACHED); - raw_spin_unlock_irqrestore(&devtree_lock, flags); - - of_node_add(np); - return 0; -} - -/** - * of_detach_node - "Unplug" a node from the device tree. - * - * The caller must hold a reference to the node. The memory associated with - * the node is not freed until its refcount goes to zero. - */ -int of_detach_node(struct device_node *np) -{ - struct device_node *parent; - unsigned long flags; - int rc = 0; + if (!newprop->name) + return -EINVAL; - rc = of_reconfig_notify(OF_RECONFIG_DETACH_NODE, np); - if (rc) - return rc; + mutex_lock(&of_mutex); raw_spin_lock_irqsave(&devtree_lock, flags); + rc = __of_update_property(np, newprop, &oldprop); + raw_spin_unlock_irqrestore(&devtree_lock, flags); - if (of_node_check_flag(np, OF_DETACHED)) { - /* someone already detached it */ - raw_spin_unlock_irqrestore(&devtree_lock, flags); - return rc; - } - - parent = np->parent; - if (!parent) { - raw_spin_unlock_irqrestore(&devtree_lock, flags); - return rc; - } + if (!rc) + __of_update_property_sysfs(np, newprop, oldprop); - if (of_allnodes == np) - of_allnodes = np->allnext; - else { - struct device_node *prev; - for (prev = of_allnodes; - prev->allnext != np; - prev = prev->allnext) - ; - prev->allnext = np->allnext; - } + mutex_unlock(&of_mutex); - if (parent->child == np) - parent->child = np->sibling; - else { - struct device_node *prevsib; - for (prevsib = np->parent->child; - prevsib->sibling != np; - prevsib = prevsib->sibling) - ; - prevsib->sibling = np->sibling; - } - - of_node_set_flag(np, OF_DETACHED); - raw_spin_unlock_irqrestore(&devtree_lock, flags); + if (!rc) + of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop, oldprop); - of_node_remove(np); return rc; } -#endif /* defined(CONFIG_OF_DYNAMIC) */ static void of_alias_add(struct alias_prop *ap, struct device_node *np, int id, const char *stem, int stem_len) @@ -2062,9 +1852,12 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) of_chosen = of_find_node_by_path("/chosen@0"); if (of_chosen) { + /* linux,stdout-path and /aliases/stdout are for legacy compatibility */ const char *name = of_get_property(of_chosen, "stdout-path", NULL); if (!name) name = of_get_property(of_chosen, "linux,stdout-path", NULL); + if (IS_ENABLED(CONFIG_PPC) && !name) + name = of_get_property(of_aliases, "stdout", NULL); if (name) of_stdout = of_find_node_by_path(name); } @@ -2122,7 +1915,7 @@ int of_alias_get_id(struct device_node *np, const char *stem) struct alias_prop *app; int id = -ENODEV; - mutex_lock(&of_aliases_mutex); + mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (strcmp(app->stem, stem) != 0) continue; @@ -2132,7 +1925,7 @@ int of_alias_get_id(struct device_node *np, const char *stem) break; } } - mutex_unlock(&of_aliases_mutex); + mutex_unlock(&of_mutex); return id; } @@ -2180,20 +1973,22 @@ const char *of_prop_next_string(struct property *prop, const char *cur) EXPORT_SYMBOL_GPL(of_prop_next_string); /** - * 
of_device_is_stdout_path - check if a device node matches the - * linux,stdout-path property - * - * Check if this device node matches the linux,stdout-path property - * in the chosen node. return true if yes, false otherwise. + * of_console_check() - Test and setup console for DT setup + * @dn - Pointer to device node + * @name - Name to use for preferred console without index. ex. "ttyS" + * @index - Index to use for preferred console. + * + * Check if the given device node matches the stdout-path property in the + * /chosen node. If it does then register it as the preferred console and return + * TRUE. Otherwise return FALSE. */ -int of_device_is_stdout_path(struct device_node *dn) +bool of_console_check(struct device_node *dn, char *name, int index) { - if (!of_stdout) + if (!dn || dn != of_stdout || console_set_on_cmdline) return false; - - return of_stdout == dn; + return add_preferred_console(name, index, NULL); } -EXPORT_SYMBOL_GPL(of_device_is_stdout_path); +EXPORT_SYMBOL_GPL(of_console_check); /** * of_find_next_cache_node - Find a node's subsidiary cache diff --git a/drivers/of/device.c b/drivers/of/device.c index dafb9736ab9..46d6c75c140 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -160,7 +160,7 @@ void of_device_uevent(struct device *dev, struct kobj_uevent_env *env) add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen); seen = 0; - mutex_lock(&of_aliases_mutex); + mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (dev->of_node == app->np) { add_uevent_var(env, "OF_ALIAS_%d=%s", seen, @@ -168,7 +168,7 @@ void of_device_uevent(struct device *dev, struct kobj_uevent_env *env) seen++; } } - mutex_unlock(&of_aliases_mutex); + mutex_unlock(&of_mutex); } int of_device_uevent_modalias(struct device *dev, struct kobj_uevent_env *env) diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c new file mode 100644 index 00000000000..54fecc49a1f --- /dev/null +++ b/drivers/of/dynamic.c @@ -0,0 +1,660 @@ +/* + * Support for dynamic device trees. + * + * On some platforms, the device tree can be manipulated at runtime. + * The routines in this section support adding, removing and changing + * device tree nodes. + */ + +#include <linux/of.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/proc_fs.h> + +#include "of_private.h" + +/** + * of_node_get() - Increment refcount of a node + * @node: Node to inc refcount, NULL is supported to simplify writing of + * callers + * + * Returns node. 
+ */ +struct device_node *of_node_get(struct device_node *node) +{ + if (node) + kobject_get(&node->kobj); + return node; +} +EXPORT_SYMBOL(of_node_get); + +/** + * of_node_put() - Decrement refcount of a node + * @node: Node to dec refcount, NULL is supported to simplify writing of + * callers + */ +void of_node_put(struct device_node *node) +{ + if (node) + kobject_put(&node->kobj); +} +EXPORT_SYMBOL(of_node_put); + +void __of_detach_node_sysfs(struct device_node *np) +{ + struct property *pp; + + BUG_ON(!of_node_is_initialized(np)); + if (!of_kset) + return; + + /* only remove properties if on sysfs */ + if (of_node_is_attached(np)) { + for_each_property_of_node(np, pp) + sysfs_remove_bin_file(&np->kobj, &pp->attr); + kobject_del(&np->kobj); + } + + /* finally remove the kobj_init ref */ + of_node_put(np); +} + +static BLOCKING_NOTIFIER_HEAD(of_reconfig_chain); + +int of_reconfig_notifier_register(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&of_reconfig_chain, nb); +} +EXPORT_SYMBOL_GPL(of_reconfig_notifier_register); + +int of_reconfig_notifier_unregister(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&of_reconfig_chain, nb); +} +EXPORT_SYMBOL_GPL(of_reconfig_notifier_unregister); + +int of_reconfig_notify(unsigned long action, void *p) +{ + int rc; + + rc = blocking_notifier_call_chain(&of_reconfig_chain, action, p); + return notifier_to_errno(rc); +} + +int of_property_notify(int action, struct device_node *np, + struct property *prop, struct property *oldprop) +{ + struct of_prop_reconfig pr; + + /* only call notifiers if the node is attached */ + if (!of_node_is_attached(np)) + return 0; + + pr.dn = np; + pr.prop = prop; + pr.old_prop = oldprop; + return of_reconfig_notify(action, &pr); +} + +void __of_attach_node(struct device_node *np) +{ + const __be32 *phandle; + int sz; + + np->name = __of_get_property(np, "name", NULL) ? : "<NULL>"; + np->type = __of_get_property(np, "device_type", NULL) ? : "<NULL>"; + + phandle = __of_get_property(np, "phandle", &sz); + if (!phandle) + phandle = __of_get_property(np, "linux,phandle", &sz); + if (IS_ENABLED(CONFIG_PPC_PSERIES) && !phandle) + phandle = __of_get_property(np, "ibm,phandle", &sz); + np->phandle = (phandle && (sz >= 4)) ? be32_to_cpup(phandle) : 0; + + np->child = NULL; + np->sibling = np->parent->child; + np->allnext = np->parent->allnext; + np->parent->allnext = np; + np->parent->child = np; + of_node_clear_flag(np, OF_DETACHED); +} + +/** + * of_attach_node() - Plug a device node into the tree and global list.
+ */ +int of_attach_node(struct device_node *np) +{ + unsigned long flags; + + mutex_lock(&of_mutex); + raw_spin_lock_irqsave(&devtree_lock, flags); + __of_attach_node(np); + raw_spin_unlock_irqrestore(&devtree_lock, flags); + + __of_attach_node_sysfs(np); + mutex_unlock(&of_mutex); + + of_reconfig_notify(OF_RECONFIG_ATTACH_NODE, np); + + return 0; +} + +void __of_detach_node(struct device_node *np) +{ + struct device_node *parent; + + if (WARN_ON(of_node_check_flag(np, OF_DETACHED))) + return; + + parent = np->parent; + if (WARN_ON(!parent)) + return; + + if (of_allnodes == np) + of_allnodes = np->allnext; + else { + struct device_node *prev; + for (prev = of_allnodes; + prev->allnext != np; + prev = prev->allnext) + ; + prev->allnext = np->allnext; + } + + if (parent->child == np) + parent->child = np->sibling; + else { + struct device_node *prevsib; + for (prevsib = np->parent->child; + prevsib->sibling != np; + prevsib = prevsib->sibling) + ; + prevsib->sibling = np->sibling; + } + + of_node_set_flag(np, OF_DETACHED); +} + +/** + * of_detach_node() - "Unplug" a node from the device tree. + * + * The caller must hold a reference to the node. The memory associated with + * the node is not freed until its refcount goes to zero. + */ +int of_detach_node(struct device_node *np) +{ + unsigned long flags; + int rc = 0; + + mutex_lock(&of_mutex); + raw_spin_lock_irqsave(&devtree_lock, flags); + __of_detach_node(np); + raw_spin_unlock_irqrestore(&devtree_lock, flags); + + __of_detach_node_sysfs(np); + mutex_unlock(&of_mutex); + + of_reconfig_notify(OF_RECONFIG_DETACH_NODE, np); + + return rc; +} + +/** + * of_node_release() - release a dynamically allocated node + * @kref: kref element of the node to be released + * + * In of_node_put() this function is passed to kref_put() as the destructor. + */ +void of_node_release(struct kobject *kobj) +{ + struct device_node *node = kobj_to_device_node(kobj); + struct property *prop = node->properties; + + /* We should never be releasing nodes that haven't been detached. */ + if (!of_node_check_flag(node, OF_DETACHED)) { + pr_err("ERROR: Bad of_node_put() on %s\n", node->full_name); + dump_stack(); + return; + } + + if (!of_node_check_flag(node, OF_DYNAMIC)) + return; + + while (prop) { + struct property *next = prop->next; + kfree(prop->name); + kfree(prop->value); + kfree(prop); + prop = next; + + if (!prop) { + prop = node->deadprops; + node->deadprops = NULL; + } + } + kfree(node->full_name); + kfree(node->data); + kfree(node); +} + +/** + * __of_prop_dup - Copy a property dynamically. + * @prop: Property to copy + * @allocflags: Allocation flags (typically pass GFP_KERNEL) + * + * Copy a property by dynamically allocating the memory of both the + * property structure and the property name & contents. The property's + * flags have the OF_DYNAMIC bit set so that we can differentiate between + * dynamically allocated properties and not. + * Returns the newly allocated property or NULL on out of memory error. + */ +struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags) +{ + struct property *new; + + new = kzalloc(sizeof(*new), allocflags); + if (!new) + return NULL; + + /* + * NOTE: There is no check for zero length value. + * In case of a boolean property, this will allocate a value + * of zero bytes. We do this to work around the use + * of of_get_property() calls on boolean values.
+ */ + new->name = kstrdup(prop->name, allocflags); + new->value = kmemdup(prop->value, prop->length, allocflags); + new->length = prop->length; + if (!new->name || !new->value) + goto err_free; + + /* mark the property as dynamic */ + of_property_set_flag(new, OF_DYNAMIC); + + return new; + + err_free: + kfree(new->name); + kfree(new->value); + kfree(new); + return NULL; +} + +/** + * __of_node_alloc() - Create an empty device node dynamically. + * @full_name: Full name of the new device node + * @allocflags: Allocation flags (typically pass GFP_KERNEL) + * + * Create an empty device tree node, suitable for further modification. + * The node data are dynamically allocated and all the node flags + * have the OF_DYNAMIC & OF_DETACHED bits set. + * Returns the newly allocated node or NULL on out of memory error. + */ +struct device_node *__of_node_alloc(const char *full_name, gfp_t allocflags) +{ + struct device_node *node; + + node = kzalloc(sizeof(*node), allocflags); + if (!node) + return NULL; + + node->full_name = kstrdup(full_name, allocflags); + of_node_set_flag(node, OF_DYNAMIC); + of_node_set_flag(node, OF_DETACHED); + if (!node->full_name) + goto err_free; + + of_node_init(node); + + return node; + + err_free: + kfree(node->full_name); + kfree(node); + return NULL; +} + +static void __of_changeset_entry_destroy(struct of_changeset_entry *ce) +{ + of_node_put(ce->np); + list_del(&ce->node); + kfree(ce); +} + +#ifdef DEBUG +static void __of_changeset_entry_dump(struct of_changeset_entry *ce) +{ + switch (ce->action) { + case OF_RECONFIG_ADD_PROPERTY: + pr_debug("%p: %s %s/%s\n", + ce, "ADD_PROPERTY ", ce->np->full_name, + ce->prop->name); + break; + case OF_RECONFIG_REMOVE_PROPERTY: + pr_debug("%p: %s %s/%s\n", + ce, "REMOVE_PROPERTY", ce->np->full_name, + ce->prop->name); + break; + case OF_RECONFIG_UPDATE_PROPERTY: + pr_debug("%p: %s %s/%s\n", + ce, "UPDATE_PROPERTY", ce->np->full_name, + ce->prop->name); + break; + case OF_RECONFIG_ATTACH_NODE: + pr_debug("%p: %s %s\n", + ce, "ATTACH_NODE ", ce->np->full_name); + break; + case OF_RECONFIG_DETACH_NODE: + pr_debug("%p: %s %s\n", + ce, "DETACH_NODE ", ce->np->full_name); + break; + } +} +#else +static inline void __of_changeset_entry_dump(struct of_changeset_entry *ce) +{ + /* empty */ +} +#endif + +static void __of_changeset_entry_invert(struct of_changeset_entry *ce, + struct of_changeset_entry *rce) +{ + memcpy(rce, ce, sizeof(*rce)); + + switch (ce->action) { + case OF_RECONFIG_ATTACH_NODE: + rce->action = OF_RECONFIG_DETACH_NODE; + break; + case OF_RECONFIG_DETACH_NODE: + rce->action = OF_RECONFIG_ATTACH_NODE; + break; + case OF_RECONFIG_ADD_PROPERTY: + rce->action = OF_RECONFIG_REMOVE_PROPERTY; + break; + case OF_RECONFIG_REMOVE_PROPERTY: + rce->action = OF_RECONFIG_ADD_PROPERTY; + break; + case OF_RECONFIG_UPDATE_PROPERTY: + rce->old_prop = ce->prop; + rce->prop = ce->old_prop; + break; + } +} + +static void __of_changeset_entry_notify(struct of_changeset_entry *ce, bool revert) +{ + struct of_changeset_entry ce_inverted; + int ret; + + if (revert) { + __of_changeset_entry_invert(ce, &ce_inverted); + ce = &ce_inverted; + } + + switch (ce->action) { + case OF_RECONFIG_ATTACH_NODE: + case OF_RECONFIG_DETACH_NODE: + ret = of_reconfig_notify(ce->action, ce->np); + break; + case OF_RECONFIG_ADD_PROPERTY: + case OF_RECONFIG_REMOVE_PROPERTY: + case OF_RECONFIG_UPDATE_PROPERTY: + ret = of_property_notify(ce->action, ce->np, ce->prop, ce->old_prop); + break; + default: + pr_err("%s: invalid devicetree changeset action: %i\n", __func__, 
+ (int)ce->action); + return; + } + + if (ret) + pr_err("%s: notifier error @%s\n", __func__, ce->np->full_name); +} + +static int __of_changeset_entry_apply(struct of_changeset_entry *ce) +{ + struct property *old_prop, **propp; + unsigned long flags; + int ret = 0; + + __of_changeset_entry_dump(ce); + + raw_spin_lock_irqsave(&devtree_lock, flags); + switch (ce->action) { + case OF_RECONFIG_ATTACH_NODE: + __of_attach_node(ce->np); + break; + case OF_RECONFIG_DETACH_NODE: + __of_detach_node(ce->np); + break; + case OF_RECONFIG_ADD_PROPERTY: + /* If the property is in deadprops then it must be removed */ + for (propp = &ce->np->deadprops; *propp; propp = &(*propp)->next) { + if (*propp == ce->prop) { + *propp = ce->prop->next; + ce->prop->next = NULL; + break; + } + } + + ret = __of_add_property(ce->np, ce->prop); + if (ret) { + pr_err("%s: add_property failed @%s/%s\n", + __func__, ce->np->full_name, + ce->prop->name); + break; + } + break; + case OF_RECONFIG_REMOVE_PROPERTY: + ret = __of_remove_property(ce->np, ce->prop); + if (ret) { + pr_err("%s: remove_property failed @%s/%s\n", + __func__, ce->np->full_name, + ce->prop->name); + break; + } + break; + + case OF_RECONFIG_UPDATE_PROPERTY: + /* If the property is in deadprops then it must be removed */ + for (propp = &ce->np->deadprops; *propp; propp = &(*propp)->next) { + if (*propp == ce->prop) { + *propp = ce->prop->next; + ce->prop->next = NULL; + break; + } + } + + ret = __of_update_property(ce->np, ce->prop, &old_prop); + if (ret) { + pr_err("%s: update_property failed @%s/%s\n", + __func__, ce->np->full_name, + ce->prop->name); + break; + } + break; + default: + ret = -EINVAL; + } + raw_spin_unlock_irqrestore(&devtree_lock, flags); + + if (ret) + return ret; + + switch (ce->action) { + case OF_RECONFIG_ATTACH_NODE: + __of_attach_node_sysfs(ce->np); + break; + case OF_RECONFIG_DETACH_NODE: + __of_detach_node_sysfs(ce->np); + break; + case OF_RECONFIG_ADD_PROPERTY: + /* ignore duplicate names */ + __of_add_property_sysfs(ce->np, ce->prop); + break; + case OF_RECONFIG_REMOVE_PROPERTY: + __of_remove_property_sysfs(ce->np, ce->prop); + break; + case OF_RECONFIG_UPDATE_PROPERTY: + __of_update_property_sysfs(ce->np, ce->prop, ce->old_prop); + break; + } + + return 0; +} + +static inline int __of_changeset_entry_revert(struct of_changeset_entry *ce) +{ + struct of_changeset_entry ce_inverted; + + __of_changeset_entry_invert(ce, &ce_inverted); + return __of_changeset_entry_apply(&ce_inverted); +} + +/** + * of_changeset_init - Initialize a changeset for use + * + * @ocs: changeset pointer + * + * Initialize a changeset structure + */ +void of_changeset_init(struct of_changeset *ocs) +{ + memset(ocs, 0, sizeof(*ocs)); + INIT_LIST_HEAD(&ocs->entries); +} + +/** + * of_changeset_destroy - Destroy a changeset + * + * @ocs: changeset pointer + * + * Destroys a changeset. Note that if a changeset is applied, + * its changes to the tree cannot be reverted. + */ +void of_changeset_destroy(struct of_changeset *ocs) +{ + struct of_changeset_entry *ce, *cen; + + list_for_each_entry_safe_reverse(ce, cen, &ocs->entries, node) + __of_changeset_entry_destroy(ce); +} + +/** + * of_changeset_apply - Applies a changeset + * + * @ocs: changeset pointer + * + * Applies a changeset to the live tree. + * Any side-effects of live tree state changes are applied here on + * success, like creation/destruction of devices and side-effects + * like creation of sysfs properties and directories. + * Returns 0 on success, a negative error value in case of an error.
+ * On error the partially applied effects are reverted. + */ +int of_changeset_apply(struct of_changeset *ocs) +{ + struct of_changeset_entry *ce; + int ret; + + /* perform the rest of the work */ + pr_debug("of_changeset: applying...\n"); + list_for_each_entry(ce, &ocs->entries, node) { + ret = __of_changeset_entry_apply(ce); + if (ret) { + pr_err("%s: Error applying changeset (%d)\n", __func__, ret); + list_for_each_entry_continue_reverse(ce, &ocs->entries, node) + __of_changeset_entry_revert(ce); + return ret; + } + } + pr_debug("of_changeset: applied, emitting notifiers.\n"); + + /* drop the global lock while emitting notifiers */ + mutex_unlock(&of_mutex); + list_for_each_entry(ce, &ocs->entries, node) + __of_changeset_entry_notify(ce, 0); + mutex_lock(&of_mutex); + pr_debug("of_changeset: notifiers sent.\n"); + + return 0; +} + +/** + * of_changeset_revert - Reverts an applied changeset + * + * @ocs: changeset pointer + * + * Reverts a changeset returning the state of the tree to what it + * was before the application. + * Any side-effects like creation/destruction of devices and + * removal of sysfs properties and directories are applied. + * Returns 0 on success, a negative error value in case of an error. + */ +int of_changeset_revert(struct of_changeset *ocs) +{ + struct of_changeset_entry *ce; + int ret; + + pr_debug("of_changeset: reverting...\n"); + list_for_each_entry_reverse(ce, &ocs->entries, node) { + ret = __of_changeset_entry_revert(ce); + if (ret) { + pr_err("%s: Error reverting changeset (%d)\n", __func__, ret); + list_for_each_entry_continue(ce, &ocs->entries, node) + __of_changeset_entry_apply(ce); + return ret; + } + } + pr_debug("of_changeset: reverted, emitting notifiers.\n"); + + /* drop the global lock while emitting notifiers */ + mutex_unlock(&of_mutex); + list_for_each_entry_reverse(ce, &ocs->entries, node) + __of_changeset_entry_notify(ce, 1); + mutex_lock(&of_mutex); + pr_debug("of_changeset: notifiers sent.\n"); + + return 0; +} + +/** + * of_changeset_action - Perform a changeset action + * + * @ocs: changeset pointer + * @action: action to perform + * @np: Pointer to device node + * @prop: Pointer to property + * + * On action being one of: + * + OF_RECONFIG_ATTACH_NODE + * + OF_RECONFIG_DETACH_NODE, + * + OF_RECONFIG_ADD_PROPERTY + * + OF_RECONFIG_REMOVE_PROPERTY, + * + OF_RECONFIG_UPDATE_PROPERTY + * Returns 0 on success, a negative error value in case of an error. 
+ */ +int of_changeset_action(struct of_changeset *ocs, unsigned long action, + struct device_node *np, struct property *prop) +{ + struct of_changeset_entry *ce; + + ce = kzalloc(sizeof(*ce), GFP_KERNEL); + if (!ce) { + pr_err("%s: Failed to allocate\n", __func__); + return -ENOMEM; + } + /* get a reference to the node */ + ce->action = action; + ce->np = of_node_get(np); + ce->prop = prop; + + if (action == OF_RECONFIG_UPDATE_PROPERTY && prop) + ce->old_prop = of_find_property(np, prop->name, NULL); + + /* add it to the list */ + list_add_tail(&ce->node, &ocs->entries); + return 0; +} diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 9aa012e6ea0..f46a24ffa3f 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -923,24 +923,24 @@ int __init early_init_dt_scan_chosen(unsigned long node, const char *uname, } #ifdef CONFIG_HAVE_MEMBLOCK +#define MAX_PHYS_ADDR ((phys_addr_t)~0) + void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) { const u64 phys_offset = __pa(PAGE_OFFSET); base &= PAGE_MASK; size &= PAGE_MASK; - if (sizeof(phys_addr_t) < sizeof(u64)) { - if (base > ULONG_MAX) { - pr_warning("Ignoring memory block 0x%llx - 0x%llx\n", - base, base + size); - return; - } + if (base > MAX_PHYS_ADDR) { + pr_warning("Ignoring memory block 0x%llx - 0x%llx\n", + base, base + size); + return; + } - if (base + size > ULONG_MAX) { - pr_warning("Ignoring memory range 0x%lx - 0x%llx\n", - ULONG_MAX, base + size); - size = ULONG_MAX - base; - } + if (base + size > MAX_PHYS_ADDR) { + pr_warning("Ignoring memory range 0x%lx - 0x%llx\n", + ULONG_MAX, base + size); + size = MAX_PHYS_ADDR - base; } if (base + size < phys_offset) { diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index ff350c8fa7a..858e0a5d9a1 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -31,6 +31,63 @@ struct alias_prop { char stem[0]; }; -extern struct mutex of_aliases_mutex; +extern struct mutex of_mutex; extern struct list_head aliases_lookup; +extern struct kset *of_kset; + + +static inline struct device_node *kobj_to_device_node(struct kobject *kobj) +{ + return container_of(kobj, struct device_node, kobj); +} + +#if defined(CONFIG_OF_DYNAMIC) +extern int of_property_notify(int action, struct device_node *np, + struct property *prop, struct property *old_prop); +extern void of_node_release(struct kobject *kobj); +#else /* CONFIG_OF_DYNAMIC */ +static inline int of_property_notify(int action, struct device_node *np, + struct property *prop, struct property *old_prop) +{ + return 0; +} +#endif /* CONFIG_OF_DYNAMIC */ + +/** + * General utilities for working with live trees. + * + * All functions with two leading underscores operate + * without taking node references, so you either have to + * own the devtree lock or work on detached trees only. 
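
Taken together, of_changeset_init(), the of_changeset_add_property()/of_changeset_attach_node() helpers built on of_changeset_action(), of_changeset_apply() and of_changeset_destroy() give the usual consumer flow. A minimal sketch, not part of this patch — the node and property are hypothetical, and of_mutex is taken around the apply exactly as the selftest further down does:

	#include <linux/mutex.h>
	#include <linux/of.h>
	#include "of_private.h"	/* of_mutex; available to drivers/of/ code only */

	static int example_changeset_use(struct device_node *np,
					 struct property *newprop)
	{
		struct of_changeset ocs;
		int ret;

		of_changeset_init(&ocs);

		/* record the change; the live tree is untouched so far */
		ret = of_changeset_add_property(&ocs, np, newprop);
		if (!ret) {
			/* apply all recorded entries in one go */
			mutex_lock(&of_mutex);
			ret = of_changeset_apply(&ocs);
			mutex_unlock(&of_mutex);
		}

		/* free the entry bookkeeping; an applied changeset can no
		 * longer be reverted once destroyed */
		of_changeset_destroy(&ocs);
		return ret;
	}

A caller that intends to undo the changes later would keep the changeset alive and call of_changeset_revert() (again under of_mutex) before destroying it.
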
+ */
+struct property *__of_prop_dup(const struct property *prop, gfp_t allocflags);
+struct device_node *__of_node_alloc(const char *full_name, gfp_t allocflags);
+
+extern const void *__of_get_property(const struct device_node *np,
+ const char *name, int *lenp);
+extern int __of_add_property(struct device_node *np, struct property *prop);
+extern int __of_add_property_sysfs(struct device_node *np,
+ struct property *prop);
+extern int __of_remove_property(struct device_node *np, struct property *prop);
+extern void __of_remove_property_sysfs(struct device_node *np,
+ struct property *prop);
+extern int __of_update_property(struct device_node *np,
+ struct property *newprop, struct property **oldprop);
+extern void __of_update_property_sysfs(struct device_node *np,
+ struct property *newprop, struct property *oldprop);
+
+extern void __of_attach_node(struct device_node *np);
+extern int __of_attach_node_sysfs(struct device_node *np);
+extern void __of_detach_node(struct device_node *np);
+extern void __of_detach_node_sysfs(struct device_node *np);
+
+/* iterators for transactions, used for overlays */
+/* forward iterator */
+#define for_each_transaction_entry(_oft, _te) \
+ list_for_each_entry(_te, &(_oft)->te_list, node)
+
+/* reverse iterator */
+#define for_each_transaction_entry_reverse(_oft, _te) \
+ list_for_each_entry_reverse(_te, &(_oft)->te_list, node)
+
 #endif /* _LINUX_OF_PRIVATE_H */
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 632aae86137..59fb12e84e6 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -206,8 +206,16 @@ void __init fdt_init_reserved_mem(void)
 for (i = 0; i < reserved_mem_count; i++) {
 struct reserved_mem *rmem = &reserved_mem[i];
 unsigned long node = rmem->fdt_node;
+ int len;
+ const __be32 *prop;
 int err = 0;
 
+ prop = of_get_flat_dt_prop(node, "phandle", &len);
+ if (!prop)
+ prop = of_get_flat_dt_prop(node, "linux,phandle", &len);
+ if (prop)
+ rmem->phandle = of_read_number(prop, len/4);
+
 if (rmem->size == 0)
 err = __reserved_mem_alloc_size(node, rmem->name,
 &rmem->base, &rmem->size);
@@ -215,3 +223,65 @@ void __init fdt_init_reserved_mem(void)
 __reserved_mem_init_node(rmem);
 }
 }
+
+static inline struct reserved_mem *__find_rmem(struct device_node *node)
+{
+ unsigned int i;
+
+ if (!node->phandle)
+ return NULL;
+
+ for (i = 0; i < reserved_mem_count; i++)
+ if (reserved_mem[i].phandle == node->phandle)
+ return &reserved_mem[i];
+ return NULL;
+}
+
+/**
+ * of_reserved_mem_device_init() - assign reserved memory region to given device
+ *
+ * This function assigns the memory region pointed to by the "memory-region"
+ * device tree property to the given device.
+ */
+void of_reserved_mem_device_init(struct device *dev)
+{
+ struct reserved_mem *rmem;
+ struct device_node *np;
+
+ np = of_parse_phandle(dev->of_node, "memory-region", 0);
+ if (!np)
+ return;
+
+ rmem = __find_rmem(np);
+ of_node_put(np);
+
+ if (!rmem || !rmem->ops || !rmem->ops->device_init)
+ return;
+
+ rmem->ops->device_init(rmem, dev);
+ dev_info(dev, "assigned reserved memory node %s\n", rmem->name);
+}
+
+/**
+ * of_reserved_mem_device_release() - release reserved memory device structures
+ *
+ * This function releases structures allocated for memory region handling for
+ * the given device.
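
of_reserved_mem_device_init() is meant to be called from a driver's probe path once its node carries a "memory-region" phandle into /reserved-memory. A hedged sketch with made-up node names; the region's ops (for example those registered for "shared-dma-pool") are what supply device_init/device_release:

	#include <linux/platform_device.h>
	#include <linux/of_reserved_mem.h>

	/*
	 * Assumed (hypothetical) device tree shape:
	 *
	 *	reserved-memory {
	 *		mfc_mem: region@77000000 {
	 *			compatible = "shared-dma-pool";
	 *			reg = <0x77000000 0x1000000>;
	 *		};
	 *	};
	 *
	 *	codec@12000000 {
	 *		memory-region = <&mfc_mem>;
	 *	};
	 */
	static int example_probe(struct platform_device *pdev)
	{
		/* matches the phandle against the fdt-scanned table and
		 * calls the region's ops->device_init(), if any */
		of_reserved_mem_device_init(&pdev->dev);
		return 0;
	}

	static int example_remove(struct platform_device *pdev)
	{
		of_reserved_mem_device_release(&pdev->dev);
		return 0;
	}
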
+ */ +void of_reserved_mem_device_release(struct device *dev) +{ + struct reserved_mem *rmem; + struct device_node *np; + + np = of_parse_phandle(dev->of_node, "memory-region", 0); + if (!np) + return; + + rmem = __find_rmem(np); + of_node_put(np); + + if (!rmem || !rmem->ops || !rmem->ops->device_release) + return; + + rmem->ops->device_release(rmem, dev); +} diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 500436f9be7..0197725e033 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -422,6 +422,7 @@ static int of_platform_bus_create(struct device_node *bus, break; } } + of_node_set_flag(bus, OF_POPULATED_BUS); return rc; } @@ -508,19 +509,13 @@ EXPORT_SYMBOL_GPL(of_platform_populate); static int of_platform_device_destroy(struct device *dev, void *data) { - bool *children_left = data; - /* Do not touch devices not populated from the device tree */ - if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED)) { - *children_left = true; + if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED)) return 0; - } - /* Recurse, but don't touch this device if it has any children left */ - if (of_platform_depopulate(dev) != 0) { - *children_left = true; - return 0; - } + /* Recurse for any nodes that were treated as busses */ + if (of_node_check_flag(dev->of_node, OF_POPULATED_BUS)) + device_for_each_child(dev, NULL, of_platform_device_destroy); if (dev->bus == &platform_bus_type) platform_device_unregister(to_platform_device(dev)); @@ -528,19 +523,15 @@ static int of_platform_device_destroy(struct device *dev, void *data) else if (dev->bus == &amba_bustype) amba_device_unregister(to_amba_device(dev)); #endif - else { - *children_left = true; - return 0; - } of_node_clear_flag(dev->of_node, OF_POPULATED); - + of_node_clear_flag(dev->of_node, OF_POPULATED_BUS); return 0; } /** * of_platform_depopulate() - Remove devices populated from device tree - * @parent: device which childred will be removed + * @parent: device which children will be removed * * Complementary to of_platform_populate(), this function removes children * of the given device (and, recurrently, their children) that have been @@ -550,14 +541,9 @@ static int of_platform_device_destroy(struct device *dev, void *data) * Returns 0 when all children devices have been removed or * -EBUSY when some children remained. */ -int of_platform_depopulate(struct device *parent) +void of_platform_depopulate(struct device *parent) { - bool children_left = false; - - device_for_each_child(parent, &children_left, - of_platform_device_destroy); - - return children_left ? -EBUSY : 0; + device_for_each_child(parent, NULL, of_platform_device_destroy); } EXPORT_SYMBOL_GPL(of_platform_depopulate); diff --git a/drivers/of/selftest.c b/drivers/of/selftest.c index 077314eebb9..d4100266783 100644 --- a/drivers/of/selftest.c +++ b/drivers/of/selftest.c @@ -9,6 +9,7 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/of.h> +#include <linux/of_fdt.h> #include <linux/of_irq.h> #include <linux/of_platform.h> #include <linux/list.h> @@ -16,11 +17,17 @@ #include <linux/slab.h> #include <linux/device.h> +#include "of_private.h" + static struct selftest_results { int passed; int failed; } selftest_results; +#define NO_OF_NODES 2 +static struct device_node *nodes[NO_OF_NODES]; +static int last_node_index; + #define selftest(result, fmt, ...) 
{ \ if (!(result)) { \ selftest_results.failed++; \ @@ -266,6 +273,81 @@ static void __init of_selftest_property_match_string(void) selftest(rc == -EILSEQ, "unterminated string; rc=%i", rc); } +#define propcmp(p1, p2) (((p1)->length == (p2)->length) && \ + (p1)->value && (p2)->value && \ + !memcmp((p1)->value, (p2)->value, (p1)->length) && \ + !strcmp((p1)->name, (p2)->name)) +static void __init of_selftest_property_copy(void) +{ +#ifdef CONFIG_OF_DYNAMIC + struct property p1 = { .name = "p1", .length = 0, .value = "" }; + struct property p2 = { .name = "p2", .length = 5, .value = "abcd" }; + struct property *new; + + new = __of_prop_dup(&p1, GFP_KERNEL); + selftest(new && propcmp(&p1, new), "empty property didn't copy correctly\n"); + kfree(new->value); + kfree(new->name); + kfree(new); + + new = __of_prop_dup(&p2, GFP_KERNEL); + selftest(new && propcmp(&p2, new), "non-empty property didn't copy correctly\n"); + kfree(new->value); + kfree(new->name); + kfree(new); +#endif +} + +static void __init of_selftest_changeset(void) +{ +#ifdef CONFIG_OF_DYNAMIC + struct property *ppadd, padd = { .name = "prop-add", .length = 0, .value = "" }; + struct property *ppupdate, pupdate = { .name = "prop-update", .length = 5, .value = "abcd" }; + struct property *ppremove; + struct device_node *n1, *n2, *n21, *nremove, *parent; + struct of_changeset chgset; + + of_changeset_init(&chgset); + n1 = __of_node_alloc("/testcase-data/changeset/n1", GFP_KERNEL); + selftest(n1, "testcase setup failure\n"); + n2 = __of_node_alloc("/testcase-data/changeset/n2", GFP_KERNEL); + selftest(n2, "testcase setup failure\n"); + n21 = __of_node_alloc("/testcase-data/changeset/n2/n21", GFP_KERNEL); + selftest(n21, "testcase setup failure %p\n", n21); + nremove = of_find_node_by_path("/testcase-data/changeset/node-remove"); + selftest(nremove, "testcase setup failure\n"); + ppadd = __of_prop_dup(&padd, GFP_KERNEL); + selftest(ppadd, "testcase setup failure\n"); + ppupdate = __of_prop_dup(&pupdate, GFP_KERNEL); + selftest(ppupdate, "testcase setup failure\n"); + parent = nremove->parent; + n1->parent = parent; + n2->parent = parent; + n21->parent = n2; + n2->child = n21; + ppremove = of_find_property(parent, "prop-remove", NULL); + selftest(ppremove, "failed to find removal prop"); + + of_changeset_init(&chgset); + selftest(!of_changeset_attach_node(&chgset, n1), "fail attach n1\n"); + selftest(!of_changeset_attach_node(&chgset, n2), "fail attach n2\n"); + selftest(!of_changeset_detach_node(&chgset, nremove), "fail remove node\n"); + selftest(!of_changeset_attach_node(&chgset, n21), "fail attach n21\n"); + selftest(!of_changeset_add_property(&chgset, parent, ppadd), "fail add prop\n"); + selftest(!of_changeset_update_property(&chgset, parent, ppupdate), "fail update prop\n"); + selftest(!of_changeset_remove_property(&chgset, parent, ppremove), "fail remove prop\n"); + mutex_lock(&of_mutex); + selftest(!of_changeset_apply(&chgset), "apply failed\n"); + mutex_unlock(&of_mutex); + + mutex_lock(&of_mutex); + selftest(!of_changeset_revert(&chgset), "revert failed\n"); + mutex_unlock(&of_mutex); + + of_changeset_destroy(&chgset); +#endif +} + static void __init of_selftest_parse_interrupts(void) { struct device_node *np; @@ -517,9 +599,156 @@ static void __init of_selftest_platform_populate(void) } } +/** + * update_node_properties - adds the properties + * of np into dup node (present in live tree) and + * updates parent of children of np to dup. 
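
The __of_node_alloc()/__of_prop_dup() helpers the test exercises are also what changeset producers use to stage content before attaching it. A small sketch under the same assumptions as the selftest (hypothetical paths; in-tree code only, since the helpers live in of_private.h):

	#include <linux/of.h>
	#include "of_private.h"

	static struct device_node *example_build_detached(struct device_node *parent)
	{
		struct device_node *n, *child;

		/* nodes come back flagged OF_DYNAMIC | OF_DETACHED */
		n = __of_node_alloc("/example/n1", GFP_KERNEL);
		if (!n)
			return NULL;

		child = __of_node_alloc("/example/n1/n11", GFP_KERNEL);
		if (!child) {
			of_node_put(n);
			return NULL;
		}

		/* wire the hierarchy by hand, as the selftest does, before
		 * handing the nodes to of_changeset_attach_node() */
		n->parent = parent;
		child->parent = n;
		n->child = child;
		return n;
	}
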
+ * + * @np: node already present in live tree + * @dup: node present in live tree to be updated + */ +static void update_node_properties(struct device_node *np, + struct device_node *dup) +{ + struct property *prop; + struct device_node *child; + + for_each_property_of_node(np, prop) + of_add_property(dup, prop); + + for_each_child_of_node(np, child) + child->parent = dup; +} + +/** + * attach_node_and_children - attaches nodes + * and its children to live tree + * + * @np: Node to attach to live tree + */ +static int attach_node_and_children(struct device_node *np) +{ + struct device_node *next, *root = np, *dup; + + if (!np) { + pr_warn("%s: No tree to attach; not running tests\n", + __func__); + return -ENODATA; + } + + + /* skip root node */ + np = np->child; + /* storing a copy in temporary node */ + dup = np; + + while (dup) { + nodes[last_node_index++] = dup; + dup = dup->sibling; + } + dup = NULL; + + while (np) { + next = np->allnext; + dup = of_find_node_by_path(np->full_name); + if (dup) + update_node_properties(np, dup); + else { + np->child = NULL; + if (np->parent == root) + np->parent = of_allnodes; + of_attach_node(np); + } + np = next; + } + + return 0; +} + +/** + * selftest_data_add - Reads, copies data from + * linked tree and attaches it to the live tree + */ +static int __init selftest_data_add(void) +{ + void *selftest_data; + struct device_node *selftest_data_node; + extern uint8_t __dtb_testcases_begin[]; + extern uint8_t __dtb_testcases_end[]; + const int size = __dtb_testcases_end - __dtb_testcases_begin; + + if (!size || !of_allnodes) { + pr_warn("%s: No testcase data to attach; not running tests\n", + __func__); + return -ENODATA; + } + + /* creating copy */ + selftest_data = kmemdup(__dtb_testcases_begin, size, GFP_KERNEL); + + if (!selftest_data) { + pr_warn("%s: Failed to allocate memory for selftest_data; " + "not running tests\n", __func__); + return -ENOMEM; + } + of_fdt_unflatten_tree(selftest_data, &selftest_data_node); + + /* attach the sub-tree to live tree */ + return attach_node_and_children(selftest_data_node); +} + +/** + * detach_node_and_children - detaches node + * and its children from live tree + * + * @np: Node to detach from live tree + */ +static void detach_node_and_children(struct device_node *np) +{ + while (np->child) + detach_node_and_children(np->child); + + while (np->sibling) + detach_node_and_children(np->sibling); + + of_detach_node(np); +} + +/** + * selftest_data_remove - removes the selftest data + * nodes from the live tree + */ +static void selftest_data_remove(void) +{ + struct device_node *np; + struct property *prop; + + while (last_node_index >= 0) { + if (nodes[last_node_index]) { + np = of_find_node_by_path(nodes[last_node_index]->full_name); + if (strcmp(np->full_name, "/aliases") != 0) { + detach_node_and_children(np->child); + of_detach_node(np); + } else { + for_each_property_of_node(np, prop) { + if (strcmp(prop->name, "testcase-alias") == 0) + of_remove_property(np, prop); + } + } + } + last_node_index--; + } +} + static int __init of_selftest(void) { struct device_node *np; + int res; + + /* adding data for selftest */ + res = selftest_data_add(); + if (res) + return res; np = of_find_node_by_path("/testcase-data/phandle-tests/consumer-a"); if (!np) { @@ -533,12 +762,18 @@ static int __init of_selftest(void) of_selftest_dynamic(); of_selftest_parse_phandle_with_args(); of_selftest_property_match_string(); + of_selftest_property_copy(); + of_selftest_changeset(); of_selftest_parse_interrupts(); 
of_selftest_parse_interrupts_extended(); of_selftest_match_node(); of_selftest_platform_populate(); pr_info("end of selftest - %i passed, %i failed\n", selftest_results.passed, selftest_results.failed); + + /* removing selftest data from live tree */ + selftest_data_remove(); + return 0; } late_initcall(of_selftest); diff --git a/drivers/of/testcase-data/testcases.dts b/drivers/of/testcase-data/testcases.dts new file mode 100644 index 00000000000..219ef9324e9 --- /dev/null +++ b/drivers/of/testcase-data/testcases.dts @@ -0,0 +1,15 @@ +/dts-v1/; +/ { + testcase-data { + changeset { + prop-update = "hello"; + prop-remove = "world"; + node-remove { + }; + }; + }; +}; +#include "tests-phandle.dtsi" +#include "tests-interrupts.dtsi" +#include "tests-match.dtsi" +#include "tests-platform.dtsi" diff --git a/drivers/of/testcase-data/testcases.dtsi b/drivers/of/testcase-data/testcases.dtsi deleted file mode 100644 index 6d8d980ac85..00000000000 --- a/drivers/of/testcase-data/testcases.dtsi +++ /dev/null @@ -1,4 +0,0 @@ -#include "tests-phandle.dtsi" -#include "tests-interrupts.dtsi" -#include "tests-match.dtsi" -#include "tests-platform.dtsi" diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index 93aa29f6d39..f2945fa73d4 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -375,11 +375,11 @@ static void __exit cleanup_slots(void) static int __init rpaphp_init(void) { - struct device_node *dn = NULL; + struct device_node *dn; info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); - while ((dn = of_find_node_by_name(dn, "pci"))) + for_each_node_by_name(dn, "pci") rpaphp_add_slot(dn); return 0; diff --git a/drivers/scsi/cxgbi/cxgb3i/Kconfig b/drivers/scsi/cxgbi/cxgb3i/Kconfig index 6bbc36fbd6e..e4603985dce 100644 --- a/drivers/scsi/cxgbi/cxgb3i/Kconfig +++ b/drivers/scsi/cxgbi/cxgb3i/Kconfig @@ -1,6 +1,6 @@ config SCSI_CXGB3_ISCSI tristate "Chelsio T3 iSCSI support" - depends on PCI && INET + depends on PCI && INET && (IPV6 || IPV6=n) select NETDEVICES select ETHERNET select NET_VENDOR_CHELSIO diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig index 16b2c7d2661..8c4e423037b 100644 --- a/drivers/scsi/cxgbi/cxgb4i/Kconfig +++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig @@ -1,6 +1,6 @@ config SCSI_CXGB4_ISCSI tristate "Chelsio T4 iSCSI support" - depends on PCI && INET + depends on PCI && INET && (IPV6 || IPV6=n) select NETDEVICES select ETHERNET select NET_VENDOR_CHELSIO diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 43fea2219f8..ae45bd99bae 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -472,7 +472,8 @@ static void __srp_start_tl_fail_timers(struct srp_rport *rport) if (delay > 0) queue_delayed_work(system_long_wq, &rport->reconnect_work, 1UL * delay * HZ); - if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { + if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) && + srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev), rport->state); scsi_target_block(&shost->shost_gendev); diff --git a/drivers/tty/ehv_bytechan.c b/drivers/tty/ehv_bytechan.c index 0419b69e270..4f485e88f60 100644 --- a/drivers/tty/ehv_bytechan.c +++ b/drivers/tty/ehv_bytechan.c @@ -108,55 +108,23 @@ static void disable_tx_interrupt(struct ehv_bc_data *bc) * * The byte channel to be used for the console is specified via a "stdout" * property in the /chosen node. 
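
The rpaphp hunk above (and the pmac_zilog conversion further down) are instances of the same cleanup: for_each_node_by_name() expands to the open-coded of_find_node_by_name() loop and manages the iteration references itself. A short sketch of the pattern, with a made-up use:

	#include <linux/kernel.h>
	#include <linux/of.h>

	static void example_count_escc(void)
	{
		struct device_node *dn;
		int count = 0;

		/* each step drops the reference on the previous node; only
		 * breaking out of the loop early leaves a reference that the
		 * caller must release with of_node_put(dn) */
		for_each_node_by_name(dn, "escc")
			count++;

		pr_info("found %d escc nodes\n", count);
	}
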
- * - * For compatible with legacy device trees, we also look for a "stdout" alias. */ static int find_console_handle(void) { - struct device_node *np, *np2; + struct device_node *np = of_stdout; const char *sprop = NULL; const uint32_t *iprop; - np = of_find_node_by_path("/chosen"); - if (np) - sprop = of_get_property(np, "stdout-path", NULL); - - if (!np || !sprop) { - of_node_put(np); - np = of_find_node_by_name(NULL, "aliases"); - if (np) - sprop = of_get_property(np, "stdout", NULL); - } - - if (!sprop) { - of_node_put(np); - return 0; - } - /* We don't care what the aliased node is actually called. We only * care if it's compatible with "epapr,hv-byte-channel", because that - * indicates that it's a byte channel node. We use a temporary - * variable, 'np2', because we can't release 'np' until we're done with - * 'sprop'. + * indicates that it's a byte channel node. */ - np2 = of_find_node_by_path(sprop); - of_node_put(np); - np = np2; - if (!np) { - pr_warning("ehv-bc: stdout node '%s' does not exist\n", sprop); - return 0; - } - - /* Is it a byte channel? */ - if (!of_device_is_compatible(np, "epapr,hv-byte-channel")) { - of_node_put(np); + if (!np || !of_device_is_compatible(np, "epapr,hv-byte-channel")) return 0; - } stdout_irq = irq_of_parse_and_map(np, 0); if (stdout_irq == NO_IRQ) { - pr_err("ehv-bc: no 'interrupts' property in %s node\n", sprop); - of_node_put(np); + pr_err("ehv-bc: no 'interrupts' property in %s node\n", np->full_name); return 0; } @@ -167,12 +135,9 @@ static int find_console_handle(void) if (!iprop) { pr_err("ehv-bc: no 'hv-handle' property in %s node\n", np->name); - of_node_put(np); return 0; } stdout_bc = be32_to_cpu(*iprop); - - of_node_put(np); return 1; } diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c index a585079b4b3..a2cc5f834c6 100644 --- a/drivers/tty/hvc/hvc_opal.c +++ b/drivers/tty/hvc/hvc_opal.c @@ -342,22 +342,13 @@ static void udbg_init_opal_common(void) void __init hvc_opal_init_early(void) { - struct device_node *stdout_node = NULL; + struct device_node *stdout_node = of_node_get(of_stdout); const __be32 *termno; - const char *name = NULL; const struct hv_ops *ops; u32 index; - /* find the boot console from /chosen/stdout */ - if (of_chosen) - name = of_get_property(of_chosen, "linux,stdout-path", NULL); - if (name) { - stdout_node = of_find_node_by_path(name); - if (!stdout_node) { - pr_err("hvc_opal: Failed to locate default console!\n"); - return; - } - } else { + /* If the console wasn't in /chosen, try /ibm,opal */ + if (!stdout_node) { struct device_node *opal, *np; /* Current OPAL takeover doesn't provide the stdout diff --git a/drivers/tty/hvc/hvc_vio.c b/drivers/tty/hvc/hvc_vio.c index b594abfbf21..5618b5fc750 100644 --- a/drivers/tty/hvc/hvc_vio.c +++ b/drivers/tty/hvc/hvc_vio.c @@ -404,42 +404,35 @@ module_exit(hvc_vio_exit); void __init hvc_vio_init_early(void) { - struct device_node *stdout_node; const __be32 *termno; const char *name; const struct hv_ops *ops; /* find the boot console from /chosen/stdout */ - if (!of_chosen) + if (!of_stdout) return; - name = of_get_property(of_chosen, "linux,stdout-path", NULL); - if (name == NULL) - return; - stdout_node = of_find_node_by_path(name); - if (!stdout_node) - return; - name = of_get_property(stdout_node, "name", NULL); + name = of_get_property(of_stdout, "name", NULL); if (!name) { printk(KERN_WARNING "stdout node missing 'name' property!\n"); - goto out; + return; } /* Check if it's a virtual terminal */ if (strncmp(name, "vty", 3) != 0) - goto out; - 
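
These console drivers now share one idiom: rather than parsing /chosen by hand, they inspect of_stdout, which the OF core resolves once at boot (from "stdout-path", with fallbacks for the legacy properties). The same idiom as a hedged sketch, using a made-up compatible string:

	#include <linux/of.h>

	static bool example_is_our_console(void)
	{
		return of_stdout &&
		       of_device_is_compatible(of_stdout, "acme,uart");
	}
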
termno = of_get_property(stdout_node, "reg", NULL); + return; + termno = of_get_property(of_stdout, "reg", NULL); if (termno == NULL) - goto out; + return; hvterm_priv0.termno = of_read_number(termno, 1); spin_lock_init(&hvterm_priv0.buf_lock); hvterm_privs[0] = &hvterm_priv0; /* Check the protocol */ - if (of_device_is_compatible(stdout_node, "hvterm1")) { + if (of_device_is_compatible(of_stdout, "hvterm1")) { hvterm_priv0.proto = HV_PROTOCOL_RAW; ops = &hvterm_raw_ops; } - else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) { + else if (of_device_is_compatible(of_stdout, "hvterm-protocol")) { hvterm_priv0.proto = HV_PROTOCOL_HVSI; ops = &hvterm_hvsi_ops; hvsilib_init(&hvterm_priv0.hvsi, hvc_get_chars, hvc_put_chars, @@ -447,7 +440,7 @@ void __init hvc_vio_init_early(void) /* HVSI, perform the handshake now */ hvsilib_establish(&hvterm_priv0.hvsi); } else - goto out; + return; udbg_putc = udbg_hvc_putc; udbg_getc = udbg_hvc_getc; udbg_getc_poll = udbg_hvc_getc_poll; @@ -456,14 +449,12 @@ void __init hvc_vio_init_early(void) * backend for HVSI, only do udbg */ if (hvterm_priv0.proto == HV_PROTOCOL_HVSI) - goto out; + return; #endif /* Check whether the user has requested a different console. */ if (!strstr(cmd_line, "console=")) add_preferred_console("hvc", 0, NULL); hvc_instantiate(0, 0, ops); -out: - of_node_put(stdout_node); } /* call this from early_init() for a working debug console on diff --git a/drivers/tty/serial/pmac_zilog.c b/drivers/tty/serial/pmac_zilog.c index f7ad5b90305..abbfedb8490 100644 --- a/drivers/tty/serial/pmac_zilog.c +++ b/drivers/tty/serial/pmac_zilog.c @@ -1653,8 +1653,7 @@ static int __init pmz_probe(void) /* * Find all escc chips in the system */ - node_p = of_find_node_by_name(NULL, "escc"); - while (node_p) { + for_each_node_by_name(node_p, "escc") { /* * First get channel A/B node pointers * @@ -1672,7 +1671,7 @@ static int __init pmz_probe(void) of_node_put(node_b); printk(KERN_ERR "pmac_zilog: missing node %c for escc %s\n", (!node_a) ? 
'a' : 'b', node_p->full_name); - goto next; + continue; } /* @@ -1699,11 +1698,9 @@ static int __init pmz_probe(void) of_node_put(node_b); memset(&pmz_ports[count], 0, sizeof(struct uart_pmac_port)); memset(&pmz_ports[count+1], 0, sizeof(struct uart_pmac_port)); - goto next; + continue; } count += 2; -next: - node_p = of_find_node_by_name(node_p, "escc"); } pmz_ports_count = count; diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 8bb19da0163..29a7be47389 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/console.h> +#include <linux/of.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/device.h> @@ -2611,6 +2612,8 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport) spin_lock_init(&uport->lock); lockdep_set_class(&uport->lock, &port_lock_key); } + if (uport->cons && uport->dev) + of_console_check(uport->dev->of_node, uport->cons->name, uport->line); uart_configure_port(drv, state, uport); diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index af7b204b921..d8c57636b9c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -8,11 +8,17 @@ config VFIO_IOMMU_SPAPR_TCE depends on VFIO && SPAPR_TCE_IOMMU default n +config VFIO_SPAPR_EEH + tristate + depends on EEH && VFIO_IOMMU_SPAPR_TCE + default n + menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) + select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES) select ANON_INODES help VFIO provides a framework for secure userspace device drivers. diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 50e30bc75e8..0b035b12600 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_VFIO) += vfio.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o -obj-$(CONFIG_EEH) += vfio_spapr_eeh.o +obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index e2ee80f36e3..f7825332a32 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -37,6 +37,10 @@ module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(nointxmask, "Disable support for PCI 2.3 style INTx masking. 
If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); +static DEFINE_MUTEX(driver_lock); + +static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); + static int vfio_pci_enable(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -44,6 +48,9 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) u16 cmd; u8 msix_pos; + /* Don't allow our initial saved state to include busmaster */ + pci_clear_master(pdev); + ret = pci_enable_device(pdev); if (ret) return ret; @@ -99,7 +106,8 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) struct pci_dev *pdev = vdev->pdev; int bar; - pci_disable_device(pdev); + /* Stop the device from further DMA */ + pci_clear_master(pdev); vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, @@ -117,6 +125,8 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) vdev->barmap[bar] = NULL; } + vdev->needs_reset = true; + /* * If we have saved state, restore it. If we can reset the device, * even better. Resetting with current state seems better than @@ -128,7 +138,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) __func__, dev_name(&pdev->dev)); if (!vdev->reset_works) - return; + goto out; pci_save_state(pdev); } @@ -148,46 +158,55 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) if (ret) pr_warn("%s: Failed to reset device %s (%d)\n", __func__, dev_name(&pdev->dev), ret); + else + vdev->needs_reset = false; } pci_restore_state(pdev); +out: + pci_disable_device(pdev); + + vfio_pci_try_bus_reset(vdev); } static void vfio_pci_release(void *device_data) { struct vfio_pci_device *vdev = device_data; - if (atomic_dec_and_test(&vdev->refcnt)) { + mutex_lock(&driver_lock); + + if (!(--vdev->refcnt)) { vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); } + mutex_unlock(&driver_lock); + module_put(THIS_MODULE); } static int vfio_pci_open(void *device_data) { struct vfio_pci_device *vdev = device_data; - int ret; + int ret = 0; if (!try_module_get(THIS_MODULE)) return -ENODEV; - if (atomic_inc_return(&vdev->refcnt) == 1) { + mutex_lock(&driver_lock); + + if (!vdev->refcnt) { ret = vfio_pci_enable(vdev); if (ret) goto error; - ret = vfio_spapr_pci_eeh_open(vdev->pdev); - if (ret) { - vfio_pci_disable(vdev); - goto error; - } + vfio_spapr_pci_eeh_open(vdev->pdev); } - - return 0; + vdev->refcnt++; error: - module_put(THIS_MODULE); + mutex_unlock(&driver_lock); + if (ret) + module_put(THIS_MODULE); return ret; } @@ -843,7 +862,6 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock); - atomic_set(&vdev->refcnt, 0); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { @@ -858,12 +876,15 @@ static void vfio_pci_remove(struct pci_dev *pdev) { struct vfio_pci_device *vdev; + mutex_lock(&driver_lock); + vdev = vfio_del_group_dev(&pdev->dev); - if (!vdev) - return; + if (vdev) { + iommu_group_put(pdev->dev.iommu_group); + kfree(vdev); + } - iommu_group_put(pdev->dev.iommu_group); - kfree(vdev); + mutex_unlock(&driver_lock); } static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, @@ -906,6 +927,110 @@ static struct pci_driver vfio_pci_driver = { .err_handler = &vfio_err_handlers, }; +/* + * Test whether a reset is necessary and possible. 
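
The refcnt rework above is what makes the reset logic below possible: with a plain int guarded by driver_lock, the count can be tested by the bus-reset scan atomically with respect to open/release, which an atomic_t alone could not guarantee. The pattern in isolation, as a hypothetical sketch (not vfio code):

	#include <linux/mutex.h>
	#include <linux/types.h>

	struct example_device {
		int refcnt;		/* guarded by example_lock */
		bool needs_reset;
	};

	static DEFINE_MUTEX(example_lock);

	static int example_first_open(struct example_device *edev)
	{
		return 0;	/* one-time enable work would go here */
	}

	static int example_open(struct example_device *edev)
	{
		int ret = 0;

		mutex_lock(&example_lock);
		if (!edev->refcnt)
			ret = example_first_open(edev);
		if (!ret)
			edev->refcnt++;
		mutex_unlock(&example_lock);
		return ret;
	}

	static void example_release(struct example_device *edev)
	{
		mutex_lock(&example_lock);
		if (!--edev->refcnt)
			edev->needs_reset = true;	/* seen consistently by
							 * any scan under the
							 * same lock */
		mutex_unlock(&example_lock);
	}
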
We mark devices as + * needs_reset when they are released, but don't have a function-local reset + * available. If any of these exist in the affected devices, we want to do + * a bus/slot reset. We also need all of the affected devices to be unused, + * so we abort if any device has a non-zero refcnt. driver_lock prevents a + * device from being opened during the scan or unbound from vfio-pci. + */ +static int vfio_pci_test_bus_reset(struct pci_dev *pdev, void *data) +{ + bool *needs_reset = data; + struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); + int ret = -EBUSY; + + if (pci_drv == &vfio_pci_driver) { + struct vfio_device *device; + struct vfio_pci_device *vdev; + + device = vfio_device_get_from_dev(&pdev->dev); + if (!device) + return ret; + + vdev = vfio_device_data(device); + if (vdev) { + if (vdev->needs_reset) + *needs_reset = true; + + if (!vdev->refcnt) + ret = 0; + } + + vfio_device_put(device); + } + + /* + * TODO: vfio-core considers groups to be viable even if some devices + * are attached to known drivers, like pci-stub or pcieport. We can't + * freeze devices from being unbound to those drivers like we can + * here though, so it would be racy to test for them. We also can't + * use device_lock() to prevent changes as that would interfere with + * PCI-core taking device_lock during bus reset. For now, we require + * devices to be bound to vfio-pci to get a bus/slot reset on release. + */ + + return ret; +} + +/* Clear needs_reset on all affected devices after successful bus/slot reset */ +static int vfio_pci_clear_needs_reset(struct pci_dev *pdev, void *data) +{ + struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); + + if (pci_drv == &vfio_pci_driver) { + struct vfio_device *device; + struct vfio_pci_device *vdev; + + device = vfio_device_get_from_dev(&pdev->dev); + if (!device) + return 0; + + vdev = vfio_device_data(device); + if (vdev) + vdev->needs_reset = false; + + vfio_device_put(device); + } + + return 0; +} + +/* + * Attempt to do a bus/slot reset if there are devices affected by a reset for + * this device that are needs_reset and all of the affected devices are unused + * (!refcnt). Callers of this function are required to hold driver_lock such + * that devices can not be unbound from vfio-pci or opened by a user while we + * test for and perform a bus/slot reset. 
+ */ +static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) +{ + bool needs_reset = false, slot = false; + int ret; + + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return; + + if (vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_test_bus_reset, + &needs_reset, slot) || !needs_reset) + return; + + if (slot) + ret = pci_try_reset_slot(vdev->pdev->slot); + else + ret = pci_try_reset_bus(vdev->pdev->bus); + + if (ret) + return; + + vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_clear_needs_reset, NULL, slot); +} + static void __exit vfio_pci_cleanup(void) { pci_unregister_driver(&vfio_pci_driver); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 9c6d5d0f3b0..671c17a6e6d 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -54,8 +54,9 @@ struct vfio_pci_device { bool extended_caps; bool bardirty; bool has_vga; + bool needs_reset; struct pci_saved_state *pci_saved_state; - atomic_t refcnt; + int refcnt; struct eventfd_ctx *err_trigger; }; diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c index f834b4ce143..86dfceb9201 100644 --- a/drivers/vfio/vfio_spapr_eeh.c +++ b/drivers/vfio/vfio_spapr_eeh.c @@ -9,20 +9,27 @@ * published by the Free Software Foundation. */ +#include <linux/module.h> #include <linux/uaccess.h> #include <linux/vfio.h> #include <asm/eeh.h> +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" +#define DRIVER_DESC "VFIO IOMMU SPAPR EEH" + /* We might build address mapping here for "fast" path later */ -int vfio_spapr_pci_eeh_open(struct pci_dev *pdev) +void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) { - return eeh_dev_open(pdev); + eeh_dev_open(pdev); } +EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open); void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) { eeh_dev_release(pdev); } +EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release); long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd, unsigned long arg) @@ -85,3 +92,9 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, return ret; } +EXPORT_SYMBOL(vfio_spapr_iommu_eeh_ioctl); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); |
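
For reference, vfio_pci_try_bus_reset() above builds on the PCI core's probe-then-try reset primitives: the probe calls only report whether a slot or bus reset is possible without performing one, and the try variants back off instead of blocking when the needed locks are contended. Condensed into a hypothetical standalone helper:

	#include <linux/errno.h>
	#include <linux/pci.h>

	static int example_try_reset(struct pci_dev *pdev)
	{
		/* prefer the narrower slot reset when one is available */
		if (pdev->slot && !pci_probe_reset_slot(pdev->slot))
			return pci_try_reset_slot(pdev->slot);
		if (!pci_probe_reset_bus(pdev->bus))
			return pci_try_reset_bus(pdev->bus);
		return -ENOTTY;
	}
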