diff options
Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r-- | drivers/md/bcache/btree.c | 451 |
1 files changed, 212 insertions, 239 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e6..ee372884c40 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -24,6 +24,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include <linux/slab.h> #include <linux/bitops.h> @@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void btree_bio_endio(struct bio *bio, int error) +static void bch_btree_node_read_done(struct btree *b) { - struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); - - if (error) - set_btree_node_io_error(b); - - bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) - ? "writing btree" : "reading btree"); - closure_put(cl); -} - -static void btree_bio_init(struct btree *b) -{ - BUG_ON(b->bio); - b->bio = bch_bbio_alloc(b->c); - - b->bio->bi_end_io = btree_bio_endio; - b->bio->bi_private = &b->io.cl; -} - -void bch_btree_read_done(struct closure *cl) -{ - struct btree *b = container_of(cl, struct btree, io.cl); - struct bset *i = b->sets[0].data; - struct btree_iter *iter = b->c->fill_iter; const char *err = "bad btree header"; - BUG_ON(b->nsets || b->written); - - bch_bbio_free(b->bio, b->c); - b->bio = NULL; + struct bset *i = b->sets[0].data; + struct btree_iter *iter; - mutex_lock(&b->c->fill_lock); + iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; - if (btree_node_io_error(b) || - !i->seq) + if (!i->seq) goto err; for (; @@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl) if (b->written < btree_blocks(b)) bch_bset_init_next(b); out: - - mutex_unlock(&b->c->fill_lock); - - spin_lock(&b->c->btree_read_time_lock); - bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); - spin_unlock(&b->c->btree_read_time_lock); - - smp_wmb(); /* read_done is our write lock */ - set_btree_node_read_done(b); - - closure_return(cl); + mempool_free(iter, b->c->fill_iter); + return; err: set_btree_node_io_error(b); bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", @@ -247,48 +212,69 @@ err: goto out; } -void bch_btree_read(struct btree *b) +static void btree_node_read_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +void bch_btree_node_read(struct btree *b) { - BUG_ON(b->nsets || b->written); + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + + trace_bcache_btree_read(b); + + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); + bio->bi_rw = REQ_META|READ_SYNC; + bio->bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; + + bch_bio_map(bio, b->sets[0].data); + + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); - if (!closure_trylock(&b->io.cl, &b->c->cl)) - BUG(); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_btree_node_io_error(b); - b->io_start_time = local_clock(); + bch_bbio_free(bio, b->c); - btree_bio_init(b); - b->bio->bi_rw = REQ_META|READ_SYNC; - b->bio->bi_size = KEY_SIZE(&b->key) << 9; + if (btree_node_io_error(b)) + goto err; - bch_bio_map(b->bio, b->sets[0].data); + bch_btree_node_read_done(b); - pr_debug("%s", pbtree(b)); - trace_bcache_btree_read(b->bio); - bch_submit_bbio(b->bio, b->c, &b->key, 0); + spin_lock(&b->c->btree_read_time_lock); + bch_time_stats_update(&b->c->btree_read_time, start_time); + spin_unlock(&b->c->btree_read_time_lock); - continue_at(&b->io.cl, bch_btree_read_done, system_wq); + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %lu", + PTR_BUCKET_NR(b->c, &b->key, 0)); } static void btree_complete_write(struct btree *b, struct btree_write *w) { if (w->prio_blocked && !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); + wake_up_allocators(b->c); if (w->journal) { atomic_dec_bug(w->journal); __closure_wake_up(&b->c->journal.wait); } - if (w->owner) - closure_put(w->owner); - w->prio_blocked = 0; w->journal = NULL; - w->owner = NULL; } -static void __btree_write_done(struct closure *cl) +static void __btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct btree_write *w = btree_prev_write(b); @@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl) closure_return(cl); } -static void btree_write_done(struct closure *cl) +static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct bio_vec *bv; @@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl) __bio_for_each_segment(bv, b->bio, n, 0) __free_page(bv->bv_page); - __btree_write_done(cl); + __btree_node_write_done(cl); } -static void do_btree_write(struct btree *b) +static void btree_node_write_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io.cl; struct bset *i = b->sets[b->nsets].data; @@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b) i->version = BCACHE_BSET_VERSION; i->csum = btree_csum_set(b, i); - btree_bio_init(b); - b->bio->bi_rw = REQ_META|WRITE_SYNC; - b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = &b->io.cl; + b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; + b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); - if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { + if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); - continue_at(cl, btree_write_done, NULL); + continue_at(cl, btree_node_write_done, NULL); } else { b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_write_done(cl); + __btree_node_write_done(cl); } } -static void __btree_write(struct btree *b) +void bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = b->sets[b->nsets].data; + trace_bcache_btree_write(b); + BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(b->sets->data->seq != i->seq); + bch_check_key_order(b, i); - closure_lock(&b->io, &b->c->cl); cancel_delayed_work(&b->work); + /* If caller isn't waiting for write, parent refcount is cache set */ + closure_lock(&b->io, parent ?: &b->c->cl); + clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); - bch_check_key_order(b, i); - BUG_ON(b->written && !i->keys); - - do_btree_write(b); - - pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); + do_btree_node_write(b); b->written += set_blocks(i, b->c); atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, @@ -387,37 +405,31 @@ static void __btree_write(struct btree *b) bch_bset_init_next(b); } -static void btree_write_work(struct work_struct *w) +static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - down_write(&b->lock); + rw_lock(true, b, b->level); if (btree_node_dirty(b)) - __btree_write(b); - up_write(&b->lock); + bch_btree_node_write(b, NULL); + rw_unlock(true, b); } -void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); - BUG_ON(b->written && - (b->written >= btree_blocks(b) || - i->seq != b->sets[0].data->seq || - !i->keys)); + BUG_ON(!b->written); + BUG_ON(!i->keys); - if (!btree_node_dirty(b)) { - set_btree_node_dirty(b); - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); - } + if (!btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); - w->prio_blocked += b->prio_blocked; - b->prio_blocked = 0; + set_btree_node_dirty(b); - if (op && op->journal && !b->level) { + if (op && op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op) } } - if (current->bio_list) - return; - /* Force write if set is too big */ - if (now || - b->level || - set_bytes(i) > PAGE_SIZE - 48) { - if (op && now) { - /* Must wait on multiple writes */ - BUG_ON(w->owner); - w->owner = &op->cl; - closure_get(&op->cl); - } - - __btree_write(b); - } - BUG_ON(!b->written); + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); } /* @@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); INIT_LIST_HEAD(&b->list); - INIT_DELAYED_WORK(&b->work, btree_write_work); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; closure_init_unlocked(&b->io); @@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) BUG_ON(btree_node_dirty(b) && !b->sets[0].data); if (cl && btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); if (cl) closure_wait_event_async(&b->io.wait, cl, @@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) else if (!mutex_trylock(&c->bucket_lock)) return -1; + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ nr /= c->btree_pages; nr = min_t(unsigned long, nr, mca_can_free(c)); @@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, int ret = -ENOMEM; struct btree *i; + trace_bcache_btree_cache_cannibalize(c); + if (!cl) return ERR_PTR(-ENOMEM); @@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, return ERR_PTR(-EAGAIN); } - /* XXX: tracepoint */ c->try_harder = cl; c->try_harder_start = local_clock(); retry: @@ -905,6 +912,9 @@ retry: b = mca_find(c, k); if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + mutex_lock(&c->bucket_lock); b = mca_alloc(c, k, level, &op->cl); mutex_unlock(&c->bucket_lock); @@ -914,7 +924,7 @@ retry: if (IS_ERR(b)) return b; - bch_btree_read(b); + bch_btree_node_read(b); if (!write) downgrade_write(&b->lock); @@ -937,15 +947,12 @@ retry: for (; i <= b->nsets; i++) prefetch(b->sets[i].data); - if (!closure_wait_event(&b->io.wait, &op->cl, - btree_node_read_done(b))) { - rw_unlock(write, b); - b = ERR_PTR(-EAGAIN); - } else if (btree_node_io_error(b)) { + if (btree_node_io_error(b)) { rw_unlock(write, b); - b = ERR_PTR(-EIO); - } else - BUG_ON(!b->written); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); return b; } @@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { - bch_btree_read(b); + bch_btree_node_read(b); rw_unlock(true, b); } } @@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op) { unsigned i; + trace_bcache_btree_node_free(b); + /* * The BUG_ON() in btree_node_get() implies that we must have a write * lock on parent to free or even invalidate a node */ BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); - pr_debug("bucket %s", pbtree(b)); if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); - if (b->prio_blocked && - !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); - - b->prio_blocked = 0; - cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); @@ -1028,17 +1030,20 @@ retry: goto retry; } - set_btree_node_read_done(b); b->accessed = 1; bch_bset_init_next(b); mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); return b; err_free: bch_bucket_free(c, &k.key); __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(b); return b; } @@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nkeys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) { + if (KEY_DIRTY(k)) gc->dirty += KEY_SIZE(k); - if (d) - d->sectors_dirty_gc += KEY_SIZE(k); - } } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, if (!IS_ERR_OR_NULL(n)) { swap(b, n); + __bkey_put(b->c, &b->key); memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - __bkey_put(b->c, &b->key); - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - btree_node_free(n, op); up_write(&n->lock); } @@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, btree_node_free(r->b, op); up_write(&r->b->lock); - pr_debug("coalesced %u nodes", nodes); + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; nodes--; @@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, void write(struct btree *r) { if (!r->written) - bch_btree_write(r, true, op); - else if (btree_node_dirty(r)) { - BUG_ON(btree_current_write(r)->owner); - btree_current_write(r)->owner = writes; - closure_get(writes); - - bch_btree_write(r, true, NULL); - } + bch_btree_node_write(r, &op->cl); + else if (btree_node_dirty(r)) + bch_btree_node_write(r, writes); up_write(&r->lock); } @@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ret = btree_gc_recurse(b, op, writes, gc); if (!b->written || btree_node_dirty(b)) { - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - bch_btree_write(b, true, n ? op : NULL); + bch_btree_node_write(b, n ? &op->cl : NULL); } if (!IS_ERR_OR_NULL(n)) { @@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - struct bcache_device **d; unsigned i; if (!c->gc_mark_valid) @@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { b->gc_gen = b->gen; - if (!atomic_read(&b->pin)) + if (!atomic_read(&b->pin)) { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_SECTORS_USED(b, 0); + } } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) - (*d)->sectors_dirty_gc = 0; - mutex_unlock(&c->bucket_lock); } @@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) size_t available = 0; struct bucket *b; struct cache *ca; - struct bcache_device **d; unsigned i; mutex_lock(&c->bucket_lock); @@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) } } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) { - unsigned long last = - atomic_long_read(&((*d)->sectors_dirty)); - long difference = (*d)->sectors_dirty_gc - last; - - pr_debug("sectors dirty off by %li", difference); - - (*d)->sectors_dirty_last += difference; - - atomic_long_set(&((*d)->sectors_dirty), - (*d)->sectors_dirty_gc); - } - mutex_unlock(&c->bucket_lock); return available; } @@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl) struct gc_stat stats; struct closure writes; struct btree_op op; - uint64_t start_time = local_clock(); - trace_bcache_gc_start(c->sb.set_uuid); - blktrace_msg_all(c, "Starting gc"); + + trace_bcache_gc_start(c); memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); @@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl) btree_gc_start(c); + atomic_inc(&c->prio_blocked); + ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&op.cl); closure_sync(&writes); if (ret) { - blktrace_msg_all(c, "Stopped gc"); pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); } @@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl) available = bch_btree_gc_finish(c); + atomic_dec(&c->prio_blocked); + wake_up_allocators(c); + bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); @@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl) stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); - blktrace_msg_all(c, "Finished gc"); - trace_bcache_gc_end(c->sb.set_uuid); - wake_up(&c->alloc_wait); + trace_bcache_gc_end(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } @@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b, struct btree_iter *iter, struct btree_op *op) { - void subtract_dirty(struct bkey *k, int sectors) + void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) { - struct bcache_device *d = b->c->devices[KEY_INODE(k)]; - - if (KEY_DIRTY(k) && d) - atomic_long_sub(sectors, &d->sectors_dirty); + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + offset, -sectors); } + uint64_t old_offset; unsigned old_size, sectors_found = 0; while (1) { @@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b, if (bkey_cmp(k, &START_KEY(insert)) <= 0) continue; + old_offset = KEY_START(k); old_size = KEY_SIZE(k); /* @@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *top; - subtract_dirty(k, KEY_SIZE(insert)); + subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); if (bkey_written(b, k)) { /* @@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b, } } - subtract_dirty(k, old_size - KEY_SIZE(k)); + subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); } check_failed: @@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, { struct bset *i = b->sets[b->nsets].data; struct bkey *m, *prev; - const char *status = "insert"; + unsigned status = BTREE_INSERT_STATUS_INSERT; BUG_ON(bkey_cmp(k, &b->key) > 0); BUG_ON(b->level && !KEY_PTRS(k)); @@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, goto insert; /* prev is in the tree, if we merge we're done */ - status = "back merging"; + status = BTREE_INSERT_STATUS_BACK_MERGE; if (prev && bch_bkey_try_merge(b, prev, k)) goto merged; - status = "overwrote front"; + status = BTREE_INSERT_STATUS_OVERWROTE; if (m != end(i) && KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) goto copy; - status = "front merge"; + status = BTREE_INSERT_STATUS_FRONT_MERGE; if (m != end(i) && bch_bkey_try_merge(b, k, m)) goto copy; @@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - bch_check_keys(b, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); - bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + bch_check_keys(b, "%u for %s", status, op_type(op)); if (b->level && !KEY_OFFSET(k)) - b->prio_blocked++; + btree_current_write(b)->prio_blocked++; - pr_debug("%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + trace_bcache_btree_insert_key(b, k, op->type, status); return true; } -bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) { bool ret = false; struct bkey *k; @@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, should_split(b)) goto out; - op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); + op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); SET_KEY_PTRS(&op->replace, 1); get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); @@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, BUG_ON(op->type != BTREE_INSERT); BUG_ON(!btree_insert_key(b, op, &tmp.k)); - bch_btree_write(b, false, NULL); ret = true; out: downgrade_write(&b->lock); @@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op) split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; - pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", - pbtree(b), n1->sets[0].data->keys); - if (split) { unsigned keys = 0; + trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); if (IS_ERR(n2)) goto err_free1; @@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op) bkey_copy_key(&n2->key, &b->key); bch_keylist_add(&op->keys, &n2->key); - bch_btree_write(n2, true, op); + bch_btree_node_write(n2, &op->cl); rw_unlock(true, n2); - } else + } else { + trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + bch_btree_insert_keys(n1, op); + } bch_keylist_add(&op->keys, &n1->key); - bch_btree_write(n1, true, op); + bch_btree_node_write(n1, &op->cl); if (n3) { bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op); - bch_btree_write(n3, true, op); + bch_btree_node_write(n3, &op->cl); closure_sync(&op->cl); bch_btree_set_root(n3); @@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, BUG_ON(write_block(b) != b->sets[b->nsets].data); - if (bch_btree_insert_keys(b, op)) - bch_btree_write(b, false, op); + if (bch_btree_insert_keys(b, op)) { + if (!b->level) + bch_btree_leaf_dirty(b, op); + else + bch_btree_node_write(b, &op->cl); + } } return 0; @@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) void bch_btree_set_root(struct btree *b) { unsigned i; + struct closure cl; + + closure_init_stack(&cl); + + trace_bcache_btree_set_root(b); BUG_ON(!b->written); @@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b) b->c->root = b; __bkey_put(b->c, &b->key); - bch_journal_meta(b->c, NULL); - pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); + bch_journal_meta(b->c, &cl); + closure_sync(&cl); } /* Cache lookup */ @@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, KEY_OFFSET(k) - bio->bi_sector); n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (!n) - return -EAGAIN; - if (n == bio) op->lookup_done = true; @@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; - trace_bcache_cache_hit(n); __bch_submit_bbio(n, b->c); } @@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) struct btree_iter iter; bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, - (uint64_t) bio->bi_sector); - do { k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); if (!k) { @@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, } static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end) + struct keybuf *buf, struct bkey *end, + keybuf_pred_fn *pred) { struct btree_iter iter; bch_btree_iter_init(b, &iter, &buf->last_scanned); @@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (bkey_cmp(&buf->last_scanned, end) >= 0) break; - if (buf->key_predicate(buf, k)) { + if (pred(buf, k)) { struct keybuf_key *w; - pr_debug("%s", pkey(k)); - spin_lock(&buf->lock); w = array_alloc(&buf->freelist); @@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (!k) break; - btree(refill_keybuf, k, b, op, buf, end); + btree(refill_keybuf, k, b, op, buf, end, pred); /* * Might get an error here, but can't really do anything * and it'll get logged elsewhere. Just read what we @@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; struct btree_op op; @@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end); + btree_root(refill_keybuf, c, &op, buf, end, pred); closure_sync(&op.cl); pr_debug("found %s keys from %llu:%llu to %llu:%llu", @@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; @@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, break; } - bch_refill_keybuf(c, buf, end); + bch_refill_keybuf(c, buf, end, pred); } return ret; } -void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) +void bch_keybuf_init(struct keybuf *buf) { - buf->key_predicate = fn; buf->last_scanned = MAX_KEY; buf->keys = RB_ROOT; |