From ca6673b02e26356bcb3b86e074eaa59cfa51114b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 3 Dec 2013 17:32:53 -0600 Subject: block: Replace __this_cpu_ptr with raw_cpu_ptr __this_cpu_ptr is being phased out. Cc: Jens Axboe Signed-off-by: Christoph Lameter Signed-off-by: Jens Axboe --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index 1c04ec66974..651dba10b9c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1312,7 +1312,7 @@ static void bh_lru_install(struct buffer_head *bh) } while (out < BH_LRU_SIZE) bhs[out++] = NULL; - memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); + memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); } bh_lru_unlock(); -- cgit v1.2.3-70-g09d2 From 644ff181da10e053dad8fa3de07d1bed8ac0a23f Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Tue, 3 Dec 2013 19:22:17 -0700 Subject: drivers/cdrom/gdrom.c: remove deprecated IRQF_DISABLED Remove the IRQF_DISABLED flag from drivers/cdrom/gdrom.c. It's a NOOP since 2.6.35 and it will be removed one day. Signed-off-by: Michael Opdenacker Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 5980cb9af85..51e75ad9642 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -561,11 +561,11 @@ static int gdrom_set_interrupt_handlers(void) int err; err = request_irq(HW_EVENT_GDROM_CMD, gdrom_command_interrupt, - IRQF_DISABLED, "gdrom_command", &gd); + 0, "gdrom_command", &gd); if (err) return err; err = request_irq(HW_EVENT_GDROM_DMA, gdrom_dma_interrupt, - IRQF_DISABLED, "gdrom_dma", &gd); + 0, "gdrom_dma", &gd); if (err) free_irq(HW_EVENT_GDROM_CMD, &gd); return err; -- cgit v1.2.3-70-g09d2 From 8586ea96b4f919a9b38929040bc9ce57c9998fc4 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Dec 2013 23:47:21 +0100 Subject: pktcdvd: fix error return code Set the return variable to an error code as done elsewhere in the function. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // ( if@p1 (\(ret < 0\|ret != 0\)) { ... return ret; } | ret@p1 = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Signed-off-by: Jiri Kosina --- drivers/block/pktcdvd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3dda09a5ec4..a2af73db187 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -706,7 +706,9 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * WRITE : READ, __GFP_WAIT); if (cgc->buflen) { - if (blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, __GFP_WAIT)) + ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, + __GFP_WAIT); + if (ret) goto out; } -- cgit v1.2.3-70-g09d2 From ef71ec00002d92a08eb27e9d036e3d48835b6597 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 17:51:02 -0800 Subject: bcache: Data corruption fix The code that handles overlapping extents that we've just read back in from disk was depending on the behaviour of the code that handles overlapping extents as we're inserting into a btree node in the case of an insert that forced an existing extent to be split: on insert, if we had to split we'd also insert a new extent to represent the top part of the old extent - and then that new extent would get written out. The code that read the extents back in thus not bother with splitting extents - if it saw an extent that ovelapped in the middle of an older extent, it would trim the old extent to only represent the bottom part, assuming that the original insert would've inserted a new extent to represent the top part. I still haven't figured out _how_ it can happen, but I'm now pretty convinced (and testing has confirmed) that there's some kind of an obscure corner case (probably involving extent merging, and multiple overwrites in different sets) that breaks this. The fix is to change the mergesort fixup code to split extents itself when required. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/bset.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 7d388b8bb50..16958704af3 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -955,7 +955,7 @@ static void sort_key_next(struct btree_iter *iter, *i = iter->data[--iter->used]; } -static void btree_sort_fixup(struct btree_iter *iter) +static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { while (iter->used > 1) { struct btree_iter_set *top = iter->data, *i = top + 1; @@ -983,9 +983,22 @@ static void btree_sort_fixup(struct btree_iter *iter) } else { /* can't happen because of comparison func */ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); - bch_cut_back(&START_KEY(i->k), top->k); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, btree_iter_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } } } + + return NULL; } static void btree_mergesort(struct btree *b, struct bset *out, @@ -993,15 +1006,20 @@ static void btree_mergesort(struct btree *b, struct bset *out, bool fixup, bool remove_stale) { struct bkey *k, *last = NULL; + BKEY_PADDED(k) tmp; bool (*bad)(struct btree *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; while (!btree_iter_end(iter)) { if (fixup && !b->level) - btree_sort_fixup(iter); + k = btree_sort_fixup(iter, &tmp.k); + else + k = NULL; + + if (!k) + k = bch_btree_iter_next(iter); - k = bch_btree_iter_next(iter); if (bad(b, k)) continue; -- cgit v1.2.3-70-g09d2 From b3fa7e77e67e647db3db2166b65083a427d84ed3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 5 Aug 2013 14:04:06 -0700 Subject: bcache: Minor journal fix The real fix is where we check the bytes we need against how much is remaining - we also need to check for a journal entry bigger than our buffer, we'll never write those and it would be bad if we tried to read one. Also improve the diagnostic messages. Signed-off-by: Kent Overstreet --- drivers/md/bcache/journal.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7eafdf09a0a..29cccc510eb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, closure_init_stack(&cl); - pr_debug("reading %llu", (uint64_t) bucket); + pr_debug("reading %u", bucket_index); while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; - len = min_t(unsigned, left, PAGE_SECTORS * 8); + len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); bio_reset(bio); bio->bi_iter.bi_sector = bucket + offset; @@ -74,17 +74,26 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(&ca->sb)) + if (j->magic != jset_magic(&ca->sb)) { + pr_debug("%u: bad magic", bucket_index); return ret; + } - if (bytes > left << 9) + if (bytes > left << 9 || + bytes > PAGE_SIZE << JSET_BITS) { + pr_info("%u: too big, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } if (bytes > len << 9) goto reread; - if (j->csum != csum_set(j)) + if (j->csum != csum_set(j)) { + pr_info("%u: bad csum, %zu bytes, offset %u", + bucket_index, bytes, offset); return ret; + } blocks = set_blocks(j, ca->set); -- cgit v1.2.3-70-g09d2 From 5775e2133dfa0dc1f4c7f233e2144d32cb516f54 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2013 16:10:46 -0800 Subject: bcache: Performance fix for when journal entry is full We were unnecessarily waiting on a journal write to complete when we just needed to start a journal write and start setting up the next one. Signed-off-by: Kent Overstreet --- drivers/md/bcache/journal.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 29cccc510eb..ad687285c2d 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -673,6 +673,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, { size_t sectors; struct closure cl; + bool wait = false; closure_init_stack(&cl); @@ -689,9 +690,12 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, PAGE_SECTORS << JSET_BITS)) return w; - /* XXX: tracepoint */ + if (wait) + closure_wait(&c->journal.wait, &cl); + if (!journal_full(&c->journal)) { - trace_bcache_journal_entry_full(c); + if (wait) + trace_bcache_journal_entry_full(c); /* * XXX: If we were inserting so many keys that they @@ -701,12 +705,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, */ BUG_ON(!w->data->keys); - closure_wait(&w->wait, &cl); journal_try_write(c); /* unlocks */ } else { - trace_bcache_journal_full(c); + if (wait) + trace_bcache_journal_full(c); - closure_wait(&c->journal.wait, &cl); journal_reclaim(c); spin_unlock(&c->journal.lock); @@ -715,6 +718,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, closure_sync(&cl); spin_lock(&c->journal.lock); + wait = true; } } -- cgit v1.2.3-70-g09d2 From b0f32a56f27eb0df4124dbfc8eb6f09f423eed99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2013 13:24:26 -0800 Subject: bcache: Minor btree cache fix Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 946ecd3b048..aaa87b3be9e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -714,14 +714,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, } } - /* - * Can happen right when we first start up, before we've read in any - * btree nodes - */ - if (list_empty(&c->btree_cache)) - goto out; - for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { + if (list_empty(&c->btree_cache)) + goto out; + b = list_first_entry(&c->btree_cache, struct btree, list); list_rotate_left(&c->btree_cache); -- cgit v1.2.3-70-g09d2 From d56d000a1f424aa77538bd5aad18b43037ed20cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Aug 2013 21:14:13 -0700 Subject: bcache: Don't touch bucket gen for dirty ptrs Unnecessary since a bucket that has dirty pointers pointing to it can never be invalidated - and skipping it is a measurable performance boost, since the bucket gen will usually be a cache miss. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 6 +++++- drivers/md/bcache/request.c | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 16958704af3..e51a739f751 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -176,10 +176,14 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) bch_ptr_invalid(b, k)) return true; - for (i = 0; i < KEY_PTRS(k); i++) { + for (i = 0; i < KEY_PTRS(k); i++) if (!ptr_available(b->c, k, i)) return true; + if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) + return false; + + for (i = 0; i < KEY_PTRS(k); i++) { g = PTR_BUCKET(b->c, k, i); stale = ptr_stale(b->c, k, i); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5878cdb3952..59b3d6df100 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -630,7 +630,8 @@ static void bch_cache_read_endio(struct bio *bio, int error) if (error) s->iop.error = error; - else if (ptr_stale(s->iop.c, &b->key, 0)) { + else if (!KEY_DIRTY(&b->key) && + ptr_stale(s->iop.c, &b->key, 0)) { atomic_long_inc(&s->iop.c->cache_read_races); s->iop.error = -EINTR; } -- cgit v1.2.3-70-g09d2 From a5ae4300c15c778722c139953c825cd24d6ff517 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:16:31 -0700 Subject: bcache: Zero less memory Another minor performance optimization Signed-off-by: Kent Overstreet --- drivers/md/bcache/closure.h | 13 +------------ drivers/md/bcache/request.c | 47 ++++++++++++++++++++++++++------------------- drivers/md/bcache/request.h | 21 ++++++++++++-------- 3 files changed, 41 insertions(+), 40 deletions(-) diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 9762f1be330..d29b773b863 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -347,14 +347,6 @@ do { \ do_closure_init(_cl, parent, running); \ } while (0) -/** - * __closure_init() - Initialize a closure, skipping the memset() - * - * May be used instead of closure_init() when memory has already been zeroed. - */ -#define __closure_init(cl, parent) \ - closure_init_type(cl, parent, true) - /** * closure_init() - Initialize a closure, setting the refcount to 1 * @cl: closure to initialize @@ -362,10 +354,7 @@ do { \ * lifetime; may be NULL. */ #define closure_init(cl, parent) \ -do { \ - memset((cl), 0, sizeof(*(cl))); \ - __closure_init(cl, parent); \ -} while (0) + closure_init_type(cl, parent, true) static inline void closure_init_stack(struct closure *cl) { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 59b3d6df100..cce02f19e6c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -597,14 +597,12 @@ struct search { /* Stack frame for bio_complete */ struct closure cl; - struct bcache_device *d; - struct bbio bio; struct bio *orig_bio; struct bio *cache_miss; + struct bcache_device *d; unsigned insert_bio_sectors; - unsigned recoverable:1; unsigned write:1; unsigned read_dirty_data:1; @@ -712,10 +710,13 @@ static void cache_lookup(struct closure *cl) { struct search *s = container_of(cl, struct search, iop.cl); struct bio *bio = &s->bio.bio; + int ret; - int ret = bch_btree_map_keys(&s->op, s->iop.c, - &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), - cache_lookup_fn, MAP_END_KEY); + bch_btree_op_init(&s->op, -1); + + ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), + cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@ -756,12 +757,12 @@ static void bio_complete(struct search *s) } } -static void do_bio_hook(struct search *s) +static void do_bio_hook(struct search *s, struct bio *orig_bio) { struct bio *bio = &s->bio.bio; bio_init(bio); - __bio_clone_fast(bio, s->orig_bio); + __bio_clone_fast(bio, orig_bio); bio->bi_end_io = request_endio; bio->bi_private = &s->cl; @@ -780,26 +781,32 @@ static void search_free(struct closure *cl) mempool_free(s, s->d->c->search); } -static struct search *search_alloc(struct bio *bio, struct bcache_device *d) +static inline struct search *search_alloc(struct bio *bio, + struct bcache_device *d) { struct search *s; s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, iop.insert_keys)); - __closure_init(&s->cl, NULL); + closure_init(&s->cl, NULL); + do_bio_hook(s, bio); - s->iop.inode = d->id; - s->iop.c = d->c; - s->d = d; - s->op.lock = -1; - s->iop.write_point = hash_long((unsigned long) current, 16); s->orig_bio = bio; - s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->cache_miss = NULL; + s->d = d; s->recoverable = 1; + s->write = (bio->bi_rw & REQ_WRITE) != 0; + s->read_dirty_data = 0; s->start_time = jiffies; - do_bio_hook(s); + + s->iop.c = d->c; + s->iop.bio = NULL; + s->iop.inode = d->id; + s->iop.write_point = hash_long((unsigned long) current, 16); + s->iop.write_prio = 0; + s->iop.error = 0; + s->iop.flags = 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; return s; } @@ -845,7 +852,7 @@ static void cached_dev_read_error(struct closure *cl) trace_bcache_read_retry(s->orig_bio); s->iop.error = 0; - do_bio_hook(s); + do_bio_hook(s, s->orig_bio); /* XXX: invalidate cache */ diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 2cd65bf073c..39f21dbedc3 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -13,17 +13,22 @@ struct data_insert_op { uint16_t write_prio; short error; - unsigned bypass:1; - unsigned writeback:1; - unsigned flush_journal:1; - unsigned csum:1; + union { + uint16_t flags; - unsigned replace:1; - unsigned replace_collision:1; + struct { + unsigned bypass:1; + unsigned writeback:1; + unsigned flush_journal:1; + unsigned csum:1; - unsigned insert_data_done:1; + unsigned replace:1; + unsigned replace_collision:1; + + unsigned insert_data_done:1; + }; + }; - /* Anything past this point won't get zeroed in search_alloc() */ struct keylist insert_keys; BKEY_PADDED(replace_key); }; -- cgit v1.2.3-70-g09d2 From cb7a583e6a6ace661a5890803e115d2292a293df Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2013 15:27:25 -0800 Subject: bcache: kill closure locking usage Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 9 +++++--- drivers/md/bcache/btree.c | 52 ++++++++++++++++++++++++++++--------------- drivers/md/bcache/btree.h | 3 ++- drivers/md/bcache/debug.c | 7 ++---- drivers/md/bcache/journal.c | 27 ++++++++++++----------- drivers/md/bcache/journal.h | 1 + drivers/md/bcache/super.c | 54 ++++++++++++++++++++++++++++++++------------- 7 files changed, 98 insertions(+), 55 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dbdbca5a959..9d062bc5626 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -309,7 +309,8 @@ struct cached_dev { struct cache_sb sb; struct bio sb_bio; struct bio_vec sb_bv[1]; - struct closure_with_waitlist sb_write; + struct closure sb_write; + struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. */ atomic_t count; @@ -514,7 +515,8 @@ struct cache_set { uint64_t cached_dev_sectors; struct closure caching; - struct closure_with_waitlist sb_write; + struct closure sb_write; + struct semaphore sb_write_mutex; mempool_t *search; mempool_t *bio_meta; @@ -635,7 +637,8 @@ struct cache_set { unsigned nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); - struct closure_with_waitlist uuid_write; + struct closure uuid_write; + struct semaphore uuid_write_mutex; /* * A btree node on disk could have too many bsets for an iterator to fit diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index aaa87b3be9e..101231f0f39 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -340,9 +340,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) w->journal = NULL; } +static void btree_node_write_unlock(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + up(&b->io_mutex); +} + static void __btree_node_write_done(struct closure *cl) { - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); struct btree_write *w = btree_prev_write(b); bch_bbio_free(b->bio, b->c); @@ -353,12 +360,12 @@ static void __btree_node_write_done(struct closure *cl) queue_delayed_work(btree_io_wq, &b->work, msecs_to_jiffies(30000)); - closure_return(cl); + closure_return_with_destructor(cl, btree_node_write_unlock); } static void btree_node_write_done(struct closure *cl) { - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); struct bio_vec *bv; int n; @@ -371,7 +378,7 @@ static void btree_node_write_done(struct closure *cl) static void btree_node_write_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); + struct btree *b = container_of(cl, struct btree, io); if (error) set_btree_node_io_error(b); @@ -382,7 +389,7 @@ static void btree_node_write_endio(struct bio *bio, int error) static void do_btree_node_write(struct btree *b) { - struct closure *cl = &b->io.cl; + struct closure *cl = &b->io; struct bset *i = b->sets[b->nsets].data; BKEY_PADDED(key) k; @@ -435,7 +442,7 @@ static void do_btree_node_write(struct btree *b) bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_node_write_done(cl); + continue_at_nobarrier(cl, __btree_node_write_done, NULL); } } @@ -454,7 +461,8 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) cancel_delayed_work(&b->work); /* If caller isn't waiting for write, parent refcount is cache set */ - closure_lock(&b->io, parent ?: &b->c->cl); + down(&b->io_mutex); + closure_init(&b->io, parent ?: &b->c->cl); clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); @@ -554,7 +562,8 @@ static void mca_reinit(struct btree *b) static void mca_data_free(struct btree *b) { struct bset_tree *t = b->sets; - BUG_ON(!closure_is_unlocked(&b->io.cl)); + + BUG_ON(b->io_mutex.count != 1); if (bset_prev_bytes(b) < PAGE_SIZE) kfree(t->prev); @@ -635,7 +644,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, INIT_LIST_HEAD(&b->list); INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; - closure_init_unlocked(&b->io); + sema_init(&b->io_mutex, 1); mca_data_alloc(b, k, gfp); return b; @@ -653,22 +662,29 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) BUG_ON(btree_node_dirty(b) && !b->sets[0].data); - if (b->page_order < min_order || - (!flush && - (btree_node_dirty(b) || - atomic_read(&b->io.cl.remaining) != -1))) { - rw_unlock(true, b); - return -ENOMEM; + if (b->page_order < min_order) + goto out_unlock; + + if (!flush) { + if (btree_node_dirty(b)) + goto out_unlock; + + if (down_trylock(&b->io_mutex)) + goto out_unlock; + up(&b->io_mutex); } if (btree_node_dirty(b)) bch_btree_node_write_sync(b); /* wait for any in flight btree write */ - closure_wait_event(&b->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); + down(&b->io_mutex); + up(&b->io_mutex); return 0; +out_unlock: + rw_unlock(true, b); + return -ENOMEM; } static unsigned long bch_mca_scan(struct shrinker *shrink, @@ -918,7 +934,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) if (!b->sets->data) goto err; out: - BUG_ON(!closure_is_unlocked(&b->io.cl)); + BUG_ON(b->io_mutex.count != 1); bkey_copy(&b->key, k); list_move(&b->list, &c->btree_cache); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 767e7557089..d68af7442f7 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -143,7 +143,8 @@ struct btree { struct bset_tree sets[MAX_BSETS]; /* For outstanding btree writes, used as a lock - protects write_idx */ - struct closure_with_waitlist io; + struct closure io; + struct semaphore io_mutex; struct list_head list; struct delayed_work work; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 03cb4d114e1..fab3767d6d2 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -127,9 +127,7 @@ void bch_btree_verify(struct btree *b, struct bset *new) if (!b->c->verify) return; - closure_wait_event(&b->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); - + down(&b->io_mutex); mutex_lock(&b->c->verify_lock); bkey_copy(&v->key, &b->key); @@ -137,8 +135,6 @@ void bch_btree_verify(struct btree *b, struct bset *new) v->level = b->level; bch_btree_node_read(v); - closure_wait_event(&v->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); if (new->keys != v->sets[0].data->keys || memcmp(new->start, @@ -167,6 +163,7 @@ void bch_btree_verify(struct btree *b, struct bset *new) } mutex_unlock(&b->c->verify_lock); + up(&b->io_mutex); } void bch_data_verify(struct cached_dev *dc, struct bio *bio) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index ad687285c2d..9d32d579082 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -564,6 +564,14 @@ static void journal_write_done(struct closure *cl) continue_at_nobarrier(cl, journal_write, system_wq); } +static void journal_write_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + + c->journal.io_in_flight = 0; + spin_unlock(&c->journal.lock); +} + static void journal_write_unlocked(struct closure *cl) __releases(c->journal.lock) { @@ -578,15 +586,7 @@ static void journal_write_unlocked(struct closure *cl) bio_list_init(&list); if (!w->need_write) { - /* - * XXX: have to unlock closure before we unlock journal lock, - * else we race with bch_journal(). But this way we race - * against cache set unregister. Doh. - */ - set_closure_fn(cl, NULL, NULL); - closure_sub(cl, CLOSURE_RUNNING + 1); - spin_unlock(&c->journal.lock); - return; + closure_return_with_destructor(cl, journal_write_unlock); } else if (journal_full(&c->journal)) { journal_reclaim(c); spin_unlock(&c->journal.lock); @@ -662,10 +662,12 @@ static void journal_try_write(struct cache_set *c) w->need_write = true; - if (closure_trylock(cl, &c->cl)) - journal_write_unlocked(cl); - else + if (!c->journal.io_in_flight) { + c->journal.io_in_flight = 1; + closure_call(cl, journal_write_unlocked, NULL, &c->cl); + } else { spin_unlock(&c->journal.lock); + } } static struct journal_write *journal_wait_for_write(struct cache_set *c, @@ -793,7 +795,6 @@ int bch_journal_alloc(struct cache_set *c) { struct journal *j = &c->journal; - closure_init_unlocked(&j->io); spin_lock_init(&j->lock); INIT_DELAYED_WORK(&j->work, journal_write_work); diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index a6472fda94b..9180c446507 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -104,6 +104,7 @@ struct journal { /* used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; + int io_in_flight; struct delayed_work work; /* Number of blocks free in the bucket(s) we're currently writing to */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 93d593f957f..b057676fc67 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -225,7 +225,7 @@ static void write_bdev_super_endio(struct bio *bio, int error) struct cached_dev *dc = bio->bi_private; /* XXX: error checking */ - closure_put(&dc->sb_write.cl); + closure_put(&dc->sb_write); } static void __write_super(struct cache_sb *sb, struct bio *bio) @@ -263,12 +263,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio) submit_bio(REQ_WRITE, bio); } +static void bch_write_bdev_super_unlock(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); + + up(&dc->sb_write_mutex); +} + void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) { - struct closure *cl = &dc->sb_write.cl; + struct closure *cl = &dc->sb_write; struct bio *bio = &dc->sb_bio; - closure_lock(&dc->sb_write, parent); + down(&dc->sb_write_mutex); + closure_init(cl, parent); bio_reset(bio); bio->bi_bdev = dc->bdev; @@ -278,7 +286,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_get(cl); __write_super(&dc->sb, bio); - closure_return(cl); + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); } static void write_super_endio(struct bio *bio, int error) @@ -286,16 +294,24 @@ static void write_super_endio(struct bio *bio, int error) struct cache *ca = bio->bi_private; bch_count_io_errors(ca, error, "writing superblock"); - closure_put(&ca->set->sb_write.cl); + closure_put(&ca->set->sb_write); +} + +static void bcache_write_super_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, sb_write); + + up(&c->sb_write_mutex); } void bcache_write_super(struct cache_set *c) { - struct closure *cl = &c->sb_write.cl; + struct closure *cl = &c->sb_write; struct cache *ca; unsigned i; - closure_lock(&c->sb_write, &c->cl); + down(&c->sb_write_mutex); + closure_init(cl, &c->cl); c->sb.seq++; @@ -317,7 +333,7 @@ void bcache_write_super(struct cache_set *c) __write_super(&ca->sb, bio); } - closure_return(cl); + closure_return_with_destructor(cl, bcache_write_super_unlock); } /* UUID io */ @@ -325,23 +341,31 @@ void bcache_write_super(struct cache_set *c) static void uuid_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); cache_set_err_on(error, c, "accessing uuids"); bch_bbio_free(bio, c); closure_put(cl); } +static void uuid_io_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + + up(&c->uuid_write_mutex); +} + static void uuid_io(struct cache_set *c, unsigned long rw, struct bkey *k, struct closure *parent) { - struct closure *cl = &c->uuid_write.cl; + struct closure *cl = &c->uuid_write; struct uuid_entry *u; unsigned i; char buf[80]; BUG_ON(!parent); - closure_lock(&c->uuid_write, parent); + down(&c->uuid_write_mutex); + closure_init(cl, parent); for (i = 0; i < KEY_PTRS(k); i++) { struct bio *bio = bch_bbio_alloc(c); @@ -368,7 +392,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, u - c->uuids, u->uuid, u->label, u->first_reg, u->last_reg, u->invalidated); - closure_return(cl); + closure_return_with_destructor(cl, uuid_io_unlock); } static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) @@ -1098,7 +1122,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); INIT_WORK(&dc->detach, cached_dev_detach_finish); - closure_init_unlocked(&dc->sb_write); + sema_init(&dc->sb_write_mutex, 1); INIT_LIST_HEAD(&dc->io_lru); spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); @@ -1454,11 +1478,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sort_crit_factor = int_sqrt(c->btree_pages); - closure_init_unlocked(&c->sb_write); + sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->bucket_wait); - closure_init_unlocked(&c->uuid_write); + sema_init(&c->uuid_write_mutex, 1); mutex_init(&c->sort_lock); spin_lock_init(&c->sort_time.lock); -- cgit v1.2.3-70-g09d2 From 1dd13c8d3c2d82e1b668d0b4754591291656542a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2013 15:55:23 -0800 Subject: bcache: kill closure locking code Also flesh out the documentation a bit Signed-off-by: Kent Overstreet --- drivers/md/bcache/closure.c | 90 +++--------- drivers/md/bcache/closure.h | 346 ++++++++++++++------------------------------ 2 files changed, 123 insertions(+), 313 deletions(-) diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index dfff2410322..7a228de95fd 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -11,19 +11,6 @@ #include "closure.h" -#define CL_FIELD(type, field) \ - case TYPE_ ## type: \ - return &container_of(cl, struct type, cl)->field - -static struct closure_waitlist *closure_waitlist(struct closure *cl) -{ - switch (cl->type) { - CL_FIELD(closure_with_waitlist, wait); - default: - return NULL; - } -} - static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; @@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) closure_queue(cl); } else { struct closure *parent = cl->parent; - struct closure_waitlist *wait = closure_waitlist(cl); closure_fn *destructor = cl->fn; closure_debug_destroy(cl); - smp_mb(); - atomic_set(&cl->remaining, -1); - - if (wait) - closure_wake_up(wait); - if (destructor) destructor(cl); @@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v) } EXPORT_SYMBOL(closure_sub); +/** + * closure_put - decrement a closure's refcount + */ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } EXPORT_SYMBOL(closure_put); -static void set_waiting(struct closure *cl, unsigned long f) -{ -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG - cl->waiting_on = f; -#endif -} - +/** + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; @@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list) cl = container_of(reverse, struct closure, list); reverse = llist_next(reverse); - set_waiting(cl, 0); + closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } } EXPORT_SYMBOL(__closure_wake_up); -bool closure_wait(struct closure_waitlist *list, struct closure *cl) +/** + * closure_wait - add a closure to a waitlist + * + * @waitlist will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * + */ +bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) { if (atomic_read(&cl->remaining) & CLOSURE_WAITING) return false; - set_waiting(cl, _RET_IP_); + closure_set_waiting(cl, _RET_IP_); atomic_add(CLOSURE_WAITING + 1, &cl->remaining); - llist_add(&cl->list, &list->list); + llist_add(&cl->list, &waitlist->list); return true; } EXPORT_SYMBOL(closure_wait); /** - * closure_sync() - sleep until a closure a closure has nothing left to wait on + * closure_sync - sleep until a closure a closure has nothing left to wait on * * Sleeps until the refcount hits 1 - the thread that's running the closure owns * the last refcount. @@ -148,46 +134,6 @@ void closure_sync(struct closure *cl) } EXPORT_SYMBOL(closure_sync); -/** - * closure_trylock() - try to acquire the closure, without waiting - * @cl: closure to lock - * - * Returns true if the closure was succesfully locked. - */ -bool closure_trylock(struct closure *cl, struct closure *parent) -{ - if (atomic_cmpxchg(&cl->remaining, -1, - CLOSURE_REMAINING_INITIALIZER) != -1) - return false; - - smp_mb(); - - cl->parent = parent; - if (parent) - closure_get(parent); - - closure_set_ret_ip(cl); - closure_debug_create(cl); - return true; -} -EXPORT_SYMBOL(closure_trylock); - -void __closure_lock(struct closure *cl, struct closure *parent, - struct closure_waitlist *wait_list) -{ - struct closure wait; - closure_init_stack(&wait); - - while (1) { - if (closure_trylock(cl, parent)) - return; - - closure_wait_event(wait_list, &wait, - atomic_read(&cl->remaining) == -1); - } -} -EXPORT_SYMBOL(__closure_lock); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG static LIST_HEAD(closure_list); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index d29b773b863..7ef7461912b 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -72,30 +72,6 @@ * closure - _always_ use continue_at(). Doing so consistently will help * eliminate an entire class of particularly pernicious races. * - * For a closure to wait on an arbitrary event, we need to introduce waitlists: - * - * struct closure_waitlist list; - * closure_wait_event(list, cl, condition); - * closure_wake_up(wait_list); - * - * These work analagously to wait_event() and wake_up() - except that instead of - * operating on the current thread (for wait_event()) and lists of threads, they - * operate on an explicit closure and lists of closures. - * - * Because it's a closure we can now wait either synchronously or - * asynchronously. closure_wait_event() returns the current value of the - * condition, and if it returned false continue_at() or closure_sync() can be - * used to wait for it to become true. - * - * It's useful for waiting on things when you can't sleep in the context in - * which you must check the condition (perhaps a spinlock held, or you might be - * beneath generic_make_request() - in which case you can't sleep on IO). - * - * closure_wait_event() will wait either synchronously or asynchronously, - * depending on whether the closure is in blocking mode or not. You can pick a - * mode explicitly with closure_wait_event_sync() and - * closure_wait_event_async(), which do just what you might expect. - * * Lastly, you might have a wait list dedicated to a specific event, and have no * need for specifying the condition - you just want to wait until someone runs * closure_wake_up() on the appropriate wait list. In that case, just use @@ -121,40 +97,6 @@ * All this implies that a closure should typically be embedded in a particular * struct (which its refcount will normally control the lifetime of), and that * struct can very much be thought of as a stack frame. - * - * Locking: - * - * Closures are based on work items but they can be thought of as more like - * threads - in that like threads and unlike work items they have a well - * defined lifetime; they are created (with closure_init()) and eventually - * complete after a continue_at(cl, NULL, NULL). - * - * Suppose you've got some larger structure with a closure embedded in it that's - * used for periodically doing garbage collection. You only want one garbage - * collection happening at a time, so the natural thing to do is protect it with - * a lock. However, it's difficult to use a lock protecting a closure correctly - * because the unlock should come after the last continue_to() (additionally, if - * you're using the closure asynchronously a mutex won't work since a mutex has - * to be unlocked by the same process that locked it). - * - * So to make it less error prone and more efficient, we also have the ability - * to use closures as locks: - * - * closure_init_unlocked(); - * closure_trylock(); - * - * That's all we need for trylock() - the last closure_put() implicitly unlocks - * it for you. But for closure_lock(), we also need a wait list: - * - * struct closure_with_waitlist frobnicator_cl; - * - * closure_init_unlocked(&frobnicator_cl); - * closure_lock(&frobnicator_cl); - * - * A closure_with_waitlist embeds a closure and a wait list - much like struct - * delayed_work embeds a work item and a timer_list. The important thing is, use - * it exactly like you would a regular closure and closure_put() will magically - * handle everything for you. */ struct closure; @@ -164,12 +106,6 @@ struct closure_waitlist { struct llist_head list; }; -enum closure_type { - TYPE_closure = 0, - TYPE_closure_with_waitlist = 1, - MAX_CLOSURE_TYPE = 1, -}; - enum closure_state { /* * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by @@ -224,8 +160,6 @@ struct closure { atomic_t remaining; - enum closure_type type; - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e @@ -237,34 +171,12 @@ struct closure { #endif }; -struct closure_with_waitlist { - struct closure cl; - struct closure_waitlist wait; -}; - -extern unsigned invalid_closure_type(void); - -#define __CLOSURE_TYPE(cl, _t) \ - __builtin_types_compatible_p(typeof(cl), struct _t) \ - ? TYPE_ ## _t : \ - -#define __closure_type(cl) \ -( \ - __CLOSURE_TYPE(cl, closure) \ - __CLOSURE_TYPE(cl, closure_with_waitlist) \ - invalid_closure_type() \ -) - void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void closure_sync(struct closure *cl); -bool closure_trylock(struct closure *cl, struct closure *parent); -void __closure_lock(struct closure *cl, struct closure *parent, - struct closure_waitlist *wait_list); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG void closure_debug_init(void); @@ -293,123 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl) #endif } -static inline void closure_get(struct closure *cl) +static inline void closure_set_waiting(struct closure *cl, unsigned long f) { #ifdef CONFIG_BCACHE_CLOSURES_DEBUG - BUG_ON((atomic_inc_return(&cl->remaining) & - CLOSURE_REMAINING_MASK) <= 1); -#else - atomic_inc(&cl->remaining); + cl->waiting_on = f; #endif } +static inline void __closure_end_sleep(struct closure *cl) +{ + __set_current_state(TASK_RUNNING); + + if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) + atomic_sub(CLOSURE_SLEEPING, &cl->remaining); +} + +static inline void __closure_start_sleep(struct closure *cl) +{ + closure_set_ip(cl); + cl->task = current; + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + atomic_add(CLOSURE_SLEEPING, &cl->remaining); +} + static inline void closure_set_stopped(struct closure *cl) { atomic_sub(CLOSURE_RUNNING, &cl->remaining); } -static inline bool closure_is_unlocked(struct closure *cl) +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) { - return atomic_read(&cl->remaining) == -1; + BUG_ON(object_is_on_stack(cl)); + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic_dec(); } -static inline void do_closure_init(struct closure *cl, struct closure *parent, - bool running) +static inline void closure_queue(struct closure *cl) { - cl->parent = parent; - if (parent) - closure_get(parent); - - if (running) { - closure_debug_create(cl); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + struct workqueue_struct *wq = cl->wq; + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); } else - atomic_set(&cl->remaining, -1); - - closure_set_ip(cl); + cl->fn(cl); } -/* - * Hack to get at the embedded closure if there is one, by doing an unsafe cast: - * the result of __closure_type() is thrown away, it's used merely for type - * checking. +/** + * closure_get - increment a closure's refcount */ -#define __to_internal_closure(cl) \ -({ \ - BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ - (struct closure *) cl; \ -}) - -#define closure_init_type(cl, parent, running) \ -do { \ - struct closure *_cl = __to_internal_closure(cl); \ - _cl->type = __closure_type(*(cl)); \ - do_closure_init(_cl, parent, running); \ -} while (0) +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} /** - * closure_init() - Initialize a closure, setting the refcount to 1 + * closure_init - Initialize a closure, setting the refcount to 1 * @cl: closure to initialize * @parent: parent of the new closure. cl will take a refcount on it for its * lifetime; may be NULL. */ -#define closure_init(cl, parent) \ - closure_init_type(cl, parent, true) - -static inline void closure_init_stack(struct closure *cl) +static inline void closure_init(struct closure *cl, struct closure *parent) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); -} - -/** - * closure_init_unlocked() - Initialize a closure but leave it unlocked. - * @cl: closure to initialize - * - * For when the closure will be used as a lock. The closure may not be used - * until after a closure_lock() or closure_trylock(). - */ -#define closure_init_unlocked(cl) \ -do { \ - memset((cl), 0, sizeof(*(cl))); \ - closure_init_type(cl, NULL, false); \ -} while (0) + cl->parent = parent; + if (parent) + closure_get(parent); -/** - * closure_lock() - lock and initialize a closure. - * @cl: the closure to lock - * @parent: the new parent for this closure - * - * The closure must be of one of the types that has a waitlist (otherwise we - * wouldn't be able to sleep on contention). - * - * @parent has exactly the same meaning as in closure_init(); if non null, the - * closure will take a reference on @parent which will be released when it is - * unlocked. - */ -#define closure_lock(cl, parent) \ - __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -static inline void __closure_end_sleep(struct closure *cl) -{ - __set_current_state(TASK_RUNNING); - - if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) - atomic_sub(CLOSURE_SLEEPING, &cl->remaining); + closure_debug_create(cl); + closure_set_ip(cl); } -static inline void __closure_start_sleep(struct closure *cl) +static inline void closure_init_stack(struct closure *cl) { - closure_set_ip(cl); - cl->task = current; - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) - atomic_add(CLOSURE_SLEEPING, &cl->remaining); + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); } /** - * closure_wake_up() - wake up all closures on a wait list. + * closure_wake_up - wake up all closures on a wait list. */ static inline void closure_wake_up(struct closure_waitlist *list) { @@ -417,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list) __closure_wake_up(list); } -/* - * Wait on an event, synchronously or asynchronously - analogous to wait_event() - * but for closures. - * - * The loop is oddly structured so as to avoid a race; we must check the - * condition again after we've added ourself to the waitlist. We know if we were - * already on the waitlist because closure_wait() returns false; thus, we only - * schedule or break if closure_wait() returns false. If it returns true, we - * just loop again - rechecking the condition. - * - * The __closure_wake_up() is necessary because we may race with the event - * becoming true; i.e. we see event false -> wait -> recheck condition, but the - * thread that made the event true may have called closure_wake_up() before we - * added ourself to the wait list. - * - * We have to call closure_sync() at the end instead of just - * __closure_end_sleep() because a different thread might've called - * closure_wake_up() before us and gotten preempted before they dropped the - * refcount on our closure. If this was a stack allocated closure, that would be - * bad. +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * NOTE: This macro expands to a return in the calling function! + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. */ -#define closure_wait_event(list, cl, condition) \ -({ \ - typeof(condition) ret; \ - \ - while (1) { \ - ret = (condition); \ - if (ret) { \ - __closure_wake_up(list); \ - closure_sync(cl); \ - break; \ - } \ - \ - __closure_start_sleep(cl); \ - \ - if (!closure_wait(list, cl)) \ - schedule(); \ - } \ - \ - ret; \ -}) - -static inline void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} - -static inline void set_closure_fn(struct closure *cl, closure_fn *fn, - struct workqueue_struct *wq) -{ - BUG_ON(object_is_on_stack(cl)); - closure_set_ip(cl); - cl->fn = fn; - cl->wq = wq; - /* between atomic_dec() in closure_put() */ - smp_mb__before_atomic_dec(); -} - #define continue_at(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ @@ -487,8 +323,28 @@ do { \ return; \ } while (0) +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ #define closure_return(_cl) continue_at((_cl), NULL, NULL) +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * NOTE: like continue_at(), this macro expands to a return in the caller! + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ #define continue_at_nobarrier(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ @@ -496,6 +352,15 @@ do { \ return; \ } while (0) +/** + * closure_return - finish execution of a closure, with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. + */ #define closure_return_with_destructor(_cl, _destructor) \ do { \ set_closure_fn(_cl, _destructor, NULL); \ @@ -503,6 +368,13 @@ do { \ return; \ } while (0) +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ static inline void closure_call(struct closure *cl, closure_fn fn, struct workqueue_struct *wq, struct closure *parent) @@ -511,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn, continue_at_nobarrier(cl, fn, wq); } -static inline void closure_trylock_call(struct closure *cl, closure_fn fn, - struct workqueue_struct *wq, - struct closure *parent) -{ - if (closure_trylock(cl, parent)) - continue_at_nobarrier(cl, fn, wq); -} - #endif /* _LINUX_CLOSURE_H */ -- cgit v1.2.3-70-g09d2 From 78365411b344df35a198b119133e6515c2dcfb9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 01:29:34 -0800 Subject: bcache: Rework allocator reserves We need a reserve for allocating buckets for new btree nodes - and now that we've got multiple btrees, it really needs to be per btree. This reworks the reserves so we've got separate freelists for each reserve instead of watermarks, which seems to make things a bit cleaner, and it adds some code so that btree_split() can make sure the reserve is available before it starts. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 72 ++++++++++++++++++++++++++----------------- drivers/md/bcache/bcache.h | 16 +++++----- drivers/md/bcache/btree.c | 34 ++++++++++++++++++-- drivers/md/bcache/btree.h | 4 +++ drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/super.c | 21 ++++++++----- drivers/md/bcache/sysfs.c | 31 ------------------- include/trace/events/bcache.h | 8 ++--- 8 files changed, 105 insertions(+), 83 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4c9852d92b0..bcfd96e2121 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) { BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); - if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && - CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) - return false; + if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) { + unsigned i; + + for (i = 0; i < RESERVE_NONE; i++) + if (!fifo_full(&ca->free[i])) + goto add; + return false; + } +add: b->prio = 0; if (can_inc_bucket_gen(b) && @@ -304,6 +310,21 @@ do { \ __set_current_state(TASK_RUNNING); \ } while (0) +static int bch_allocator_push(struct cache *ca, long bucket) +{ + unsigned i; + + /* Prios/gens are actually the most important reserve */ + if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) + return true; + + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) + return true; + + return false; +} + static int bch_allocator_thread(void *arg) { struct cache *ca = arg; @@ -336,9 +357,7 @@ static int bch_allocator_thread(void *arg) mutex_lock(&ca->set->bucket_lock); } - allocator_wait(ca, !fifo_full(&ca->free)); - - fifo_push(&ca->free, bucket); + allocator_wait(ca, bch_allocator_push(ca, bucket)); wake_up(&ca->set->bucket_wait); } @@ -365,34 +384,29 @@ static int bch_allocator_thread(void *arg) } } -long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) +long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) { DEFINE_WAIT(w); struct bucket *b; long r; /* fastpath */ - if (fifo_used(&ca->free) > ca->watermark[watermark]) { - fifo_pop(&ca->free, r); + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) goto out; - } if (!wait) return -1; - while (1) { - if (fifo_used(&ca->free) > ca->watermark[watermark]) { - fifo_pop(&ca->free, r); - break; - } - + do { prepare_to_wait(&ca->set->bucket_wait, &w, TASK_UNINTERRUPTIBLE); mutex_unlock(&ca->set->bucket_lock); schedule(); mutex_lock(&ca->set->bucket_lock); - } + } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && + !fifo_pop(&ca->free[reserve], r)); finish_wait(&ca->set->bucket_wait, &w); out: @@ -401,12 +415,14 @@ out: if (expensive_debug_checks(ca->set)) { size_t iter; long i; + unsigned j; for (iter = 0; iter < prio_buckets(ca) * 2; iter++) BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); - fifo_for_each(i, &ca->free, iter) - BUG_ON(i == r); + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each(i, &ca->free[j], iter) + BUG_ON(i == r); fifo_for_each(i, &ca->free_inc, iter) BUG_ON(i == r); fifo_for_each(i, &ca->unused, iter) @@ -419,7 +435,7 @@ out: SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - if (watermark <= WATERMARK_METADATA) { + if (reserve <= RESERVE_PRIO) { SET_GC_MARK(b, GC_MARK_METADATA); SET_GC_MOVE(b, 0); b->prio = BTREE_PRIO; @@ -445,7 +461,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } } -int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, +int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, struct bkey *k, int n, bool wait) { int i; @@ -459,7 +475,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, for (i = 0; i < n; i++) { struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, watermark, wait); + long b = bch_bucket_alloc(ca, reserve, wait); if (b == -1) goto err; @@ -478,12 +494,12 @@ err: return -1; } -int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, +int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, struct bkey *k, int n, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); + ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); mutex_unlock(&c->bucket_lock); return ret; } @@ -573,8 +589,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { unsigned watermark = write_prio - ? WATERMARK_MOVINGGC - : WATERMARK_NONE; + ? RESERVE_MOVINGGC + : RESERVE_NONE; spin_unlock(&c->data_bucket_lock); @@ -689,7 +705,7 @@ int bch_cache_allocator_init(struct cache *ca) * Then 8 for btree allocations * Then half for the moving garbage collector */ - +#if 0 ca->watermark[WATERMARK_PRIO] = 0; ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); @@ -699,6 +715,6 @@ int bch_cache_allocator_init(struct cache *ca) ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + ca->watermark[WATERMARK_MOVINGGC]; - +#endif return 0; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 9d062bc5626..94d346e2ea1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -383,12 +383,12 @@ struct cached_dev { unsigned writeback_rate_p_term_inverse; }; -enum alloc_watermarks { - WATERMARK_PRIO, - WATERMARK_METADATA, - WATERMARK_MOVINGGC, - WATERMARK_NONE, - WATERMARK_MAX +enum alloc_reserve { + RESERVE_BTREE, + RESERVE_PRIO, + RESERVE_MOVINGGC, + RESERVE_NONE, + RESERVE_NR, }; struct cache { @@ -400,8 +400,6 @@ struct cache { struct kobject kobj; struct block_device *bdev; - unsigned watermark[WATERMARK_MAX]; - struct task_struct *alloc_thread; struct closure prio; @@ -430,7 +428,7 @@ struct cache { * because all the data they contained was overwritten), so we only * need to discard them before they can be moved to the free list. */ - DECLARE_FIFO(long, free); + DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); DECLARE_FIFO(long, unused); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 101231f0f39..6a0f5faf0be 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -167,6 +167,8 @@ static inline bool should_split(struct btree *b) _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ + if (_r == -EINTR) \ + schedule(); \ bch_cannibalize_unlock(c); \ if (_r == -ENOSPC) { \ wait_event((c)->try_wait, \ @@ -175,6 +177,7 @@ static inline bool should_split(struct btree *b) } \ } while (_r == -EINTR); \ \ + finish_wait(&(c)->bucket_wait, &(op)->wait); \ _r; \ }) @@ -1075,7 +1078,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) goto err; bkey_put(c, &k.key); @@ -1132,6 +1135,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) atomic_inc(&b->c->prio_blocked); } +static int btree_check_reserve(struct btree *b, struct btree_op *op) +{ + struct cache_set *c = b->c; + struct cache *ca; + unsigned i, reserve = c->root->level * 2 + 1; + int ret = 0; + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->bucket_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + ret = -EINTR; + break; + } + + mutex_unlock(&c->bucket_lock); + return ret; +} + /* Garbage collection */ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) @@ -1428,7 +1453,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!IS_ERR(last->b)) { should_rewrite = btree_gc_mark_node(last->b, gc); - if (should_rewrite) { + if (should_rewrite && + !btree_check_reserve(b, NULL)) { n = btree_node_alloc_replacement(last->b, false); @@ -2071,6 +2097,10 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_init_stack(&cl); bch_keylist_init(&parent_keys); + if (!b->level && + btree_check_reserve(b, op)) + return -EINTR; + n1 = btree_node_alloc_replacement(b, true); if (IS_ERR(n1)) goto err; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d68af7442f7..4f0378ac1f7 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -241,6 +241,9 @@ void bkey_put(struct cache_set *c, struct bkey *k); /* Recursing down the btree */ struct btree_op { + /* for waiting on btree reserve in btree_split() */ + wait_queue_t wait; + /* Btree level at which we start taking write locks */ short lock; @@ -250,6 +253,7 @@ struct btree_op { static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) { memset(op, 0, sizeof(struct btree_op)); + init_wait(&op->wait); op->lock = write_lock_level; } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 052bd24d24b..9eb60d102de 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c) for_each_cache(ca, c, i) { unsigned sectors_to_move = 0; unsigned reserve_sectors = ca->sb.bucket_size * - min(fifo_used(&ca->free), ca->free.size / 2); + fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b057676fc67..63ebef78df4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -444,7 +444,7 @@ static int __uuid_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -562,8 +562,8 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), + // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; @@ -582,7 +582,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -1767,6 +1767,7 @@ err: void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); + unsigned i; if (ca->set) ca->set->cache[ca->sb.nr_this_dev] = NULL; @@ -1780,7 +1781,9 @@ void bch_cache_release(struct kobject *kobj) free_heap(&ca->heap); free_fifo(&ca->unused); free_fifo(&ca->free_inc); - free_fifo(&ca->free); + + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); if (ca->sb_bio.bi_inline_vecs[0].bv_page) put_page(ca->sb_bio.bi_io_vec[0].bv_page); @@ -1806,10 +1809,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; - free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; - free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); + free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; - if (!init_fifo(&ca->free, free, GFP_KERNEL) || + if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || !init_heap(&ca->heap, free << 3, GFP_KERNEL) || diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index a1f85612f0b..d5dd282b176 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); rw_attribute(expensive_debug_checks); -rw_attribute(freelist_percent); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); @@ -711,9 +710,6 @@ SHOW(__bch_cache) sysfs_print(io_errors, atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); - sysfs_print(freelist_percent, ca->free.size * 100 / - ((size_t) ca->sb.nbuckets)); - if (attr == &sysfs_cache_replacement_policy) return bch_snprint_string_list(buf, PAGE_SIZE, cache_replacement_policies, @@ -820,32 +816,6 @@ STORE(__bch_cache) } } - if (attr == &sysfs_freelist_percent) { - DECLARE_FIFO(long, free); - long i; - size_t p = strtoul_or_return(buf); - - p = clamp_t(size_t, - ((size_t) ca->sb.nbuckets * p) / 100, - roundup_pow_of_two(ca->sb.nbuckets) >> 9, - ca->sb.nbuckets / 2); - - if (!init_fifo_exact(&free, p, GFP_KERNEL)) - return -ENOMEM; - - mutex_lock(&ca->set->bucket_lock); - - fifo_move(&free, &ca->free); - fifo_swap(&free, &ca->free); - - mutex_unlock(&ca->set->bucket_lock); - - while (fifo_pop(&free, i)) - atomic_dec(&ca->buckets[i].pin); - - free_fifo(&free); - } - if (attr == &sysfs_clear_stats) { atomic_long_set(&ca->sectors_written, 0); atomic_long_set(&ca->btree_sectors_written, 0); @@ -869,7 +839,6 @@ static struct attribute *bch_cache_files[] = { &sysfs_metadata_written, &sysfs_io_errors, &sysfs_clear_stats, - &sysfs_freelist_percent, &sysfs_cache_replacement_policy, NULL }; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 095c6e4fe1e..0c5cf2f63dc 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -411,7 +411,7 @@ TRACE_EVENT(bcache_alloc_invalidate, ), TP_fast_assign( - __entry->free = fifo_used(&ca->free); + __entry->free = fifo_used(&ca->free[RESERVE_NONE]); __entry->free_inc = fifo_used(&ca->free_inc); __entry->free_inc_size = ca->free_inc.size; __entry->unused = fifo_used(&ca->unused); @@ -422,8 +422,8 @@ TRACE_EVENT(bcache_alloc_invalidate, ); TRACE_EVENT(bcache_alloc_fail, - TP_PROTO(struct cache *ca), - TP_ARGS(ca), + TP_PROTO(struct cache *ca, unsigned reserve), + TP_ARGS(ca, reserve), TP_STRUCT__entry( __field(unsigned, free ) @@ -433,7 +433,7 @@ TRACE_EVENT(bcache_alloc_fail, ), TP_fast_assign( - __entry->free = fifo_used(&ca->free); + __entry->free = fifo_used(&ca->free[reserve]); __entry->free_inc = fifo_used(&ca->free_inc); __entry->unused = fifo_used(&ca->unused); __entry->blocked = atomic_read(&ca->set->prio_blocked); -- cgit v1.2.3-70-g09d2 From 5f5837d2d650db25b9153b91535e67a96b265f58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2013 16:38:49 -0800 Subject: bcache: Do bkey_put() in btree_split() error path This error path shouldn't have been hit in practice.. and we've got reworked reserve code coming soon so that it shouldn't _ever_ be bit... but if we've got code for this error path it should be correct. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6a0f5faf0be..5a6b7522eb2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2191,18 +2191,21 @@ static int btree_split(struct btree *b, struct btree_op *op, return 0; err_free2: + bkey_put(b->c, &n2->key); btree_node_free(n2); rw_unlock(true, n2); err_free1: + bkey_put(b->c, &n1->key); btree_node_free(n1); rw_unlock(true, n1); err: + WARN(1, "bcache: btree split failed"); + if (n3 == ERR_PTR(-EAGAIN) || n2 == ERR_PTR(-EAGAIN) || n1 == ERR_PTR(-EAGAIN)) return -EAGAIN; - pr_warn("couldn't split"); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From c78afc6261b09f74abff8c0719b80692a4959768 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2013 22:39:53 -0700 Subject: bcache/md: Use raid stripe size Now that we've got code for raid5/6 stripe awareness, bcache just needs to know about the stripes and when writing partial stripes is expensive - we probably don't want to enable this optimization for raid1 or 10, even though they have stripes. So add a flag to queue_limits. Signed-off-by: Kent Overstreet --- block/blk-settings.c | 4 ++++ drivers/md/bcache/super.c | 6 ++++++ drivers/md/raid5.c | 1 + include/linux/blkdev.h | 1 + 4 files changed, 12 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 05e826793e4..5d21239bc85 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -592,6 +592,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + t->raid_partial_stripes_expensive = + max(t->raid_partial_stripes_expensive, + b->raid_partial_stripes_expensive); + /* Find lowest common alignment_offset */ t->alignment_offset = lcm(t->alignment_offset, alignment) & (max(t->physical_block_size, t->io_min) - 1); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 63ebef78df4..e363efcf2b7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1134,6 +1134,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } + dc->disk.stripe_size = q->limits.io_opt >> 9; + + if (dc->disk.stripe_size) + dc->partial_stripes_expensive = + q->limits.raid_partial_stripes_expensive; + ret = bcache_device_init(&dc->disk, block_size, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); if (ret) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index eea63372e4d..1cfb22c025b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6101,6 +6101,7 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + mddev->queue->limits.raid_partial_stripes_expensive = 1; /* * We can only discard a whole stripe. It doesn't make sense to * discard data disk but write parity disk diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02cb6f0ea71..0375654adb2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -291,6 +291,7 @@ struct queue_limits { unsigned char discard_misaligned; unsigned char cluster; unsigned char discard_zeroes_data; + unsigned char raid_partial_stripes_expensive; }; struct request_queue { -- cgit v1.2.3-70-g09d2 From 5c41c8a713da7874bd79196696a275beeb11821e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 Jul 2013 17:53:26 -0700 Subject: bcache: Trivial error handling fix Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e363efcf2b7..677a604e7f3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2065,7 +2065,8 @@ static void bcache_exit(void) kobject_put(bcache_kobj); if (bcache_wq) destroy_workqueue(bcache_wq); - unregister_blkdev(bcache_major, "bcache"); + if (bcache_major) + unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); } -- cgit v1.2.3-70-g09d2 From 88b9f8c426f35e04738220c1bc05dd1ea1b513a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 21:46:35 -0800 Subject: bcache: kill index() That was a terrible name for a macro, add some better helpers to replace it. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 4 ---- drivers/md/bcache/btree.c | 6 +++--- drivers/md/bcache/btree.h | 20 ++++++++++++++++++++ drivers/md/bcache/debug.c | 2 +- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 94d346e2ea1..d955a493461 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -736,10 +736,6 @@ static inline unsigned local_clock_us(void) #define node(i, j) ((struct bkey *) ((i)->d + (j))) #define end(i) node(i, (i)->keys) -#define index(i, b) \ - ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ - block_bytes(b->c))) - #define btree_data_space(b) (PAGE_SIZE << (b)->page_order) #define prios_per_bucket(c) \ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5a6b7522eb2..8e2573a009f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -258,7 +258,7 @@ static void bch_btree_node_read_done(struct btree *b) err = "corrupted btree"; for (i = write_block(b); - index(i, b) < btree_blocks(b); + bset_sector_offset(b, i) < KEY_SIZE(&b->key); i = ((void *) i) + block_bytes(b->c)) if (i->seq == b->sets[0].data->seq) goto err; @@ -278,9 +278,9 @@ out: return; err: set_btree_node_io_error(b); - bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", + bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", err, PTR_BUCKET_NR(b->c, &b->key, 0), - index(i, b), i->keys); + bset_block_offset(b, i), i->keys); goto out; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 4f0378ac1f7..12c99b1a764 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -185,6 +185,26 @@ static inline unsigned bset_offset(struct btree *b, struct bset *i) return (((size_t) i) - ((size_t) b->sets->data)) >> 9; } +static inline struct bset *btree_bset_first(struct btree *b) +{ + return b->sets->data; +} + +static inline unsigned bset_byte_offset(struct btree *b, struct bset *i) +{ + return ((size_t) i) - ((size_t) b->sets->data); +} + +static inline unsigned bset_sector_offset(struct btree *b, struct bset *i) +{ + return (((void *) i) - ((void *) btree_bset_first(b))) >> 9; +} + +static inline unsigned bset_block_offset(struct btree *b, struct bset *i) +{ + return bset_sector_offset(b, i) >> b->c->block_bits; +} + static inline struct bset *write_block(struct btree *b) { return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index fab3767d6d2..473e8d5a7fe 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -88,7 +88,7 @@ static void dump_bset(struct btree *b, struct bset *i) next = bkey_next(k); bch_bkey_to_text(buf, sizeof(buf), k); - printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), + printk(KERN_ERR "block %u key %zi/%u: %s", bset_block_offset(b, i), (uint64_t *) k - i->d, i->keys, buf); for (j = 0; j < KEY_PTRS(k); j++) { -- cgit v1.2.3-70-g09d2 From 78b77bf8b20431f8ad8a4db7e3120103bd922337 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 22:49:08 -0800 Subject: bcache: Btree verify code improvements Used this fixed code to find and fix the bug fixed by a4d885097b0ac0cd1337f171f2d4b83e946094d4. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/bset.c | 3 -- drivers/md/bcache/btree.c | 16 +++++++- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/debug.c | 97 +++++++++++++++++++++++++++++++--------------- drivers/md/bcache/debug.h | 4 +- 6 files changed, 83 insertions(+), 40 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d955a493461..eb6f2e6927a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -629,6 +629,7 @@ struct cache_set { #ifdef CONFIG_BCACHE_DEBUG struct btree *verify_data; + struct bset *verify_ondisk; struct mutex verify_lock; #endif diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e51a739f751..98f0ced236b 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1060,9 +1060,6 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, btree_mergesort(b, out, iter, fixup, remove_stale); b->nsets = start; - if (!fixup && !start && b->written) - bch_btree_verify(b, out); - if (!start && order == b->page_order) { /* * Our temporary buffer is the same size as the btree node's diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 8e2573a009f..f035ae3b128 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -203,7 +203,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void bch_btree_node_read_done(struct btree *b) +void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = b->sets[0].data; @@ -290,7 +290,7 @@ static void btree_node_read_endio(struct bio *bio, int error) closure_put(cl); } -void bch_btree_node_read(struct btree *b) +static void bch_btree_node_read(struct btree *b) { uint64_t start_time = local_clock(); struct closure cl; @@ -478,6 +478,13 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) bch_btree_sort_lazy(b); + /* + * do verify if there was more than one set initially (i.e. we did a + * sort) and we sorted down to a single set: + */ + if (i != b->sets->data && !b->nsets) + bch_btree_verify(b); + if (b->written < btree_blocks(b)) bch_bset_init_next(b); } @@ -782,6 +789,8 @@ void bch_btree_cache_free(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); #endif list_splice(&c->btree_cache_freeable, @@ -822,6 +831,9 @@ int bch_btree_cache_alloc(struct cache_set *c) #ifdef CONFIG_BCACHE_DEBUG mutex_init(&c->verify_lock); + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); if (c->verify_data && diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 12c99b1a764..580b0113726 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -292,7 +292,7 @@ static inline void rw_unlock(bool w, struct btree *b) (w ? up_write : up_read)(&b->lock); } -void bch_btree_node_read(struct btree *); +void bch_btree_node_read_done(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 473e8d5a7fe..8887c550d56 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -53,18 +53,18 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); - - if (KEY_PTRS(k)) - while (1) { - p("%llu:%llu gen %llu", - PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); - - if (++i == KEY_PTRS(k)) - break; + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + for (i = 0; i < KEY_PTRS(k); i++) { + if (i) p(", "); - } + + if (PTR_DEV(k, i) == PTR_CHECK_DEV) + p("check dev"); + else + p("%llu:%llu gen %llu", PTR_DEV(k, i), + PTR_OFFSET(k, i), PTR_GEN(k, i)); + } p("]"); @@ -78,7 +78,7 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) #ifdef CONFIG_BCACHE_DEBUG -static void dump_bset(struct btree *b, struct bset *i) +static void dump_bset(struct btree *b, struct bset *i, unsigned set) { struct bkey *k, *next; unsigned j; @@ -88,7 +88,7 @@ static void dump_bset(struct btree *b, struct bset *i) next = bkey_next(k); bch_bkey_to_text(buf, sizeof(buf), k); - printk(KERN_ERR "block %u key %zi/%u: %s", bset_block_offset(b, i), + printk(KERN_ERR "b %u k %zi/%u: %s", set, (uint64_t *) k - i->d, i->keys, buf); for (j = 0; j < KEY_PTRS(k); j++) { @@ -114,50 +114,83 @@ static void bch_dump_bucket(struct btree *b) console_lock(); for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); + dump_bset(b, b->sets[i].data, + bset_block_offset(b, b->sets[i].data)); console_unlock(); } -void bch_btree_verify(struct btree *b, struct bset *new) +#define for_each_written_bset(b, start, i) \ + for (i = (start); \ + (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ + i->seq == (start)->seq; \ + i = (void *) i + set_blocks(i, b->c) * block_bytes(b->c)) + +void bch_btree_verify(struct btree *b) { struct btree *v = b->c->verify_data; - struct closure cl; - closure_init_stack(&cl); + struct bset *ondisk, *sorted, *inmemory; + struct bio *bio; - if (!b->c->verify) + if (!b->c->verify || !b->c->verify_ondisk) return; down(&b->io_mutex); mutex_lock(&b->c->verify_lock); + ondisk = b->c->verify_ondisk; + sorted = b->c->verify_data->sets->data; + inmemory = b->sets->data; + bkey_copy(&v->key, &b->key); v->written = 0; v->level = b->level; - bch_btree_node_read(v); + bio = bch_bbio_alloc(b->c); + bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; + bch_bio_map(bio, sorted); - if (new->keys != v->sets[0].data->keys || - memcmp(new->start, - v->sets[0].data->start, - (void *) end(new) - (void *) new->start)) { - unsigned i, j; + submit_bio_wait(REQ_META|READ_SYNC, bio); + bch_bbio_free(bio, b->c); + + memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); + + bch_btree_node_read_done(v); + sorted = v->sets->data; + + if (inmemory->keys != sorted->keys || + memcmp(inmemory->start, + sorted->start, + (void *) end(inmemory) - (void *) inmemory->start)) { + struct bset *i; + unsigned j; console_lock(); - printk(KERN_ERR "*** original memory node:\n"); - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); + printk(KERN_ERR "*** in memory:\n"); + dump_bset(b, inmemory, 0); - printk(KERN_ERR "*** sorted memory node:\n"); - dump_bset(b, new); + printk(KERN_ERR "*** read back in:\n"); + dump_bset(v, sorted, 0); - printk(KERN_ERR "*** on disk node:\n"); - dump_bset(v, v->sets[0].data); + for_each_written_bset(b, ondisk, i) { + unsigned block = ((void *) i - (void *) ondisk) / + block_bytes(b->c); - for (j = 0; j < new->keys; j++) - if (new->d[j] != v->sets[0].data->d[j]) + printk(KERN_ERR "*** on disk block %u:\n", block); + dump_bset(b, i, block); + } + + printk(KERN_ERR "*** block %zu not written\n", + ((void *) i - (void *) ondisk) / block_bytes(b->c)); + + for (j = 0; j < inmemory->keys; j++) + if (inmemory->d[j] != sorted->d[j]) break; + printk(KERN_ERR "b->written %u\n", b->written); + console_unlock(); panic("verify failed at %u\n", j); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 2ede60e3187..08e116e74d3 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -7,7 +7,7 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); #ifdef CONFIG_BCACHE_DEBUG -void bch_btree_verify(struct btree *, struct bset *); +void bch_btree_verify(struct btree *); void bch_data_verify(struct cached_dev *, struct bio *); int __bch_count_data(struct btree *); void __bch_check_keys(struct btree *, const char *, ...); @@ -20,7 +20,7 @@ void bch_btree_iter_next_check(struct btree_iter *); #else /* DEBUG */ -static inline void bch_btree_verify(struct btree *b, struct bset *i) {} +static inline void bch_btree_verify(struct btree *b) {} static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} static inline int __bch_count_data(struct btree *b) { return -1; } static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} -- cgit v1.2.3-70-g09d2 From 0a45114534766058193eb2605c136562a4f7bcc8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 18 Dec 2013 00:01:06 -0800 Subject: bcache: Use a mempool for mergesort temporary space It was a single element mempool before, it's slightly cleaner to just use a real mempool. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 7 +------ drivers/md/bcache/bset.c | 12 +++++------- drivers/md/bcache/super.c | 5 ++--- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index eb6f2e6927a..25f017248a8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -645,12 +645,7 @@ struct cache_set { */ mempool_t *fill_iter; - /* - * btree_sort() is a merge sort and requires temporary space - single - * element mempool - */ - struct mutex sort_lock; - struct bset *sort; + mempool_t *sort_pool; unsigned sort_crit_factor; /* List of buckets we're currently writing data to */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 98f0ced236b..e6882659afb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1047,11 +1047,12 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, { uint64_t start_time; bool remove_stale = !b->written; + bool used_mempool = false; struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, order); if (!out) { - mutex_lock(&b->c->sort_lock); - out = b->c->sort; + out = page_address(mempool_alloc(b->c->sort_pool, GFP_NOIO)); + used_mempool = true; order = ilog2(bucket_pages(b->c)); } @@ -1071,17 +1072,14 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, out->seq = b->sets[0].data->seq; out->version = b->sets[0].data->version; swap(out, b->sets[0].data); - - if (b->c->sort == b->sets[0].data) - b->c->sort = out; } else { b->sets[start].data->keys = out->keys; memcpy(b->sets[start].data->start, out->start, (void *) end(out) - (void *) out->start); } - if (out == b->c->sort) - mutex_unlock(&b->c->sort_lock); + if (used_mempool) + mempool_free(virt_to_page(out), b->c->sort_pool); else free_pages((unsigned long) out, order); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 677a604e7f3..d05e7562771 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1352,7 +1352,6 @@ static void cache_set_free(struct closure *cl) kobject_put(&ca->kobj); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); - free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); if (c->bio_split) bioset_free(c->bio_split); @@ -1489,7 +1488,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->bucket_wait); sema_init(&c->uuid_write_mutex, 1); - mutex_init(&c->sort_lock); spin_lock_init(&c->sort_time.lock); spin_lock_init(&c->btree_gc_time.lock); @@ -1519,7 +1517,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bucket_pages(c))) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || + !(c->sort_pool = mempool_create_page_pool(1, + ilog2(bucket_pages(c)))) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || -- cgit v1.2.3-70-g09d2 From 9a02b7eeeb446a0418ec83afc80eb38bc188f5c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2013 17:24:46 -0800 Subject: bcache: Remove/fix some header dependencies In the process of disentagling/libraryizing bset.c from the rest of the bcache code. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 23 ++++++++++++++++++----- drivers/md/bcache/bset.h | 21 ++------------------- drivers/md/bcache/util.h | 6 ++++++ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 25f017248a8..3fd87323368 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -704,11 +704,6 @@ struct bbio { struct bio bio; }; -static inline unsigned local_clock_us(void) -{ - return local_clock() >> 10; -} - #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768 @@ -776,6 +771,24 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); } +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + return r > 128U ? 0 : r; +} + +static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); +} + +static inline bool ptr_available(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); +} + /* Btree key macros */ static inline void bkey_init(struct bkey *k) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 1d3c24f9fa0..ba02f055bc6 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -142,6 +142,8 @@ * first key in that range of bytes again. */ +struct cache_set; + /* Btree key comparison/iteration */ #define MAX_BSETS 4U @@ -282,25 +284,6 @@ bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); bool bch_ptr_bad(struct btree *, const struct bkey *); -static inline uint8_t gen_after(uint8_t a, uint8_t b) -{ - uint8_t r = a - b; - return r > 128U ? 0 : r; -} - -static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, - unsigned i) -{ - return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); -} - -static inline bool ptr_available(struct cache_set *c, const struct bkey *k, - unsigned i) -{ - return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); -} - - typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); struct bkey *bch_btree_iter_next(struct btree_iter *); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1030c6020e9..3ebaef5f645 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -2,6 +2,7 @@ #ifndef _BCACHE_UTIL_H #define _BCACHE_UTIL_H +#include #include #include #include @@ -391,6 +392,11 @@ struct time_stats { void bch_time_stats_update(struct time_stats *stats, uint64_t time); +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + #define NSEC_PER_ns 1L #define NSEC_PER_us NSEC_PER_USEC #define NSEC_PER_ms NSEC_PER_MSEC -- cgit v1.2.3-70-g09d2 From 085d2a3dd4d65b7bce1dead987c647dbbc014281 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 18:20:51 -0800 Subject: bcache: Make bch_keylist_realloc() take u64s, not nptrs Getting away from KEY_PTRS and moving toward KEY_U64s - and getting rid of magic 2s Also - split out the part that checks against journal entry size so as to avoid a dependancy on struct cache_set in bset.c Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 12 ++---------- drivers/md/bcache/bset.h | 2 +- drivers/md/bcache/btree.c | 6 +++--- drivers/md/bcache/request.c | 22 ++++++++++++++++++++-- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e6882659afb..f91347a55c4 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -14,21 +14,13 @@ /* Keylists */ -int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) +int __bch_keylist_realloc(struct keylist *l, unsigned u64s) { size_t oldsize = bch_keylist_nkeys(l); - size_t newsize = oldsize + 2 + nptrs; + size_t newsize = oldsize + u64s; uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; uint64_t *new_keys; - /* The journalling code doesn't handle the case where the keys to insert - * is bigger than an empty write: If we just return -ENOMEM here, - * bio_insert() and bio_invalidate() will insert the keys created so far - * and finish the rest when the keylist is empty. - */ - if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) - return -ENOMEM; - newsize = roundup_pow_of_two(newsize); if (newsize <= KEYLIST_INLINE || diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ba02f055bc6..303d31a3b9e 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -259,7 +259,7 @@ static inline size_t bch_keylist_bytes(struct keylist *l) struct bkey *bch_keylist_pop(struct keylist *); void bch_keylist_pop_front(struct keylist *); -int bch_keylist_realloc(struct keylist *, int, struct cache_set *); +int __bch_keylist_realloc(struct keylist *, unsigned); void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, unsigned); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f035ae3b128..f0a6399fdd3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1356,8 +1356,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, n2->keys -= keys; - if (bch_keylist_realloc(keylist, - KEY_PTRS(&new_nodes[i]->key), b->c)) + if (__bch_keylist_realloc(keylist, + bkey_u64s(&new_nodes[i]->key))) goto out_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); @@ -1365,7 +1365,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, } for (i = 0; i < nodes; i++) { - if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) + if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key))) goto out_nocoalesce; make_btree_freeing_key(r[i].b, keylist->top); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index cce02f19e6c..fcdb59f9ca9 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -255,6 +255,24 @@ static void bch_data_insert_keys(struct closure *cl) closure_return(cl); } +static int bch_keylist_realloc(struct keylist *l, unsigned u64s, + struct cache_set *c) +{ + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + + /* + * The journalling code doesn't handle the case where the keys to insert + * is bigger than an empty write: If we just return -ENOMEM here, + * bio_insert() and bio_invalidate() will insert the keys created so far + * and finish the rest when the keylist is empty. + */ + if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + return -ENOMEM; + + return __bch_keylist_realloc(l, u64s); +} + static void bch_data_invalidate(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); @@ -267,7 +285,7 @@ static void bch_data_invalidate(struct closure *cl) unsigned sectors = min(bio_sectors(bio), 1U << (KEY_SIZE_BITS - 1)); - if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) + if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) goto out; bio->bi_iter.bi_sector += sectors; @@ -357,7 +375,7 @@ static void bch_data_insert_start(struct closure *cl) /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&op->insert_keys, - 1 + (op->csum ? 1 : 0), + 3 + (op->csum ? 1 : 0), op->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); -- cgit v1.2.3-70-g09d2 From fafff81cead78157099df1ee10af16cc51893ddc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 21:56:21 -0800 Subject: bcache: Bkey indexing renaming More refactoring: node() -> bset_bkey_idx() end() -> bset_bkey_last() Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 11 ++--------- drivers/md/bcache/bset.c | 28 ++++++++++++++-------------- drivers/md/bcache/bset.h | 30 ++++++++++++++++++++++-------- drivers/md/bcache/btree.c | 33 ++++++++++++++++++--------------- drivers/md/bcache/debug.c | 6 +++--- drivers/md/bcache/journal.c | 6 +++--- include/uapi/linux/bcache.h | 2 +- 7 files changed, 63 insertions(+), 53 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 3fd87323368..2b46c86ac44 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -724,9 +724,6 @@ struct bbio { #define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) #define set_blocks(i, c) __set_blocks(i, (i)->keys, c) -#define node(i, j) ((struct bkey *) ((i)->d + (j))) -#define end(i) node(i, (i)->keys) - #define btree_data_space(b) (PAGE_SIZE << (b)->page_order) #define prios_per_bucket(c) \ @@ -791,18 +788,14 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k, /* Btree key macros */ -static inline void bkey_init(struct bkey *k) -{ - *k = ZERO_KEY; -} - /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs */ #define csum_set(i) \ bch_crc64(((void *) (i)) + sizeof(uint64_t), \ - ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) + ((void *) bset_bkey_last(i)) - \ + (((void *) (i)) + sizeof(uint64_t))) /* Error handling macros */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f91347a55c4..bfee926e35f 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -500,7 +500,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) : tree_to_prev_bkey(t, j >> ffs(j)); struct bkey *r = is_power_of_2(j + 1) - ? node(t->data, t->data->keys - bkey_u64s(&t->end)) + ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) : tree_to_bkey(t, j >> (ffz(j) + 1)); BUG_ON(m < l || m > r); @@ -559,7 +559,7 @@ static void bset_build_written_tree(struct btree *b) bset_alloc_tree(b, t); t->size = min_t(unsigned, - bkey_to_cacheline(t, end(t->data)), + bkey_to_cacheline(t, bset_bkey_last(t->data)), b->sets->tree + bset_tree_space(b) - t->tree); if (t->size < 2) { @@ -582,7 +582,7 @@ static void bset_build_written_tree(struct btree *b) t->tree[j].m = bkey_to_cacheline_offset(k); } - while (bkey_next(k) != end(t->data)) + while (bkey_next(k) != bset_bkey_last(t->data)) k = bkey_next(k); t->end = *k; @@ -600,7 +600,7 @@ void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) unsigned inorder, j = 1; for (t = b->sets; t <= &b->sets[b->nsets]; t++) - if (k < end(t->data)) + if (k < bset_bkey_last(t->data)) goto found_set; BUG(); @@ -613,7 +613,7 @@ found_set: if (k == t->data->start) goto fix_left; - if (bkey_next(k) == end(t->data)) { + if (bkey_next(k) == bset_bkey_last(t->data)) { t->end = *k; goto fix_right; } @@ -679,7 +679,7 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) /* Possibly add a new entry to the end of the lookup table */ for (k = table_to_bkey(t, t->size - 1); - k != end(t->data); + k != bset_bkey_last(t->data); k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) { t->prev[t->size] = bkey_to_cacheline_offset(k); @@ -715,7 +715,7 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, unsigned li = 0, ri = t->size; BUG_ON(!b->nsets && - t->size < bkey_to_cacheline(t, end(t->data))); + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); while (li + 1 != ri) { unsigned m = (li + ri) >> 1; @@ -728,7 +728,7 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, return (struct bset_search_iter) { table_to_bkey(t, li), - ri < t->size ? table_to_bkey(t, ri) : end(t->data) + ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) }; } @@ -780,7 +780,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b, f = &t->tree[inorder_next(j, t->size)]; r = cacheline_to_bkey(t, inorder, f->m); } else - r = end(t->data); + r = bset_bkey_last(t->data); } else { r = cacheline_to_bkey(t, inorder, f->m); @@ -816,7 +816,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, if (unlikely(!t->size)) { i.l = t->data->start; - i.r = end(t->data); + i.r = bset_bkey_last(t->data); } else if (bset_written(b, t)) { /* * Each node in the auxiliary search tree covers a certain range @@ -826,7 +826,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, */ if (unlikely(bkey_cmp(search, &t->end) >= 0)) - return end(t->data); + return bset_bkey_last(t->data); if (unlikely(bkey_cmp(search, t->data->start) < 0)) return t->data->start; @@ -842,7 +842,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, inorder_to_tree(bkey_to_cacheline(t, i.l), t)), search) > 0); - BUG_ON(i.r != end(t->data) && + BUG_ON(i.r != bset_bkey_last(t->data) && bkey_cmp(i.r, search) <= 0); } @@ -897,7 +897,7 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, for (; start <= &b->sets[b->nsets]; start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, end(start->data)); + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); } return ret; @@ -1067,7 +1067,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, } else { b->sets[start].data->keys = out->keys; memcpy(b->sets[start].data->start, out->start, - (void *) end(out) - (void *) out->start); + (void *) bset_bkey_last(out) - (void *) out->start); } if (used_mempool) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 303d31a3b9e..88b6edbf508 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -190,14 +190,6 @@ struct bset_tree { struct bset *data; }; -static __always_inline int64_t bkey_cmp(const struct bkey *l, - const struct bkey *r) -{ - return unlikely(KEY_INODE(l) != KEY_INODE(r)) - ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) - : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); -} - /* Keylists */ struct keylist { @@ -261,6 +253,28 @@ struct bkey *bch_keylist_pop(struct keylist *); void bch_keylist_pop_front(struct keylist *); int __bch_keylist_realloc(struct keylist *, unsigned); +/* Bkey utility code */ + +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) + +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) +{ + return bkey_idx(i->start, idx); +} + +static inline void bkey_init(struct bkey *k) +{ + *k = ZERO_KEY; +} + +static __always_inline int64_t bkey_cmp(const struct bkey *l, + const struct bkey *r) +{ + return unlikely(KEY_INODE(l) != KEY_INODE(r)) + ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) + : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); +} + void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, unsigned); bool __bch_cut_front(const struct bkey *, struct bkey *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f0a6399fdd3..8aaaf16637a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -197,7 +197,7 @@ void bkey_put(struct cache_set *c, struct bkey *k) static uint64_t btree_csum_set(struct btree *b, struct bset *i) { uint64_t crc = b->key.ptr[0]; - void *data = (void *) i + 8, *end = end(i); + void *data = (void *) i + 8, *end = bset_bkey_last(i); crc = bch_crc64_update(crc, data, end - data); return crc ^ 0xffffffffffffffffULL; @@ -251,7 +251,7 @@ void bch_btree_node_read_done(struct btree *b) if (i != b->sets[0].data && !i->keys) goto err; - bch_btree_iter_push(iter, i->start, end(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); b->written += set_blocks(i, b->c); } @@ -1310,7 +1310,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (i > 1) { for (k = n2->start; - k < end(n2); + k < bset_bkey_last(n2); k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + bkey_u64s(k), b->c) > blocks) @@ -1343,16 +1343,17 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (last) bkey_copy_key(&new_nodes[i]->key, last); - memcpy(end(n1), + memcpy(bset_bkey_last(n1), n2->start, - (void *) node(n2, keys) - (void *) n2->start); + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); n1->keys += keys; r[i].keys = n1->keys; memmove(n2->start, - node(n2, keys), - (void *) end(n2) - (void *) node(n2, keys)); + bset_bkey_idx(n2, keys), + (void *) bset_bkey_last(n2) - + (void *) bset_bkey_idx(n2, keys)); n2->keys -= keys; @@ -1830,7 +1831,7 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) memmove((uint64_t *) where + bkey_u64s(insert), where, - (void *) end(i) - (void *) where); + (void *) bset_bkey_last(i) - (void *) where); i->keys += bkey_u64s(insert); bkey_copy(where, insert); @@ -2014,7 +2015,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); - while (m != end(i) && + while (m != bset_bkey_last(i) && bkey_cmp(k, &START_KEY(m)) > 0) prev = m, m = bkey_next(m); @@ -2028,12 +2029,12 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, goto merged; status = BTREE_INSERT_STATUS_OVERWROTE; - if (m != end(i) && + if (m != bset_bkey_last(i) && KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) goto copy; status = BTREE_INSERT_STATUS_FRONT_MERGE; - if (m != end(i) && + if (m != bset_bkey_last(i) && bch_bkey_try_merge(b, k, m)) goto copy; } else { @@ -2142,16 +2143,18 @@ static int btree_split(struct btree *b, struct btree_op *op, */ while (keys < (n1->sets[0].data->keys * 3) / 5) - keys += bkey_u64s(node(n1->sets[0].data, keys)); + keys += bkey_u64s(bset_bkey_idx(n1->sets[0].data, + keys)); - bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); - keys += bkey_u64s(node(n1->sets[0].data, keys)); + bkey_copy_key(&n1->key, + bset_bkey_idx(n1->sets[0].data, keys)); + keys += bkey_u64s(bset_bkey_idx(n1->sets[0].data, keys)); n2->sets[0].data->keys = n1->sets[0].data->keys - keys; n1->sets[0].data->keys = keys; memcpy(n2->sets[0].data->start, - end(n1->sets[0].data), + bset_bkey_last(n1->sets[0].data), n2->sets[0].data->keys * sizeof(uint64_t)); bkey_copy_key(&n2->key, &b->key); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8887c550d56..955fa1d3177 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -84,7 +84,7 @@ static void dump_bset(struct btree *b, struct bset *i, unsigned set) unsigned j; char buf[80]; - for (k = i->start; k < end(i); k = next) { + for (k = i->start; k < bset_bkey_last(i); k = next) { next = bkey_next(k); bch_bkey_to_text(buf, sizeof(buf), k); @@ -102,7 +102,7 @@ static void dump_bset(struct btree *b, struct bset *i, unsigned set) printk(" %s\n", bch_ptr_status(b->c, k)); - if (next < end(i) && + if (next < bset_bkey_last(i) && bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) printk(KERN_ERR "Key skipped backwards\n"); } @@ -162,7 +162,7 @@ void bch_btree_verify(struct btree *b) if (inmemory->keys != sorted->keys || memcmp(inmemory->start, sorted->start, - (void *) end(inmemory) - (void *) inmemory->start)) { + (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { struct bset *i; unsigned j; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 9d32d579082..5e14e3325ec 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -284,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { unsigned j; @@ -322,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) n, i->j.seq - 1, start, end); for (k = i->j.start; - k < end(&i->j); + k < bset_bkey_last(&i->j); k = bkey_next(k)) { trace_bcache_journal_replay_key(k); @@ -751,7 +751,7 @@ atomic_t *bch_journal(struct cache_set *c, w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); - memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); + memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); w->data->keys += bch_keylist_nkeys(keys); ret = &fifo_back(&c->journal.pin); diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 164a7e26398..ae66311be82 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -118,7 +118,7 @@ static inline struct bkey *bkey_next(const struct bkey *k) return (struct bkey *) (d + bkey_u64s(k)); } -static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) +static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys) { __u64 *d = (void *) k; return (struct bkey *) (d + nr_keys); -- cgit v1.2.3-70-g09d2 From 911c9610099f26e9e6ea3d1962ce24f53890b163 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 28 Jul 2013 18:35:09 -0700 Subject: bcache: Split out sort_extent_cmp() Only use extent comparison for comparing extents, so we're not using START_KEY() on other key types (i.e. btree pointers) Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 84 +++++++++++++++++++++++++++++++++++------------ drivers/md/bcache/bset.h | 4 +-- drivers/md/bcache/btree.c | 10 ++++-- drivers/md/bcache/btree.h | 7 ---- 4 files changed, 73 insertions(+), 32 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index bfee926e35f..9e3a53d87de 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -855,19 +855,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, /* Btree iterator */ -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. - */ +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); + static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { - int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - - return c ? c > 0 : l.k < r.k; + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) @@ -884,8 +878,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) +static struct bkey *__bch_btree_iter_init(struct btree *b, + struct btree_iter *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); @@ -903,7 +899,15 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, return ret; } -struct bkey *bch_btree_iter_next(struct btree_iter *iter) +struct bkey *bch_btree_iter_init(struct btree *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->sets); +} + +static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, + btree_iter_cmp_fn *cmp) { struct btree_iter_set unused; struct bkey *ret = NULL; @@ -920,14 +924,20 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) } if (iter->data->k == iter->data->end) - heap_pop(iter, unused, btree_iter_cmp); + heap_pop(iter, unused, cmp); else - heap_sift(iter, 0, btree_iter_cmp); + heap_sift(iter, 0, cmp); } return ret; } +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + return __bch_btree_iter_next(iter, btree_iter_cmp); + +} + struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, struct btree *b, ptr_filter_fn fn) { @@ -951,13 +961,37 @@ static void sort_key_next(struct btree_iter *iter, *i = iter->data[--iter->used]; } -static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp) +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +static inline bool sort_extent_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static inline bool sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(l.k, r.k); + + return c ? c > 0 : l.k < r.k; +} + +static struct bkey *btree_sort_fixup_extents(struct btree_iter *iter, + struct bkey *tmp) { while (iter->used > 1) { struct btree_iter_set *top = iter->data, *i = top + 1; if (iter->used > 2 && - btree_iter_cmp(i[0], i[1])) + sort_extent_cmp(i[0], i[1])) i++; if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) @@ -965,7 +999,7 @@ static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp) if (!KEY_SIZE(i->k)) { sort_key_next(iter, i); - heap_sift(iter, i - top, btree_iter_cmp); + heap_sift(iter, i - top, sort_extent_cmp); continue; } @@ -975,7 +1009,7 @@ static struct bkey *btree_sort_fixup(struct btree_iter *iter, struct bkey *tmp) else bch_cut_front(top->k, i->k); - heap_sift(iter, i - top, btree_iter_cmp); + heap_sift(iter, i - top, sort_extent_cmp); } else { /* can't happen because of comparison func */ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); @@ -1001,20 +1035,28 @@ static void btree_mergesort(struct btree *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; + btree_iter_cmp_fn *cmp = b->level + ? sort_cmp + : sort_extent_cmp; bool (*bad)(struct btree *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; + /* Heapify the iterator, using our comparison function */ + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, cmp); + while (!btree_iter_end(iter)) { if (fixup && !b->level) - k = btree_sort_fixup(iter, &tmp.k); + k = btree_sort_fixup_extents(iter, &tmp.k); else k = NULL; if (!k) - k = bch_btree_iter_next(iter); + k = __bch_btree_iter_next(iter, cmp); if (bad(b, k)) continue; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 88b6edbf508..91bcbdb0408 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -305,8 +305,8 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *, struct btree *, ptr_filter_fn); void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, - struct bkey *, struct bset_tree *); +struct bkey *bch_btree_iter_init(struct btree *, struct btree_iter *, + struct bkey *); /* 32 bits total: */ #define BKEY_MID_BITS 3 diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 8aaaf16637a..e1e36e76172 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1854,10 +1854,16 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, while (1) { struct bkey *k = bch_btree_iter_next(iter); - if (!k || - bkey_cmp(&START_KEY(k), insert) >= 0) + if (!k) break; + if (bkey_cmp(&START_KEY(k), insert) >= 0) { + if (KEY_SIZE(k)) + break; + else + continue; + } + if (bkey_cmp(k, &START_KEY(insert)) <= 0) continue; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 580b0113726..2a5a8481f25 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -225,13 +225,6 @@ static inline void set_gc_sectors(struct cache_set *c) atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } -static inline struct bkey *bch_btree_iter_init(struct btree *b, - struct btree_iter *iter, - struct bkey *search) -{ - return __bch_btree_iter_init(b, iter, search, b->sets); -} - static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) { if (b->level) -- cgit v1.2.3-70-g09d2 From 67539e85289c14a76a1c4162613d14a5f05a0027 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 22:53:34 -0700 Subject: bcache: Add struct bset_sort_state More disentangling bset.c from the rest of the bcache code - soon, the sorting routines won't have any dependencies on any outside structs. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 5 ++-- drivers/md/bcache/bset.c | 68 ++++++++++++++++++++++++++++------------------ drivers/md/bcache/bset.h | 38 ++++++++++++++++++++------ drivers/md/bcache/btree.c | 14 +++++++--- drivers/md/bcache/super.c | 9 ++---- drivers/md/bcache/sysfs.c | 2 +- 6 files changed, 87 insertions(+), 49 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2b46c86ac44..7bd4c93475e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -187,6 +187,7 @@ #include #include +#include "bset.h" #include "util.h" #include "closure.h" @@ -645,8 +646,7 @@ struct cache_set { */ mempool_t *fill_iter; - mempool_t *sort_pool; - unsigned sort_crit_factor; + struct bset_sort_state sort; /* List of buckets we're currently writing data to */ struct list_head data_buckets; @@ -662,7 +662,6 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; struct time_stats btree_read_time; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 9e3a53d87de..9d9c2edda76 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -952,6 +952,26 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, /* Mergesort */ +void bch_bset_sort_state_free(struct bset_sort_state *state) +{ + if (state->pool) + mempool_destroy(state->pool); +} + +int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) +{ + spin_lock_init(&state->time.lock); + + state->page_order = page_order; + state->crit_factor = int_sqrt(1 << page_order); + + state->pool = mempool_create_page_pool(1, page_order); + if (!state->pool) + return -ENOMEM; + + return 0; +} + static void sort_key_next(struct btree_iter *iter, struct btree_iter_set *i) { @@ -1077,22 +1097,24 @@ static void btree_mergesort(struct btree *b, struct bset *out, } static void __btree_sort(struct btree *b, struct btree_iter *iter, - unsigned start, unsigned order, bool fixup) + unsigned start, unsigned order, bool fixup, + struct bset_sort_state *state) { uint64_t start_time; - bool remove_stale = !b->written; bool used_mempool = false; struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, order); if (!out) { - out = page_address(mempool_alloc(b->c->sort_pool, GFP_NOIO)); + BUG_ON(order > state->page_order); + + out = page_address(mempool_alloc(state->pool, GFP_NOIO)); used_mempool = true; order = ilog2(bucket_pages(b->c)); } start_time = local_clock(); - btree_mergesort(b, out, iter, fixup, remove_stale); + btree_mergesort(b, out, iter, fixup, false); b->nsets = start; if (!start && order == b->page_order) { @@ -1113,18 +1135,18 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, } if (used_mempool) - mempool_free(virt_to_page(out), b->c->sort_pool); + mempool_free(virt_to_page(out), state->pool); else free_pages((unsigned long) out, order); - if (b->written) - bset_build_written_tree(b); + bset_build_written_tree(b); if (!start) - bch_time_stats_update(&b->c->sort_time, start_time); + bch_time_stats_update(&state->time, start_time); } -void bch_btree_sort_partial(struct btree *b, unsigned start) +void bch_btree_sort_partial(struct btree *b, unsigned start, + struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; struct btree_iter iter; @@ -1148,18 +1170,19 @@ void bch_btree_sort_partial(struct btree *b, unsigned start) order = ilog2(order); } - __btree_sort(b, &iter, start, order, false); + __btree_sort(b, &iter, start, order, false, state); EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); } -void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) +void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter, + struct bset_sort_state *state) { - BUG_ON(!b->written); - __btree_sort(b, iter, 0, b->page_order, true); + __btree_sort(b, iter, 0, b->page_order, true, state); } -void bch_btree_sort_into(struct btree *b, struct btree *new) +void bch_btree_sort_into(struct btree *b, struct btree *new, + struct bset_sort_state *state) { uint64_t start_time = local_clock(); @@ -1168,15 +1191,14 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) btree_mergesort(b, new->sets->data, &iter, false, true); - bch_time_stats_update(&b->c->sort_time, start_time); + bch_time_stats_update(&state->time, start_time); - bkey_copy_key(&new->key, &b->key); new->sets->size = 0; } #define SORT_CRIT (4096 / sizeof(uint64_t)) -void bch_btree_sort_lazy(struct btree *b) +void bch_btree_sort_lazy(struct btree *b, struct bset_sort_state *state) { unsigned crit = SORT_CRIT; int i; @@ -1185,24 +1207,18 @@ void bch_btree_sort_lazy(struct btree *b) if (!b->nsets) goto out; - /* If not a leaf node, always sort */ - if (b->level) { - bch_btree_sort(b); - return; - } - for (i = b->nsets - 1; i >= 0; --i) { - crit *= b->c->sort_crit_factor; + crit *= state->crit_factor; if (b->sets[i].data->keys < crit) { - bch_btree_sort_partial(b, i); + bch_btree_sort_partial(b, i, state); return; } } /* Sort if we'd overflow */ if (b->nsets + 1 == MAX_BSETS) { - bch_btree_sort(b); + bch_btree_sort(b, state); return; } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 91bcbdb0408..4f60c21c7a3 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -3,6 +3,8 @@ #include +#include "util.h" /* for time_stats */ + /* * BKEYS: * @@ -190,6 +192,33 @@ struct bset_tree { struct bset *data; }; +/* Sorting */ + +struct bset_sort_state { + mempool_t *pool; + + unsigned page_order; + unsigned crit_factor; + + struct time_stats time; +}; + +void bch_bset_sort_state_free(struct bset_sort_state *); +int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); +void bch_btree_sort_lazy(struct btree *, struct bset_sort_state *); +void bch_btree_sort_into(struct btree *, struct btree *, + struct bset_sort_state *); +void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *, + struct bset_sort_state *); +void bch_btree_sort_partial(struct btree *, unsigned, + struct bset_sort_state *); + +static inline void bch_btree_sort(struct btree *b, + struct bset_sort_state *state) +{ + bch_btree_sort_partial(b, 0, state); +} + /* Keylists */ struct keylist { @@ -374,15 +403,6 @@ static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, }) bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); -void bch_btree_sort_lazy(struct btree *); -void bch_btree_sort_into(struct btree *, struct btree *); -void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); -void bch_btree_sort_partial(struct btree *, unsigned); - -static inline void bch_btree_sort(struct btree *b) -{ - bch_btree_sort_partial(b, 0); -} int bch_bset_print_stats(struct cache_set *, char *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e1e36e76172..78ba0b67ac1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -263,7 +263,7 @@ void bch_btree_node_read_done(struct btree *b) if (i->seq == b->sets[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(b, iter); + bch_btree_sort_and_fix_extents(b, iter, &b->c->sort); i = b->sets[0].data; err = "short btree key"; @@ -476,7 +476,11 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); - bch_btree_sort_lazy(b); + /* If not a leaf node, always sort */ + if (b->level && b->nsets) + bch_btree_sort(b, &b->c->sort); + else + bch_btree_sort_lazy(b, &b->c->sort); /* * do verify if there was more than one set initially (i.e. we did a @@ -1125,8 +1129,10 @@ err: static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) { struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); - if (!IS_ERR_OR_NULL(n)) - bch_btree_sort_into(b, n); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_sort_into(b, n, &b->c->sort); + bkey_copy_key(&n->key, &b->key); + } return n; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d05e7562771..1fc8165a5c0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1351,6 +1351,7 @@ static void cache_set_free(struct closure *cl) if (ca) kobject_put(&ca->kobj); + bch_bset_sort_state_free(&c->sort); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); if (c->bio_split) @@ -1481,15 +1482,12 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); - c->sort_crit_factor = int_sqrt(c->btree_pages); - sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->bucket_wait); sema_init(&c->uuid_write_mutex, 1); - spin_lock_init(&c->sort_time.lock); spin_lock_init(&c->btree_gc_time.lock); spin_lock_init(&c->btree_split_time.lock); spin_lock_init(&c->btree_read_time.lock); @@ -1517,12 +1515,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bucket_pages(c))) || !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->sort_pool = mempool_create_page_pool(1, - ilog2(bucket_pages(c)))) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || bch_journal_alloc(c) || bch_btree_cache_alloc(c) || - bch_open_buckets_alloc(c)) + bch_open_buckets_alloc(c) || + bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) goto err; c->congested_read_threshold_us = 2000; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d5dd282b176..206c80fb27c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -490,7 +490,7 @@ lock_root: sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); - sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); + sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); -- cgit v1.2.3-70-g09d2 From ee811287c9f241641899788cbfc9d70ed96ba3a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 23:49:49 -0800 Subject: bcache: Rename/shuffle various code around More work to disentangle bset.c from the rest of the code: Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 8 -- drivers/md/bcache/bset.c | 172 +++++++++++++++++++++++++----- drivers/md/bcache/bset.h | 247 ++++++++++++++++++++------------------------ drivers/md/bcache/btree.c | 167 +++++++++++------------------- drivers/md/bcache/btree.h | 9 +- drivers/md/bcache/debug.c | 3 +- drivers/md/bcache/journal.c | 9 +- drivers/md/bcache/super.c | 2 +- 8 files changed, 341 insertions(+), 276 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 7bd4c93475e..5c74d55cea7 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -717,14 +717,6 @@ struct bbio { #define bucket_bytes(c) ((c)->sb.bucket_size << 9) #define block_bytes(c) ((c)->sb.block_size << 9) -#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) -#define set_bytes(i) __set_bytes(i, i->keys) - -#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) -#define set_blocks(i, c) __set_blocks(i, (i)->keys, c) - -#define btree_data_space(b) (PAGE_SIZE << (b)->page_order) - #define prios_per_bucket(c) \ ((bucket_bytes(c) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 9d9c2edda76..e04e5908e29 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -302,6 +302,115 @@ bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) return true; } +/* Auxiliary search trees */ + +/* 32 bits total: */ +#define BKEY_MID_BITS 3 +#define BKEY_EXPONENT_BITS 7 +#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) +#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) + +struct bkey_float { + unsigned exponent:BKEY_EXPONENT_BITS; + unsigned m:BKEY_MID_BITS; + unsigned mantissa:BKEY_MANTISSA_BITS; +} __packed; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 128 + +/* Space required for the btree node keys */ +static inline size_t btree_keys_bytes(struct btree *b) +{ + return PAGE_SIZE << b->page_order; +} + +static inline size_t btree_keys_cachelines(struct btree *b) +{ + return btree_keys_bytes(b) / BSET_CACHELINE; +} + +/* Space required for the auxiliary search trees */ +static inline size_t bset_tree_bytes(struct btree *b) +{ + return btree_keys_cachelines(b) * sizeof(struct bkey_float); +} + +/* Space required for the prev pointers */ +static inline size_t bset_prev_bytes(struct btree *b) +{ + return btree_keys_cachelines(b) * sizeof(uint8_t); +} + +/* Memory allocation */ + +void bch_btree_keys_free(struct btree *b) +{ + struct bset_tree *t = b->sets; + + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); + + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); + + free_pages((unsigned long) t->data, b->page_order); + + t->prev = NULL; + t->tree = NULL; + t->data = NULL; +} + +int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) +{ + struct bset_tree *t = b->sets; + + BUG_ON(t->data); + + b->page_order = page_order; + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + return 0; +err: + bch_btree_keys_free(b); + return -ENOMEM; +} + /* Binary tree stuff for auxiliary search trees */ static unsigned inorder_next(unsigned j, unsigned size) @@ -538,21 +647,36 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t) t++->size = 0; } -static void bset_build_unwritten_tree(struct btree *b) +static void bch_bset_build_unwritten_tree(struct btree *b) { - struct bset_tree *t = b->sets + b->nsets; + struct bset_tree *t = bset_tree_last(b); bset_alloc_tree(b, t); - if (t->tree != b->sets->tree + bset_tree_space(b)) { + if (t->tree != b->sets->tree + btree_keys_cachelines(b)) { t->prev[0] = bkey_to_cacheline_offset(t->data->start); t->size = 1; } } +void bch_bset_init_next(struct btree *b, struct bset *i, uint64_t magic) +{ + if (i != b->sets->data) { + b->sets[++b->nsets].data = i; + i->seq = b->sets->data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = magic; + i->version = 0; + i->keys = 0; + + bch_bset_build_unwritten_tree(b); +} + static void bset_build_written_tree(struct btree *b) { - struct bset_tree *t = b->sets + b->nsets; + struct bset_tree *t = bset_tree_last(b); struct bkey *k = t->data->start; unsigned j, cacheline = 1; @@ -560,7 +684,7 @@ static void bset_build_written_tree(struct btree *b) t->size = min_t(unsigned, bkey_to_cacheline(t, bset_bkey_last(t->data)), - b->sets->tree + bset_tree_space(b) - t->tree); + b->sets->tree + btree_keys_cachelines(b) - t->tree); if (t->size < 2) { t->size = 0; @@ -599,7 +723,7 @@ void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) struct bset_tree *t; unsigned inorder, j = 1; - for (t = b->sets; t <= &b->sets[b->nsets]; t++) + for (t = b->sets; t <= bset_tree_last(b); t++) if (k < bset_bkey_last(t->data)) goto found_set; @@ -639,9 +763,10 @@ fix_right: do { } while (j < t->size); } -void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) +static void bch_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey *k) { - struct bset_tree *t = &b->sets[b->nsets]; unsigned shift = bkey_u64s(k); unsigned j = bkey_to_cacheline(t, k); @@ -673,7 +798,7 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) } } - if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) + if (t->size == b->sets->tree + btree_keys_cachelines(b) - t->tree) return; /* Possibly add a new entry to the end of the lookup table */ @@ -687,21 +812,23 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) } } -void bch_bset_init_next(struct btree *b) +void bch_bset_insert(struct btree *b, struct bkey *where, + struct bkey *insert) { - struct bset *i = write_block(b); + struct bset_tree *t = bset_tree_last(b); - if (i != b->sets[0].data) { - b->sets[++b->nsets].data = i; - i->seq = b->sets[0].data->seq; - } else - get_random_bytes(&i->seq, sizeof(uint64_t)); + BUG_ON(t->data != write_block(b)); + BUG_ON(bset_byte_offset(b, t->data) + + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > + PAGE_SIZE << b->page_order); - i->magic = bset_magic(&b->c->sb); - i->version = 0; - i->keys = 0; + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) bset_bkey_last(t->data) - (void *) where); - bset_build_unwritten_tree(b); + t->data->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, t, where); } struct bset_search_iter { @@ -1154,9 +1281,8 @@ void bch_btree_sort_partial(struct btree *b, unsigned start, __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); - BUG_ON(b->sets[b->nsets].data == write_block(b) && - (b->sets[b->nsets].size || b->nsets)); - + BUG_ON(!bset_written(b, bset_tree_last(b)) && + (bset_tree_last(b)->size || b->nsets)); if (start) { unsigned i; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 4f60c21c7a3..ab31f3fc8d4 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -144,22 +144,11 @@ * first key in that range of bytes again. */ -struct cache_set; - -/* Btree key comparison/iteration */ +struct btree; +struct bkey_float; #define MAX_BSETS 4U -struct btree_iter { - size_t size, used; -#ifdef CONFIG_BCACHE_DEBUG - struct btree *b; -#endif - struct btree_iter_set { - struct bkey *k, *end; - } data[MAX_BSETS]; -}; - struct bset_tree { /* * We construct a binary tree in an array as if the array @@ -169,14 +158,14 @@ struct bset_tree { */ /* size of the binary tree and prev array */ - unsigned size; + unsigned size; /* function of size - precalculated for to_inorder() */ - unsigned extra; + unsigned extra; /* copy of the last key in the set */ - struct bkey end; - struct bkey_float *tree; + struct bkey end; + struct bkey_float *tree; /* * The nodes in the bset tree point to specific keys - this @@ -186,12 +175,61 @@ struct bset_tree { * to keep bkey_float to 4 bytes and prev isn't used in the fast * path. */ - uint8_t *prev; + uint8_t *prev; /* The actual btree node, with pointers to each sorted set */ - struct bset *data; + struct bset *data; }; +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, block_bytes) \ + DIV_ROUND_UP(__set_bytes(i, k), block_bytes) +#define set_blocks(i, block_bytes) \ + __set_blocks(i, (i)->keys, block_bytes) + +void bch_btree_keys_free(struct btree *); +int bch_btree_keys_alloc(struct btree *, unsigned, gfp_t); + +void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); +void bch_bset_init_next(struct btree *, struct bset *, uint64_t); +void bch_bset_insert(struct btree *, struct bkey *, struct bkey *); + +/* Btree key iteration */ + +struct btree_iter { + size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree *b; +#endif + struct btree_iter_set { + struct bkey *k, *end; + } data[MAX_BSETS]; +}; + +typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); + +struct bkey *bch_btree_iter_next(struct btree_iter *); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *, + struct btree *, ptr_filter_fn); + +void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); +struct bkey *bch_btree_iter_init(struct btree *, struct btree_iter *, + struct bkey *); + +struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, + const struct bkey *); + +/* + * Returns the first key that is strictly greater than search + */ +static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, + const struct bkey *search) +{ + return search ? __bch_bset_search(b, t, search) : t->data->start; +} + /* Sorting */ struct bset_sort_state { @@ -219,6 +257,60 @@ static inline void bch_btree_sort(struct btree *b, bch_btree_sort_partial(b, 0, state); } +/* Bkey utility code */ + +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) + +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) +{ + return bkey_idx(i->start, idx); +} + +static inline void bkey_init(struct bkey *k) +{ + *k = ZERO_KEY; +} + +static __always_inline int64_t bkey_cmp(const struct bkey *l, + const struct bkey *r) +{ + return unlikely(KEY_INODE(l) != KEY_INODE(r)) + ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) + : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); +} + +void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, + unsigned); +bool __bch_cut_front(const struct bkey *, struct bkey *); +bool __bch_cut_back(const struct bkey *, struct bkey *); + +static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, k) > 0); + return __bch_cut_front(where, k); +} + +static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); + return __bch_cut_back(where, k); +} + +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + /* Keylists */ struct keylist { @@ -282,126 +374,15 @@ struct bkey *bch_keylist_pop(struct keylist *); void bch_keylist_pop_front(struct keylist *); int __bch_keylist_realloc(struct keylist *, unsigned); -/* Bkey utility code */ - -#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) - -static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) -{ - return bkey_idx(i->start, idx); -} - -static inline void bkey_init(struct bkey *k) -{ - *k = ZERO_KEY; -} - -static __always_inline int64_t bkey_cmp(const struct bkey *l, - const struct bkey *r) -{ - return unlikely(KEY_INODE(l) != KEY_INODE(r)) - ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) - : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); -} - -void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, - unsigned); -bool __bch_cut_front(const struct bkey *, struct bkey *); -bool __bch_cut_back(const struct bkey *, struct bkey *); - -static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) -{ - BUG_ON(bkey_cmp(where, k) > 0); - return __bch_cut_front(where, k); -} - -static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) -{ - BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); - return __bch_cut_back(where, k); -} - +struct cache_set; const char *bch_ptr_status(struct cache_set *, const struct bkey *); bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); +bool bch_btree_ptr_bad(struct btree *, const struct bkey *); +bool bch_extent_ptr_bad(struct btree *, const struct bkey *); bool bch_ptr_bad(struct btree *, const struct bkey *); -typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); - -struct bkey *bch_btree_iter_next(struct btree_iter *); -struct bkey *bch_btree_iter_next_filter(struct btree_iter *, - struct btree *, ptr_filter_fn); - -void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *bch_btree_iter_init(struct btree *, struct btree_iter *, - struct bkey *); - -/* 32 bits total: */ -#define BKEY_MID_BITS 3 -#define BKEY_EXPONENT_BITS 7 -#define BKEY_MANTISSA_BITS 22 -#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) - -struct bkey_float { - unsigned exponent:BKEY_EXPONENT_BITS; - unsigned m:BKEY_MID_BITS; - unsigned mantissa:BKEY_MANTISSA_BITS; -} __packed; - -/* - * BSET_CACHELINE was originally intended to match the hardware cacheline size - - * it used to be 64, but I realized the lookup code would touch slightly less - * memory if it was 128. - * - * It definites the number of bytes (in struct bset) per struct bkey_float in - * the auxiliar search tree - when we're done searching the bset_float tree we - * have this many bytes left that we do a linear search over. - * - * Since (after level 5) every level of the bset_tree is on a new cacheline, - * we're touching one fewer cacheline in the bset tree in exchange for one more - * cacheline in the linear search - but the linear search might stop before it - * gets to the second cacheline. - */ - -#define BSET_CACHELINE 128 -#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) - -#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) -#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) - -void bch_bset_init_next(struct btree *); - -void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); -void bch_bset_fix_lookup_table(struct btree *, struct bkey *); - -struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, - const struct bkey *); - -/* - * Returns the first key that is strictly greater than search - */ -static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, - const struct bkey *search) -{ - return search ? __bch_bset_search(b, t, search) : t->data->start; -} - -#define PRECEDING_KEY(_k) \ -({ \ - struct bkey *_ret = NULL; \ - \ - if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ - _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ - \ - if (!_ret->low) \ - _ret->high--; \ - _ret->low--; \ - } \ - \ - _ret; \ -}) - bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); int bch_bset_print_stats(struct cache_set *, char *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 78ba0b67ac1..89252e7f287 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -110,7 +110,7 @@ static inline bool should_split(struct btree *b) { struct bset *i = write_block(b); return b->written >= btree_blocks(b) || - (b->written + __set_blocks(i, i->keys + 15, b->c) + (b->written + __set_blocks(i, i->keys + 15, block_bytes(b->c)) > btree_blocks(b)); } @@ -206,7 +206,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; - struct bset *i = b->sets[0].data; + struct bset *i = btree_bset_first(b); struct btree_iter *iter; iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); @@ -228,7 +228,8 @@ void bch_btree_node_read_done(struct btree *b) goto err; err = "bad btree header"; - if (b->written + set_blocks(i, b->c) > btree_blocks(b)) + if (b->written + set_blocks(i, block_bytes(b->c)) > + btree_blocks(b)) goto err; err = "bad magic"; @@ -253,7 +254,7 @@ void bch_btree_node_read_done(struct btree *b) bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); - b->written += set_blocks(i, b->c); + b->written += set_blocks(i, block_bytes(b->c)); } err = "corrupted btree"; @@ -272,7 +273,7 @@ void bch_btree_node_read_done(struct btree *b) goto err; if (b->written < btree_blocks(b)) - bch_bset_init_next(b); + bch_bset_init_next(b, write_block(b), bset_magic(&b->c->sb)); out: mempool_free(iter, b->c->fill_iter); return; @@ -393,7 +394,7 @@ static void btree_node_write_endio(struct bio *bio, int error) static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io; - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); BKEY_PADDED(key) k; i->version = BCACHE_BSET_VERSION; @@ -405,7 +406,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; - b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); bch_bio_map(b->bio, i); /* @@ -424,7 +425,8 @@ static void do_btree_node_write(struct btree *b) */ bkey_copy(&k.key, &b->key); - SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + + bset_sector_offset(b, i)); if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; @@ -451,14 +453,14 @@ static void do_btree_node_write(struct btree *b) void bch_btree_node_write(struct btree *b, struct closure *parent) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); trace_bcache_btree_write(b); BUG_ON(current->bio_list); BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); - BUG_ON(b->sets->data->seq != i->seq); + BUG_ON(btree_bset_first(b)->seq != i->seq); bch_check_keys(b, "writing"); cancel_delayed_work(&b->work); @@ -472,8 +474,8 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); - b->written += set_blocks(i, b->c); - atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, + b->written += set_blocks(i, block_bytes(b->c)); + atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); /* If not a leaf node, always sort */ @@ -490,7 +492,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) bch_btree_verify(b); if (b->written < btree_blocks(b)) - bch_bset_init_next(b); + bch_bset_init_next(b, write_block(b), bset_magic(&b->c->sb)); } static void bch_btree_node_write_sync(struct btree *b) @@ -515,7 +517,7 @@ static void btree_node_write_work(struct work_struct *w) static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); struct btree_write *w = btree_current_write(b); BUG_ON(!b->written); @@ -575,29 +577,12 @@ static void mca_reinit(struct btree *b) static void mca_data_free(struct btree *b) { - struct bset_tree *t = b->sets; - BUG_ON(b->io_mutex.count != 1); - if (bset_prev_bytes(b) < PAGE_SIZE) - kfree(t->prev); - else - free_pages((unsigned long) t->prev, - get_order(bset_prev_bytes(b))); - - if (bset_tree_bytes(b) < PAGE_SIZE) - kfree(t->tree); - else - free_pages((unsigned long) t->tree, - get_order(bset_tree_bytes(b))); - - free_pages((unsigned long) t->data, b->page_order); + bch_btree_keys_free(b); - t->prev = NULL; - t->tree = NULL; - t->data = NULL; - list_move(&b->list, &b->c->btree_cache_freed); b->c->bucket_cache_used--; + list_move(&b->list, &b->c->btree_cache_freed); } static void mca_bucket_free(struct btree *b) @@ -616,34 +601,16 @@ static unsigned btree_order(struct bkey *k) static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { - struct bset_tree *t = b->sets; - BUG_ON(t->data); - - b->page_order = max_t(unsigned, - ilog2(b->c->btree_pages), - btree_order(k)); - - t->data = (void *) __get_free_pages(gfp, b->page_order); - if (!t->data) - goto err; - - t->tree = bset_tree_bytes(b) < PAGE_SIZE - ? kmalloc(bset_tree_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); - if (!t->tree) - goto err; - - t->prev = bset_prev_bytes(b) < PAGE_SIZE - ? kmalloc(bset_prev_bytes(b), gfp) - : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); - if (!t->prev) - goto err; - - list_move(&b->list, &b->c->btree_cache); - b->c->bucket_cache_used++; - return; -err: - mca_data_free(b); + if (!bch_btree_keys_alloc(b, + max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)), + gfp)) { + b->c->bucket_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { + list_move(&b->list, &b->c->btree_cache_freed); + } } static struct btree *mca_bucket_alloc(struct cache_set *c, @@ -1111,7 +1078,7 @@ retry: } b->accessed = 1; - bch_bset_init_next(b); + bch_bset_init_next(b, b->sets->data, bset_magic(&b->c->sb)); mutex_unlock(&c->bucket_lock); @@ -1298,7 +1265,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || - __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) + __set_blocks(b->sets[0].data, keys, + block_bytes(b->c)) > blocks * (nodes - 1)) return 0; for (i = 0; i < nodes; i++) { @@ -1308,8 +1276,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, } for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = new_nodes[i]->sets->data; - struct bset *n2 = new_nodes[i - 1]->sets->data; + struct bset *n1 = btree_bset_first(new_nodes[i]); + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bkey *k, *last = NULL; keys = 0; @@ -1319,7 +1287,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, k < bset_bkey_last(n2); k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) + bkey_u64s(k), + block_bytes(b->c)) > blocks) break; last = k; @@ -1335,7 +1304,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, * though) */ if (__set_blocks(n1, n1->keys + n2->keys, - b->c) > btree_blocks(new_nodes[i])) + block_bytes(b->c)) > + btree_blocks(new_nodes[i])) goto out_nocoalesce; keys = n2->keys; @@ -1343,8 +1313,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, last = &r->b->key; } - BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(new_nodes[i])); + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + btree_blocks(new_nodes[i])); if (last) bkey_copy_key(&new_nodes[i]->key, last); @@ -1380,7 +1350,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, } /* We emptied out this node */ - BUG_ON(new_nodes[0]->sets->data->keys); + BUG_ON(btree_bset_first(new_nodes[0])->keys); btree_node_free(new_nodes[0]); rw_unlock(true, new_nodes[0]); @@ -1831,19 +1801,6 @@ err: /* Btree insertion */ -static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) -{ - struct bset *i = b->sets[b->nsets].data; - - memmove((uint64_t *) where + bkey_u64s(insert), - where, - (void *) bset_bkey_last(i) - (void *) where); - - i->keys += bkey_u64s(insert); - bkey_copy(where, insert); - bch_bset_fix_lookup_table(b, where); -} - static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, struct btree_iter *iter, struct bkey *replace_key) @@ -1944,13 +1901,13 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, * depends on us inserting a new key for the top * here. */ - top = bch_bset_search(b, &b->sets[b->nsets], + top = bch_bset_search(b, bset_tree_last(b), insert); - shift_keys(b, top, k); + bch_bset_insert(b, top, k); } else { BKEY_PADDED(key) temp; bkey_copy(&temp.key, k); - shift_keys(b, k, &temp.key); + bch_bset_insert(b, k, &temp.key); top = bkey_next(k); } @@ -1999,7 +1956,7 @@ check_failed: static bool btree_insert_key(struct btree *b, struct btree_op *op, struct bkey *k, struct bkey *replace_key) { - struct bset *i = b->sets[b->nsets].data; + struct bset *i = btree_bset_last(b); struct bkey *m, *prev; unsigned status = BTREE_INSERT_STATUS_INSERT; @@ -2051,10 +2008,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, goto copy; } else { BUG_ON(replace_key); - m = bch_bset_search(b, &b->sets[b->nsets], k); + m = bch_bset_search(b, bset_tree_last(b), k); } -insert: shift_keys(b, m, k); +insert: bch_bset_insert(b, m, k); copy: bkey_copy(m, k); merged: bch_check_keys(b, "%u for %s", status, @@ -2079,8 +2036,9 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct bset *i = write_block(b); struct bkey *k = insert_keys->keys; - if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) - > btree_blocks(b)) + if (b->written + + __set_blocks(i, i->keys + bkey_u64s(k), + block_bytes(b->c)) > btree_blocks(b)) break; if (bkey_cmp(k, &b->key) <= 0) { @@ -2130,12 +2088,13 @@ static int btree_split(struct btree *b, struct btree_op *op, if (IS_ERR(n1)) goto err; - split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; + split = set_blocks(btree_bset_first(n1), + block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned keys = 0; - trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); n2 = bch_btree_node_alloc(b->c, b->level, true); if (IS_ERR(n2)) @@ -2154,20 +2113,20 @@ static int btree_split(struct btree *b, struct btree_op *op, * search tree yet */ - while (keys < (n1->sets[0].data->keys * 3) / 5) - keys += bkey_u64s(bset_bkey_idx(n1->sets[0].data, + while (keys < (btree_bset_first(n1)->keys * 3) / 5) + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); bkey_copy_key(&n1->key, - bset_bkey_idx(n1->sets[0].data, keys)); - keys += bkey_u64s(bset_bkey_idx(n1->sets[0].data, keys)); + bset_bkey_idx(btree_bset_first(n1), keys)); + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); - n2->sets[0].data->keys = n1->sets[0].data->keys - keys; - n1->sets[0].data->keys = keys; + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; + btree_bset_first(n1)->keys = keys; - memcpy(n2->sets[0].data->start, - bset_bkey_last(n1->sets[0].data), - n2->sets[0].data->keys * sizeof(uint64_t)); + memcpy(btree_bset_first(n2)->start, + bset_bkey_last(btree_bset_first(n1)), + btree_bset_first(n2)->keys * sizeof(uint64_t)); bkey_copy_key(&n2->key, &b->key); @@ -2175,7 +2134,7 @@ static int btree_split(struct btree *b, struct btree_op *op, bch_btree_node_write(n2, &cl); rw_unlock(true, n2); } else { - trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); bch_btree_insert_keys(n1, op, insert_keys, replace_key); } @@ -2256,7 +2215,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, -EINTR; } } else { - BUG_ON(write_block(b) != b->sets[b->nsets].data); + BUG_ON(write_block(b) != btree_bset_last(b)); if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { if (!b->level) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 2a5a8481f25..b3541173ccf 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -180,9 +180,9 @@ static inline struct btree_write *btree_prev_write(struct btree *b) return b->writes + (btree_node_write_idx(b) ^ 1); } -static inline unsigned bset_offset(struct btree *b, struct bset *i) +static inline struct bset_tree *bset_tree_last(struct btree *b) { - return (((size_t) i) - ((size_t) b->sets->data)) >> 9; + return b->sets + b->nsets; } static inline struct bset *btree_bset_first(struct btree *b) @@ -190,6 +190,11 @@ static inline struct bset *btree_bset_first(struct btree *b) return b->sets->data; } +static inline struct bset *btree_bset_last(struct btree *b) +{ + return bset_tree_last(b)->data; +} + static inline unsigned bset_byte_offset(struct btree *b, struct bset *i) { return ((size_t) i) - ((size_t) b->sets->data); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 955fa1d3177..5a78137c420 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -123,7 +123,8 @@ static void bch_dump_bucket(struct btree *b) for (i = (start); \ (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ i->seq == (start)->seq; \ - i = (void *) i + set_blocks(i, b->c) * block_bytes(b->c)) + i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ + block_bytes(b->c)) void bch_btree_verify(struct btree *b) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 5e14e3325ec..18039affc30 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -95,7 +95,7 @@ reread: left = ca->sb.bucket_size - offset; return ret; } - blocks = set_blocks(j, ca->set); + blocks = set_blocks(j, block_bytes(ca->set)); while (!list_empty(list)) { i = list_first_entry(list, @@ -579,7 +579,8 @@ static void journal_write_unlocked(struct closure *cl) struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; + unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * + c->sb.block_size; struct bio *bio; struct bio_list list; @@ -595,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) continue_at(cl, journal_write, system_wq); } - c->journal.blocks_free -= set_blocks(w->data, c); + c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); w->data->btree_level = c->root->level; @@ -685,7 +686,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, struct journal_write *w = c->journal.cur; sectors = __set_blocks(w->data, w->data->keys + nkeys, - c) * c->sb.block_size; + block_bytes(c)) * c->sb.block_size; if (sectors <= min_t(size_t, c->journal.blocks_free * c->sb.block_size, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1fc8165a5c0..12057a43e01 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1477,7 +1477,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); - c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; + c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); -- cgit v1.2.3-70-g09d2 From 65d45231b56efb3db51eb441e2c68f8252ecdd12 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2013 17:22:05 -0800 Subject: bcache: Abstract out stuff needed for sorting Signed-off-by: Kent Overstreet --- drivers/md/bcache/Makefile | 5 +- drivers/md/bcache/bset.c | 279 +--------------------------------- drivers/md/bcache/bset.h | 8 - drivers/md/bcache/btree.c | 6 + drivers/md/bcache/btree.h | 42 +++++- drivers/md/bcache/debug.c | 1 + drivers/md/bcache/extents.c | 354 ++++++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/extents.h | 12 ++ drivers/md/bcache/super.c | 5 +- 9 files changed, 423 insertions(+), 289 deletions(-) create mode 100644 drivers/md/bcache/extents.c create mode 100644 drivers/md/bcache/extents.h diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 0e9c82523be..c488b846f83 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ - movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o +bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e04e5908e29..c2c42cbbe88 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -63,140 +63,6 @@ void bch_keylist_pop_front(struct keylist *l) bch_keylist_bytes(l)); } -/* Pointer validation */ - -static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size || - bucket < ca->sb.first_bucket || - bucket >= ca->sb.nbuckets) - return true; - } - - return false; -} - -bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - char buf[80]; - - if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) - goto bad; - - if (__ptr_invalid(c, k)) - goto bad; - - return false; -bad: - bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); - return true; -} - -bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) -{ - char buf[80]; - - if (!KEY_SIZE(k)) - return true; - - if (KEY_SIZE(k) > KEY_OFFSET(k)) - goto bad; - - if (__ptr_invalid(c, k)) - goto bad; - - return false; -bad: - bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); - return true; -} - -static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, - unsigned ptr) -{ - struct bucket *g = PTR_BUCKET(b->c, k, ptr); - char buf[80]; - - if (mutex_trylock(&b->c->bucket_lock)) { - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto err; - - } else { - if (g->prio == BTREE_PRIO) - goto err; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto err; - } - mutex_unlock(&b->c->bucket_lock); - } - - return false; -err: - mutex_unlock(&b->c->bucket_lock); - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - return true; -} - -bool bch_ptr_bad(struct btree *b, const struct bkey *k) -{ - struct bucket *g; - unsigned i, stale; - - if (!bkey_cmp(k, &ZERO_KEY) || - !KEY_PTRS(k) || - bch_ptr_invalid(b, k)) - return true; - - for (i = 0; i < KEY_PTRS(k); i++) - if (!ptr_available(b->c, k, i)) - return true; - - if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) - return false; - - for (i = 0; i < KEY_PTRS(k); i++) { - g = PTR_BUCKET(b->c, k, i); - stale = ptr_stale(b->c, k, i); - - btree_bug_on(stale > 96, b, - "key too stale: %i, need_gc %u", - stale, b->c->need_gc); - - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); - - if (stale) - return true; - - if (expensive_debug_checks(b->c) && - ptr_bad_expensive_checks(b, k, i)) - return true; - } - - return false; -} - /* Key/pointer manipulation */ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, @@ -251,57 +117,6 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) return true; } -static uint64_t merge_chksums(struct bkey *l, struct bkey *r) -{ - return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & - ~((uint64_t)1 << 63); -} - -/* Tries to merge l and r: l should be lower than r - * Returns true if we were able to merge. If we did merge, l will be the merged - * key, r will be untouched. - */ -bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) -{ - unsigned i; - - if (key_merging_disabled(b->c)) - return false; - - if (KEY_PTRS(l) != KEY_PTRS(r) || - KEY_DIRTY(l) != KEY_DIRTY(r) || - bkey_cmp(l, &START_KEY(r))) - return false; - - for (i = 0; i < KEY_PTRS(l); i++) - if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || - PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) - return false; - - /* Keys with no pointers aren't restricted to one bucket and could - * overflow KEY_SIZE - */ - if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { - SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); - SET_KEY_SIZE(l, USHRT_MAX); - - bch_cut_front(l, r); - return false; - } - - if (KEY_CSUM(l)) { - if (KEY_CSUM(r)) - l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); - else - SET_KEY_CSUM(l, 0); - } - - SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); - SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); - - return true; -} - /* Auxiliary search trees */ /* 32 bits total: */ @@ -1099,85 +914,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) return 0; } -static void sort_key_next(struct btree_iter *iter, - struct btree_iter_set *i) -{ - i->k = bkey_next(i->k); - - if (i->k == i->end) - *i = iter->data[--iter->used]; -} - -/* - * Returns true if l > r - unless l == r, in which case returns true if l is - * older than r. - * - * Necessary for btree_sort_fixup() - if there are multiple keys that compare - * equal in different sets, we have to process them newest to oldest. - */ -static inline bool sort_extent_cmp(struct btree_iter_set l, - struct btree_iter_set r) -{ - int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - - return c ? c > 0 : l.k < r.k; -} - -static inline bool sort_cmp(struct btree_iter_set l, - struct btree_iter_set r) -{ - int64_t c = bkey_cmp(l.k, r.k); - - return c ? c > 0 : l.k < r.k; -} - -static struct bkey *btree_sort_fixup_extents(struct btree_iter *iter, - struct bkey *tmp) -{ - while (iter->used > 1) { - struct btree_iter_set *top = iter->data, *i = top + 1; - - if (iter->used > 2 && - sort_extent_cmp(i[0], i[1])) - i++; - - if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) - break; - - if (!KEY_SIZE(i->k)) { - sort_key_next(iter, i); - heap_sift(iter, i - top, sort_extent_cmp); - continue; - } - - if (top->k > i->k) { - if (bkey_cmp(top->k, i->k) >= 0) - sort_key_next(iter, i); - else - bch_cut_front(top->k, i->k); - - heap_sift(iter, i - top, sort_extent_cmp); - } else { - /* can't happen because of comparison func */ - BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); - - if (bkey_cmp(i->k, top->k) < 0) { - bkey_copy(tmp, top->k); - - bch_cut_back(&START_KEY(i->k), tmp); - bch_cut_front(i->k, top->k); - heap_sift(iter, 0, btree_iter_cmp); - - return tmp; - } else { - bch_cut_back(&START_KEY(i->k), top->k); - } - } - } - - return NULL; -} - static void btree_mergesort(struct btree *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) @@ -1185,25 +921,22 @@ static void btree_mergesort(struct btree *b, struct bset *out, int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; - btree_iter_cmp_fn *cmp = b->level - ? sort_cmp - : sort_extent_cmp; bool (*bad)(struct btree *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; /* Heapify the iterator, using our comparison function */ for (i = iter->used / 2 - 1; i >= 0; --i) - heap_sift(iter, i, cmp); + heap_sift(iter, i, b->ops->sort_cmp); while (!btree_iter_end(iter)) { - if (fixup && !b->level) - k = btree_sort_fixup_extents(iter, &tmp.k); + if (b->ops->sort_fixup && fixup) + k = b->ops->sort_fixup(iter, &tmp.k); else k = NULL; if (!k) - k = __bch_btree_iter_next(iter, cmp); + k = __bch_btree_iter_next(iter, b->ops->sort_cmp); if (bad(b, k)) continue; @@ -1211,8 +944,7 @@ static void btree_mergesort(struct btree *b, struct bset *out, if (!last) { last = out->start; bkey_copy(last, k); - } else if (b->level || - !bch_bkey_try_merge(b, last, k)) { + } else if (!bch_bkey_try_merge(b, last, k)) { last = bkey_next(last); bkey_copy(last, k); } @@ -1300,6 +1032,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start, EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); } +EXPORT_SYMBOL(bch_btree_sort_partial); void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter, struct bset_sort_state *state) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ab31f3fc8d4..b5797129e91 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -376,14 +376,6 @@ int __bch_keylist_realloc(struct keylist *, unsigned); struct cache_set; const char *bch_ptr_status(struct cache_set *, const struct bkey *); -bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); -bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); -bool bch_btree_ptr_bad(struct btree *, const struct bkey *); -bool bch_extent_ptr_bad(struct btree *, const struct bkey *); - -bool bch_ptr_bad(struct btree *, const struct bkey *); - -bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); int bch_bset_print_stats(struct cache_set *, char *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 89252e7f287..6734e2759b9 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,6 +23,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include "writeback.h" #include @@ -931,6 +932,11 @@ out: b->level = level; b->parent = (void *) ~0UL; + if (!b->level) + b->ops = &bch_extent_keys_ops; + else + b->ops = &bch_btree_keys_ops; + mca_reinit(b); return b; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index b3541173ccf..0b436079db7 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -113,7 +113,28 @@ struct btree_write { int prio_blocked; }; +struct btree_keys_ops { + bool (*sort_cmp)(struct btree_iter_set, + struct btree_iter_set); + struct bkey *(*sort_fixup)(struct btree_iter *, + struct bkey *); + bool (*key_invalid)(struct btree *, + const struct bkey *); + bool (*key_bad)(struct btree *, + const struct bkey *); + bool (*key_merge)(struct btree *, + struct bkey *, struct bkey *); + + + /* + * Only used for deciding whether to use START_KEY(k) or just the key + * itself in a couple places + */ + bool is_extents; +}; + struct btree { + const struct btree_keys_ops *ops; /* Hottest entries first */ struct hlist_node hash; @@ -232,10 +253,23 @@ static inline void set_gc_sectors(struct cache_set *c) static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) { - if (b->level) - return bch_btree_ptr_invalid(b->c, k); - else - return bch_extent_ptr_invalid(b->c, k); + return b->ops->key_invalid(b, k); +} + +static inline bool bch_ptr_bad(struct btree *b, const struct bkey *k) +{ + return b->ops->key_bad(b, k); +} + +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. + */ +static inline bool bch_bkey_try_merge(struct btree *b, + struct bkey *l, struct bkey *r) +{ + return b->ops->key_merge ? b->ops->key_merge(b, l, r) : false; } void bkey_put(struct cache_set *c, struct bkey *k); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 5a78137c420..2c6587d016d 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -145,6 +145,7 @@ void bch_btree_verify(struct btree *b) bkey_copy(&v->key, &b->key); v->written = 0; v->level = b->level; + v->ops = b->ops; bio = bch_bbio_alloc(b->c); bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c new file mode 100644 index 00000000000..8fe6aaece41 --- /dev/null +++ b/drivers/md/bcache/extents.c @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2010 Kent Overstreet + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "writeback.h" + +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(l.k, r.k); + + return c ? c > 0 : l.k < r.k; +} + +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size || + bucket < ca->sb.first_bucket || + bucket >= ca->sb.nbuckets) + return true; + } + + return false; +} + +/* Btree ptrs */ + +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool bch_btree_ptr_invalid(struct btree *b, const struct bkey *k) +{ + return __bch_btree_ptr_invalid(b->c, k); +} + +static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) +{ + unsigned i; + char buf[80]; + struct bucket *g; + + if (mutex_trylock(&b->c->bucket_lock)) { + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { + g = PTR_BUCKET(b->c, k, i); + + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + } + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +} + +static bool bch_btree_ptr_bad(struct btree *b, const struct bkey *k) +{ + unsigned i; + + if (!bkey_cmp(k, &ZERO_KEY) || + !KEY_PTRS(k) || + bch_ptr_invalid(b, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i) || + ptr_stale(b->c, k, i)) + return true; + + if (expensive_debug_checks(b->c) && + btree_ptr_bad_expensive(b, k)) + return true; + + return false; +} + +const struct btree_keys_ops bch_btree_keys_ops = { + .sort_cmp = bch_key_sort_cmp, + .key_invalid = bch_btree_ptr_invalid, + .key_bad = bch_btree_ptr_bad, +}; + +/* Extents */ + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, + struct bkey *tmp) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) + i++; + + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) + break; + + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, bch_extent_sort_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, bch_extent_sort_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, bch_extent_sort_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } + } + + return NULL; +} + +static bool bch_extent_invalid(struct btree *b, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(b->c, k)) + goto bad; + + return false; +bad: + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); + return true; +} + +static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->c->gc_mark_valid && + ((GC_MARK(g) != GC_MARK_DIRTY && + KEY_DIRTY(k)) || + GC_MARK(g) == GC_MARK_METADATA)) + goto err; + + if (g->prio == BTREE_PRIO) + goto err; + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +} + +static bool bch_extent_bad(struct btree *b, const struct bkey *k) +{ + struct bucket *g; + unsigned i, stale; + + if (!KEY_PTRS(k) || + bch_extent_invalid(b, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i)) + return true; + + if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) + return false; + + for (i = 0; i < KEY_PTRS(k); i++) { + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); + + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); + + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); + + if (stale) + return true; + + if (expensive_debug_checks(b->c) && + bch_extent_bad_expensive(b, k, i)) + return true; + } + + return false; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +static bool bch_extent_merge(struct btree *b, struct bkey *l, struct bkey *r) +{ + unsigned i; + + if (key_merging_disabled(b->c)) + return false; + + if (KEY_PTRS(l) != KEY_PTRS(r) || + KEY_DIRTY(l) != KEY_DIRTY(r) || + bkey_cmp(l, &START_KEY(r))) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +const struct btree_keys_ops bch_extent_keys_ops = { + .sort_cmp = bch_extent_sort_cmp, + .sort_fixup = bch_extent_sort_fixup, + .key_invalid = bch_extent_invalid, + .key_bad = bch_extent_bad, + .key_merge = bch_extent_merge, + .is_extents = true, +}; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h new file mode 100644 index 00000000000..e0c0b685d2f --- /dev/null +++ b/drivers/md/bcache/extents.h @@ -0,0 +1,12 @@ +#ifndef _BCACHE_EXTENTS_H +#define _BCACHE_EXTENTS_H + +extern const struct btree_keys_ops bch_btree_keys_ops; +extern const struct btree_keys_ops bch_extent_keys_ops; + +struct bkey; +struct cache_set; + +bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); + +#endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 12057a43e01..6d6a7a15043 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -9,6 +9,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include "request.h" #include "writeback.h" @@ -399,7 +400,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) { struct bkey *k = &j->uuid_bucket; - if (bch_btree_ptr_invalid(c, k)) + if (__bch_btree_ptr_invalid(c, k)) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); @@ -1575,7 +1576,7 @@ static void run_cache_set(struct cache_set *c) k = &j->btree_root; err = "bad btree root"; - if (bch_btree_ptr_invalid(c, k)) + if (__bch_btree_ptr_invalid(c, k)) goto err; err = "error reading btree root"; -- cgit v1.2.3-70-g09d2 From a85e968e66a175c86d0410719ea84a5bd0f1d070 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Dec 2013 17:28:16 -0800 Subject: bcache: Add struct btree_keys Soon, bset.c won't need to depend on struct btree. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/bset.c | 179 +++++++++++++++++++++++++----------------- drivers/md/bcache/bset.h | 119 ++++++++++++++++++++++++++-- drivers/md/bcache/btree.c | 153 +++++++++++++++++------------------- drivers/md/bcache/btree.h | 93 ++-------------------- drivers/md/bcache/debug.c | 18 ++--- drivers/md/bcache/extents.c | 19 +++-- drivers/md/bcache/sysfs.c | 2 +- include/trace/events/bcache.h | 2 +- 9 files changed, 323 insertions(+), 264 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5c74d55cea7..93b84841966 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -679,9 +679,9 @@ struct cache_set { unsigned error_decay; unsigned short journal_delay_ms; + bool expensive_debug_checks; unsigned verify:1; unsigned key_merging_disabled:1; - unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index c2c42cbbe88..f34ef56560e 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -149,33 +149,33 @@ struct bkey_float { #define BSET_CACHELINE 128 /* Space required for the btree node keys */ -static inline size_t btree_keys_bytes(struct btree *b) +static inline size_t btree_keys_bytes(struct btree_keys *b) { return PAGE_SIZE << b->page_order; } -static inline size_t btree_keys_cachelines(struct btree *b) +static inline size_t btree_keys_cachelines(struct btree_keys *b) { return btree_keys_bytes(b) / BSET_CACHELINE; } /* Space required for the auxiliary search trees */ -static inline size_t bset_tree_bytes(struct btree *b) +static inline size_t bset_tree_bytes(struct btree_keys *b) { return btree_keys_cachelines(b) * sizeof(struct bkey_float); } /* Space required for the prev pointers */ -static inline size_t bset_prev_bytes(struct btree *b) +static inline size_t bset_prev_bytes(struct btree_keys *b) { return btree_keys_cachelines(b) * sizeof(uint8_t); } /* Memory allocation */ -void bch_btree_keys_free(struct btree *b) +void bch_btree_keys_free(struct btree_keys *b) { - struct bset_tree *t = b->sets; + struct bset_tree *t = b->set; if (bset_prev_bytes(b) < PAGE_SIZE) kfree(t->prev); @@ -195,10 +195,11 @@ void bch_btree_keys_free(struct btree *b) t->tree = NULL; t->data = NULL; } +EXPORT_SYMBOL(bch_btree_keys_free); -int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) +int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) { - struct bset_tree *t = b->sets; + struct bset_tree *t = b->set; BUG_ON(t->data); @@ -225,6 +226,29 @@ err: bch_btree_keys_free(b); return -ENOMEM; } +EXPORT_SYMBOL(bch_btree_keys_alloc); + +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) +{ + unsigned i; + + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + + /* XXX: shouldn't be needed */ + for (i = 0; i < MAX_BSETS; i++) + b->set[i].size = 0; + /* + * Second loop starts at 1 because b->keys[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->set[i].data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -448,9 +472,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j) f->exponent = 127; } -static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) { - if (t != b->sets) { + if (t != b->set) { unsigned j = roundup(t[-1].size, 64 / sizeof(struct bkey_float)); @@ -458,27 +482,30 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t) t->prev = t[-1].prev + j; } - while (t < b->sets + MAX_BSETS) + while (t < b->set + MAX_BSETS) t++->size = 0; } -static void bch_bset_build_unwritten_tree(struct btree *b) +static void bch_bset_build_unwritten_tree(struct btree_keys *b) { struct bset_tree *t = bset_tree_last(b); + BUG_ON(b->last_set_unwritten); + b->last_set_unwritten = 1; + bset_alloc_tree(b, t); - if (t->tree != b->sets->tree + btree_keys_cachelines(b)) { + if (t->tree != b->set->tree + btree_keys_cachelines(b)) { t->prev[0] = bkey_to_cacheline_offset(t->data->start); t->size = 1; } } -void bch_bset_init_next(struct btree *b, struct bset *i, uint64_t magic) +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) { - if (i != b->sets->data) { - b->sets[++b->nsets].data = i; - i->seq = b->sets->data->seq; + if (i != b->set->data) { + b->set[++b->nsets].data = i; + i->seq = b->set->data->seq; } else get_random_bytes(&i->seq, sizeof(uint64_t)); @@ -488,18 +515,21 @@ void bch_bset_init_next(struct btree *b, struct bset *i, uint64_t magic) bch_bset_build_unwritten_tree(b); } +EXPORT_SYMBOL(bch_bset_init_next); -static void bset_build_written_tree(struct btree *b) +void bch_bset_build_written_tree(struct btree_keys *b) { struct bset_tree *t = bset_tree_last(b); struct bkey *k = t->data->start; unsigned j, cacheline = 1; + b->last_set_unwritten = 0; + bset_alloc_tree(b, t); t->size = min_t(unsigned, bkey_to_cacheline(t, bset_bkey_last(t->data)), - b->sets->tree + btree_keys_cachelines(b) - t->tree); + b->set->tree + btree_keys_cachelines(b) - t->tree); if (t->size < 2) { t->size = 0; @@ -532,13 +562,14 @@ static void bset_build_written_tree(struct btree *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } +EXPORT_SYMBOL(bch_bset_build_written_tree); -void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) { struct bset_tree *t; unsigned inorder, j = 1; - for (t = b->sets; t <= bset_tree_last(b); t++) + for (t = b->set; t <= bset_tree_last(b); t++) if (k < bset_bkey_last(t->data)) goto found_set; @@ -577,8 +608,9 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } +EXPORT_SYMBOL(bch_bset_fix_invalidated_key); -static void bch_bset_fix_lookup_table(struct btree *b, +static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, struct bkey *k) { @@ -613,7 +645,7 @@ static void bch_bset_fix_lookup_table(struct btree *b, } } - if (t->size == b->sets->tree + btree_keys_cachelines(b) - t->tree) + if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) return; /* Possibly add a new entry to the end of the lookup table */ @@ -627,12 +659,12 @@ static void bch_bset_fix_lookup_table(struct btree *b, } } -void bch_bset_insert(struct btree *b, struct bkey *where, +void bch_bset_insert(struct btree_keys *b, struct bkey *where, struct bkey *insert) { struct bset_tree *t = bset_tree_last(b); - BUG_ON(t->data != write_block(b)); + BUG_ON(!b->last_set_unwritten); BUG_ON(bset_byte_offset(b, t->data) + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > PAGE_SIZE << b->page_order); @@ -645,20 +677,17 @@ void bch_bset_insert(struct btree *b, struct bkey *where, bkey_copy(where, insert); bch_bset_fix_lookup_table(b, t, where); } +EXPORT_SYMBOL(bch_bset_insert); struct bset_search_iter { struct bkey *l, *r; }; -static struct bset_search_iter bset_search_write_set(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_write_set(struct bset_tree *t, const struct bkey *search) { unsigned li = 0, ri = t->size; - BUG_ON(!b->nsets && - t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); - while (li + 1 != ri) { unsigned m = (li + ri) >> 1; @@ -674,8 +703,7 @@ static struct bset_search_iter bset_search_write_set(struct btree *b, }; } -static struct bset_search_iter bset_search_tree(struct btree *b, - struct bset_tree *t, +static struct bset_search_iter bset_search_tree(struct bset_tree *t, const struct bkey *search) { struct bkey *l, *r; @@ -759,7 +787,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, if (unlikely(!t->size)) { i.l = t->data->start; i.r = bset_bkey_last(t->data); - } else if (bset_written(b, t)) { + } else if (bset_written(&b->keys, t)) { /* * Each node in the auxiliary search tree covers a certain range * of bits, and keys above and below the set it covers might @@ -773,12 +801,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, if (unlikely(bkey_cmp(search, t->data->start) < 0)) return t->data->start; - i = bset_search_tree(b, t, search); - } else - i = bset_search_write_set(b, t, search); + i = bset_search_tree(t, search); + } else { + BUG_ON(!b->keys.nsets && + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); + + i = bset_search_write_set(t, search); + } if (expensive_debug_checks(b->c)) { - BUG_ON(bset_written(b, t) && + BUG_ON(bset_written(&b->keys, t) && i.l != t->data->start && bkey_cmp(tree_to_prev_bkey(t, inorder_to_tree(bkey_to_cacheline(t, i.l), t)), @@ -794,6 +826,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, return i.l; } +EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ @@ -833,7 +866,7 @@ static struct bkey *__bch_btree_iter_init(struct btree *b, iter->b = b; #endif - for (; start <= &b->sets[b->nsets]; start++) { + for (; start <= bset_tree_last(&b->keys); start++) { ret = bch_bset_search(b, start, search); bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); } @@ -845,8 +878,9 @@ struct bkey *bch_btree_iter_init(struct btree *b, struct btree_iter *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->sets); + return __bch_btree_iter_init(b, iter, search, b->keys.set); } +EXPORT_SYMBOL(bch_btree_iter_init); static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, btree_iter_cmp_fn *cmp) @@ -879,9 +913,10 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) return __bch_btree_iter_next(iter, btree_iter_cmp); } +EXPORT_SYMBOL(bch_btree_iter_next); struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, - struct btree *b, ptr_filter_fn fn) + struct btree_keys *b, ptr_filter_fn fn) { struct bkey *ret; @@ -913,15 +948,16 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) return 0; } +EXPORT_SYMBOL(bch_bset_sort_state_init); -static void btree_mergesort(struct btree *b, struct bset *out, +static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; - bool (*bad)(struct btree *, const struct bkey *) = remove_stale + bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; @@ -955,7 +991,7 @@ static void btree_mergesort(struct btree *b, struct bset *out, pr_debug("sorted %i keys", out->keys); } -static void __btree_sort(struct btree *b, struct btree_iter *iter, +static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, unsigned start, unsigned order, bool fixup, struct bset_sort_state *state) { @@ -968,7 +1004,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, out = page_address(mempool_alloc(state->pool, GFP_NOIO)); used_mempool = true; - order = ilog2(bucket_pages(b->c)); + order = state->page_order; } start_time = local_clock(); @@ -983,13 +1019,13 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(&b->c->sb); - out->seq = b->sets[0].data->seq; - out->version = b->sets[0].data->version; - swap(out, b->sets[0].data); + out->magic = b->set->data->magic; + out->seq = b->set->data->seq; + out->version = b->set->data->version; + swap(out, b->set->data); } else { - b->sets[start].data->keys = out->keys; - memcpy(b->sets[start].data->start, out->start, + b->set[start].data->keys = out->keys; + memcpy(b->set[start].data->start, out->start, (void *) bset_bkey_last(out) - (void *) out->start); } @@ -998,7 +1034,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, else free_pages((unsigned long) out, order); - bset_build_written_tree(b); + bch_bset_build_written_tree(b); if (!start) bch_time_stats_update(&state->time, start_time); @@ -1007,34 +1043,32 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, void bch_btree_sort_partial(struct btree *b, unsigned start, struct bset_sort_state *state) { - size_t order = b->page_order, keys = 0; + size_t order = b->keys.page_order, keys = 0; struct btree_iter iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); - - BUG_ON(!bset_written(b, bset_tree_last(b)) && - (bset_tree_last(b)->size || b->nsets)); + __bch_btree_iter_init(b, &iter, NULL, &b->keys.set[start]); if (start) { unsigned i; - for (i = start; i <= b->nsets; i++) - keys += b->sets[i].data->keys; + for (i = start; i <= b->keys.nsets; i++) + keys += b->keys.set[i].data->keys; - order = roundup_pow_of_two(__set_bytes(b->sets->data, + order = roundup_pow_of_two(__set_bytes(b->keys.set->data, keys)) / PAGE_SIZE; if (order) order = ilog2(order); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(&b->keys, &iter, start, order, false, state); EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); } EXPORT_SYMBOL(bch_btree_sort_partial); -void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter, +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, struct bset_sort_state *state) { __btree_sort(b, iter, 0, b->page_order, true, state); @@ -1048,11 +1082,11 @@ void bch_btree_sort_into(struct btree *b, struct btree *new, struct btree_iter iter; bch_btree_iter_init(b, &iter, NULL); - btree_mergesort(b, new->sets->data, &iter, false, true); + btree_mergesort(&b->keys, new->keys.set->data, &iter, false, true); bch_time_stats_update(&state->time, start_time); - new->sets->size = 0; + new->keys.set->size = 0; // XXX: why? } #define SORT_CRIT (4096 / sizeof(uint64_t)) @@ -1062,28 +1096,31 @@ void bch_btree_sort_lazy(struct btree *b, struct bset_sort_state *state) unsigned crit = SORT_CRIT; int i; + b->keys.last_set_unwritten = 0; + /* Don't sort if nothing to do */ - if (!b->nsets) + if (!b->keys.nsets) goto out; - for (i = b->nsets - 1; i >= 0; --i) { + for (i = b->keys.nsets - 1; i >= 0; --i) { crit *= state->crit_factor; - if (b->sets[i].data->keys < crit) { + if (b->keys.set[i].data->keys < crit) { bch_btree_sort_partial(b, i, state); return; } } /* Sort if we'd overflow */ - if (b->nsets + 1 == MAX_BSETS) { + if (b->keys.nsets + 1 == MAX_BSETS) { bch_btree_sort(b, state); return; } out: - bset_build_written_tree(b); + bch_bset_build_written_tree(&b->keys); } +EXPORT_SYMBOL(bch_btree_sort_lazy); /* Sysfs stuff */ @@ -1102,12 +1139,12 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b) stats->nodes++; - for (i = 0; i <= b->nsets; i++) { - struct bset_tree *t = &b->sets[i]; + for (i = 0; i <= b->keys.nsets; i++) { + struct bset_tree *t = &b->keys.set[i]; size_t bytes = t->data->keys * sizeof(uint64_t); size_t j; - if (bset_written(b, t)) { + if (bset_written(&b->keys, t)) { stats->sets_written++; stats->bytes_written += bytes; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index b5797129e91..87da828477f 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -145,6 +145,9 @@ */ struct btree; +struct btree_keys; +struct btree_iter; +struct btree_iter_set; struct bkey_float; #define MAX_BSETS 4U @@ -181,6 +184,74 @@ struct bset_tree { struct bset *data; }; +struct btree_keys_ops { + bool (*sort_cmp)(struct btree_iter_set, + struct btree_iter_set); + struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); + bool (*key_invalid)(struct btree_keys *, + const struct bkey *); + bool (*key_bad)(struct btree_keys *, const struct bkey *); + bool (*key_merge)(struct btree_keys *, + struct bkey *, struct bkey *); + + /* + * Only used for deciding whether to use START_KEY(k) or just the key + * itself in a couple places + */ + bool is_extents; +}; + +struct btree_keys { + const struct btree_keys_ops *ops; + uint8_t page_order; + uint8_t nsets; + unsigned last_set_unwritten:1; + bool *expensive_debug_checks; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; +}; + +static inline struct bset_tree *bset_tree_last(struct btree_keys *b) +{ + return b->set + b->nsets; +} + +static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) +{ + return t <= b->set + b->nsets - b->last_set_unwritten; +} + +static inline bool bkey_written(struct btree_keys *b, struct bkey *k) +{ + return !b->last_set_unwritten || k < b->set[b->nsets].data->start; +} + +static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) +{ + return ((size_t) i) - ((size_t) b->set->data); +} + +static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) +{ + return bset_byte_offset(b, i) >> 9; +} + +static inline bool btree_keys_expensive_checks(struct btree_keys *b) +{ +#ifdef CONFIG_BCACHE_DEBUG + return *b->expensive_debug_checks; +#else + return false; +#endif +} + #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) #define set_bytes(i) __set_bytes(i, i->keys) @@ -189,12 +260,34 @@ struct bset_tree { #define set_blocks(i, block_bytes) \ __set_blocks(i, (i)->keys, block_bytes) -void bch_btree_keys_free(struct btree *); -int bch_btree_keys_alloc(struct btree *, unsigned, gfp_t); +static inline struct bset *bset_next_set(struct btree_keys *b, + unsigned block_bytes) +{ + struct bset *i = bset_tree_last(b)->data; + + return ((void *) i) + roundup(set_bytes(i), block_bytes); +} + +void bch_btree_keys_free(struct btree_keys *); +int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); +void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, + bool *); -void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); -void bch_bset_init_next(struct btree *, struct bset *, uint64_t); -void bch_bset_insert(struct btree *, struct bkey *, struct bkey *); +void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); +void bch_bset_build_written_tree(struct btree_keys *); +void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); +void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); + +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. + */ +static inline bool bch_bkey_try_merge(struct btree_keys *b, + struct bkey *l, struct bkey *r) +{ + return b->ops->key_merge ? b->ops->key_merge(b, l, r) : false; +} /* Btree key iteration */ @@ -208,11 +301,11 @@ struct btree_iter { } data[MAX_BSETS]; }; -typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); +typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); struct bkey *bch_btree_iter_next(struct btree_iter *); struct bkey *bch_btree_iter_next_filter(struct btree_iter *, - struct btree *, ptr_filter_fn); + struct btree_keys *, ptr_filter_fn); void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); struct bkey *bch_btree_iter_init(struct btree *, struct btree_iter *, @@ -246,7 +339,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); void bch_btree_sort_lazy(struct btree *, struct bset_sort_state *); void bch_btree_sort_into(struct btree *, struct btree *, struct bset_sort_state *); -void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *, +void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, struct bset_sort_state *); void bch_btree_sort_partial(struct btree *, unsigned, struct bset_sort_state *); @@ -311,6 +404,16 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) _ret; \ }) +static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_invalid(b, k); +} + +static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_bad(b, k); +} + /* Keylists */ struct keylist { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6734e2759b9..5d7dee8bb85 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -107,14 +107,6 @@ enum { static struct workqueue_struct *btree_io_wq; -static inline bool should_split(struct btree *b) -{ - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (b->written + __set_blocks(i, i->keys + 15, block_bytes(b->c)) - > btree_blocks(b)); -} - #define insert_lock(s, b) ((b)->level <= (s)->lock) /* @@ -182,6 +174,19 @@ static inline bool should_split(struct btree *b) _r; \ }) +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); +} + +static inline bool should_split(struct btree *b) +{ + struct bset *i = write_block(b); + return b->written >= btree_blocks(b) || + (b->written + __set_blocks(i, i->keys + 15, block_bytes(b->c)) + > btree_blocks(b)); +} + /* Btree key manipulation */ void bkey_put(struct cache_set *c, struct bkey *k) @@ -222,7 +227,7 @@ void bch_btree_node_read_done(struct btree *b) goto err; for (; - b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; i = write_block(b)) { err = "unsupported bset version"; if (i->version > BCACHE_BSET_VERSION) @@ -250,7 +255,7 @@ void bch_btree_node_read_done(struct btree *b) } err = "empty set"; - if (i != b->sets[0].data && !i->keys) + if (i != b->keys.set[0].data && !i->keys) goto err; bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); @@ -260,21 +265,22 @@ void bch_btree_node_read_done(struct btree *b) err = "corrupted btree"; for (i = write_block(b); - bset_sector_offset(b, i) < KEY_SIZE(&b->key); + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); i = ((void *) i) + block_bytes(b->c)) - if (i->seq == b->sets[0].data->seq) + if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(b, iter, &b->c->sort); + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); - i = b->sets[0].data; + i = b->keys.set[0].data; err = "short btree key"; - if (b->sets[0].size && - bkey_cmp(&b->key, &b->sets[0].end) < 0) + if (b->keys.set[0].size && + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) goto err; if (b->written < btree_blocks(b)) - bch_bset_init_next(b, write_block(b), bset_magic(&b->c->sb)); + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); out: mempool_free(iter, b->c->fill_iter); return; @@ -308,7 +314,7 @@ static void bch_btree_node_read(struct btree *b) bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; - bch_bio_map(bio, b->sets[0].data); + bch_bio_map(bio, b->keys.set[0].data); bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); @@ -427,7 +433,7 @@ static void do_btree_node_write(struct btree *b) bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + - bset_sector_offset(b, i)); + bset_sector_offset(&b->keys, i)); if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; @@ -475,12 +481,13 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) do_btree_node_write(b); - b->written += set_blocks(i, block_bytes(b->c)); atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + b->written += set_blocks(i, block_bytes(b->c)); + /* If not a leaf node, always sort */ - if (b->level && b->nsets) + if (b->level && b->keys.nsets) bch_btree_sort(b, &b->c->sort); else bch_btree_sort_lazy(b, &b->c->sort); @@ -489,11 +496,12 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) * do verify if there was more than one set initially (i.e. we did a * sort) and we sorted down to a single set: */ - if (i != b->sets->data && !b->nsets) + if (i != b->keys.set->data && !b->keys.nsets) bch_btree_verify(b); if (b->written < btree_blocks(b)) - bch_bset_init_next(b, write_block(b), bset_magic(&b->c->sb)); + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); } static void bch_btree_node_write_sync(struct btree *b) @@ -553,24 +561,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) * mca -> memory cache */ -static void mca_reinit(struct btree *b) -{ - unsigned i; - - b->flags = 0; - b->written = 0; - b->nsets = 0; - - for (i = 0; i < MAX_BSETS; i++) - b->sets[i].size = 0; - /* - * Second loop starts at 1 because b->sets[0]->data is the memory we - * allocated - */ - for (i = 1; i < MAX_BSETS; i++) - b->sets[i].data = NULL; -} - #define mca_reserve(c) (((c->root && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ @@ -580,7 +570,7 @@ static void mca_data_free(struct btree *b) { BUG_ON(b->io_mutex.count != 1); - bch_btree_keys_free(b); + bch_btree_keys_free(&b->keys); b->c->bucket_cache_used--; list_move(&b->list, &b->c->btree_cache_freed); @@ -602,7 +592,7 @@ static unsigned btree_order(struct bkey *k) static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { - if (!bch_btree_keys_alloc(b, + if (!bch_btree_keys_alloc(&b->keys, max_t(unsigned, ilog2(b->c->btree_pages), btree_order(k)), @@ -642,9 +632,9 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) if (!down_write_trylock(&b->lock)) return -ENOMEM; - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); - if (b->page_order < min_order) + if (b->keys.page_order < min_order) goto out_unlock; if (!flush) { @@ -809,7 +799,7 @@ int bch_btree_cache_alloc(struct cache_set *c) c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); if (c->verify_data && - c->verify_data->sets[0].data) + c->verify_data->keys.set->data) list_del_init(&c->verify_data->list); else c->verify_data = NULL; @@ -907,7 +897,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) list_for_each_entry(b, &c->btree_cache_freed, list) if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); - if (!b->sets[0].data) + if (!b->keys.set[0].data) goto err; else goto out; @@ -918,7 +908,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) goto err; BUG_ON(!down_write_trylock(&b->lock)); - if (!b->sets->data) + if (!b->keys.set->data) goto err; out: BUG_ON(b->io_mutex.count != 1); @@ -929,15 +919,17 @@ out: hlist_add_head_rcu(&b->hash, mca_hash(c, k)); lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); - b->level = level; b->parent = (void *) ~0UL; + b->flags = 0; + b->written = 0; + b->level = level; if (!b->level) - b->ops = &bch_extent_keys_ops; + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, + &b->c->expensive_debug_checks); else - b->ops = &bch_btree_keys_ops; - - mca_reinit(b); + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, + &b->c->expensive_debug_checks); return b; err: @@ -998,13 +990,13 @@ retry: b->accessed = 1; - for (; i <= b->nsets && b->sets[i].size; i++) { - prefetch(b->sets[i].tree); - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); + prefetch(b->keys.set[i].data); } - for (; i <= b->nsets; i++) - prefetch(b->sets[i].data); + for (; i <= b->keys.nsets; i++) + prefetch(b->keys.set[i].data); if (btree_node_io_error(b)) { rw_unlock(write, b); @@ -1084,7 +1076,7 @@ retry: } b->accessed = 1; - bch_bset_init_next(b, b->sets->data, bset_magic(&b->c->sb)); + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); mutex_unlock(&c->bucket_lock); @@ -1215,7 +1207,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) stale = max(stale, btree_mark_key(b, k)); keys++; - if (bch_ptr_bad(b, k)) + if (bch_ptr_bad(&b->keys, k)) continue; gc->key_bytes += bkey_u64s(k); @@ -1225,9 +1217,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) gc->data += KEY_SIZE(k); } - for (t = b->sets; t <= &b->sets[b->nsets]; t++) + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) btree_bug_on(t->size && - bset_written(b, t) && + bset_written(&b->keys, t) && bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); @@ -1271,7 +1263,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || - __set_blocks(b->sets[0].data, keys, + __set_blocks(b->keys.set[0].data, keys, block_bytes(b->c)) > blocks * (nodes - 1)) return 0; @@ -1428,7 +1420,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, r[i].b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, k, b->level - 1, true); if (IS_ERR(r->b)) { @@ -1764,7 +1756,8 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, bch_btree_iter_init(b, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad); if (k) btree_node_prefetch(b->c, k, b->level - 1); @@ -1894,7 +1887,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); - if (bkey_written(b, k)) { + if (bkey_written(&b->keys, k)) { /* * We insert a new key to cover the top of the * old key, and the old key is modified in place @@ -1907,19 +1900,20 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, * depends on us inserting a new key for the top * here. */ - top = bch_bset_search(b, bset_tree_last(b), + top = bch_bset_search(b, + bset_tree_last(&b->keys), insert); - bch_bset_insert(b, top, k); + bch_bset_insert(&b->keys, top, k); } else { BKEY_PADDED(key) temp; bkey_copy(&temp.key, k); - bch_bset_insert(b, k, &temp.key); + bch_bset_insert(&b->keys, k, &temp.key); top = bkey_next(k); } bch_cut_front(insert, top); bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); + bch_bset_fix_invalidated_key(&b->keys, k); return false; } @@ -1929,7 +1923,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) old_offset = KEY_START(insert); - if (bkey_written(b, k) && + if (bkey_written(&b->keys, k) && bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { /* * Completely overwrote, so we don't have to @@ -1938,7 +1932,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, bch_cut_front(k, k); } else { __bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(b, k); + bch_bset_fix_invalidated_key(&b->keys, k); } } @@ -1979,7 +1973,8 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, * the previous key. */ prev = NULL; - m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); + m = bch_btree_iter_init(b, &iter, + PRECEDING_KEY(&START_KEY(k))); if (fix_overlapping_extents(b, k, &iter, replace_key)) { op->insert_collision = true; @@ -2000,7 +1995,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, /* prev is in the tree, if we merge we're done */ status = BTREE_INSERT_STATUS_BACK_MERGE; if (prev && - bch_bkey_try_merge(b, prev, k)) + bch_bkey_try_merge(&b->keys, prev, k)) goto merged; status = BTREE_INSERT_STATUS_OVERWROTE; @@ -2010,14 +2005,14 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, status = BTREE_INSERT_STATUS_FRONT_MERGE; if (m != bset_bkey_last(i) && - bch_bkey_try_merge(b, k, m)) + bch_bkey_try_merge(&b->keys, k, m)) goto copy; } else { BUG_ON(replace_key); - m = bch_bset_search(b, bset_tree_last(b), k); + m = bch_bset_search(b, bset_tree_last(&b->keys), k); } -insert: bch_bset_insert(b, m, k); +insert: bch_bset_insert(&b->keys, m, k); copy: bkey_copy(m, k); merged: bch_check_keys(b, "%u for %s", status, @@ -2362,7 +2357,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, bch_btree_iter_init(b, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, b, + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2395,7 +2390,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, bch_btree_iter_init(b, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : btree(map_keys_recurse, k, b, op, from, fn, flags); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 0b436079db7..04e81f8ab89 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -113,28 +113,7 @@ struct btree_write { int prio_blocked; }; -struct btree_keys_ops { - bool (*sort_cmp)(struct btree_iter_set, - struct btree_iter_set); - struct bkey *(*sort_fixup)(struct btree_iter *, - struct bkey *); - bool (*key_invalid)(struct btree *, - const struct bkey *); - bool (*key_bad)(struct btree *, - const struct bkey *); - bool (*key_merge)(struct btree *, - struct bkey *, struct bkey *); - - - /* - * Only used for deciding whether to use START_KEY(k) or just the key - * itself in a couple places - */ - bool is_extents; -}; - struct btree { - const struct btree_keys_ops *ops; /* Hottest entries first */ struct hlist_node hash; @@ -151,17 +130,8 @@ struct btree { unsigned long flags; uint16_t written; /* would be nice to kill */ uint8_t level; - uint8_t nsets; - uint8_t page_order; - - /* - * Set of sorted keys - the real btree node - plus a binary search tree - * - * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point - * to the memory we have allocated for this btree node. Additionally, - * set[0]->data points to the entire btree node as it exists on disk. - */ - struct bset_tree sets[MAX_BSETS]; + + struct btree_keys keys; /* For outstanding btree writes, used as a lock - protects write_idx */ struct closure io; @@ -201,49 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b) return b->writes + (btree_node_write_idx(b) ^ 1); } -static inline struct bset_tree *bset_tree_last(struct btree *b) -{ - return b->sets + b->nsets; -} - static inline struct bset *btree_bset_first(struct btree *b) { - return b->sets->data; + return b->keys.set->data; } static inline struct bset *btree_bset_last(struct btree *b) { - return bset_tree_last(b)->data; -} - -static inline unsigned bset_byte_offset(struct btree *b, struct bset *i) -{ - return ((size_t) i) - ((size_t) b->sets->data); -} - -static inline unsigned bset_sector_offset(struct btree *b, struct bset *i) -{ - return (((void *) i) - ((void *) btree_bset_first(b))) >> 9; + return bset_tree_last(&b->keys)->data; } static inline unsigned bset_block_offset(struct btree *b, struct bset *i) { - return bset_sector_offset(b, i) >> b->c->block_bits; -} - -static inline struct bset *write_block(struct btree *b) -{ - return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); -} - -static inline bool bset_written(struct btree *b, struct bset_tree *t) -{ - return t->data < write_block(b); -} - -static inline bool bkey_written(struct btree *b, struct bkey *k) -{ - return k < write_block(b)->start; + return bset_sector_offset(&b->keys, i) >> b->c->block_bits; } static inline void set_gc_sectors(struct cache_set *c) @@ -251,27 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c) atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } -static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) -{ - return b->ops->key_invalid(b, k); -} - -static inline bool bch_ptr_bad(struct btree *b, const struct bkey *k) -{ - return b->ops->key_bad(b, k); -} - -/* - * Tries to merge l and r: l should be lower than r - * Returns true if we were able to merge. If we did merge, l will be the merged - * key, r will be untouched. - */ -static inline bool bch_bkey_try_merge(struct btree *b, - struct bkey *l, struct bkey *r) -{ - return b->ops->key_merge ? b->ops->key_merge(b, l, r) : false; -} - void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ @@ -284,7 +203,7 @@ void bkey_put(struct cache_set *c, struct bkey *k); #define for_each_key_filter(b, k, iter, filter) \ for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), b, filter));) + ((k) = bch_btree_iter_next_filter((iter), &(b)->keys, filter));) #define for_each_key(b, k, iter) \ for (bch_btree_iter_init((b), (iter), NULL); \ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 2c6587d016d..8acc18af07c 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -113,9 +113,9 @@ static void bch_dump_bucket(struct btree *b) unsigned i; console_lock(); - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data, - bset_block_offset(b, b->sets[i].data)); + for (i = 0; i <= b->keys.nsets; i++) + dump_bset(b, b->keys.set[i].data, + bset_block_offset(b, b->keys.set[i].data)); console_unlock(); } @@ -139,13 +139,13 @@ void bch_btree_verify(struct btree *b) mutex_lock(&b->c->verify_lock); ondisk = b->c->verify_ondisk; - sorted = b->c->verify_data->sets->data; - inmemory = b->sets->data; + sorted = b->c->verify_data->keys.set->data; + inmemory = b->keys.set->data; bkey_copy(&v->key, &b->key); v->written = 0; v->level = b->level; - v->ops = b->ops; + v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; @@ -159,7 +159,7 @@ void bch_btree_verify(struct btree *b) memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); bch_btree_node_read_done(v); - sorted = v->sets->data; + sorted = v->keys.set->data; if (inmemory->keys != sorted->keys || memcmp(inmemory->start, @@ -264,14 +264,14 @@ void __bch_check_keys(struct btree *b, const char *fmt, ...) if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) goto bug; - if (bch_ptr_invalid(b, k)) + if (bch_ptr_invalid(&b->keys, k)) continue; err = "Overlapping keys"; if (p && bkey_cmp(p, &START_KEY(k)) > 0) goto bug; } else { - if (bch_ptr_bad(b, k)) + if (bch_ptr_bad(&b->keys, k)) continue; err = "Duplicate keys"; diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 8fe6aaece41..ba3021128e7 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -81,8 +81,9 @@ bad: return true; } -static bool bch_btree_ptr_invalid(struct btree *b, const struct bkey *k) +static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) { + struct btree *b = container_of(bk, struct btree, keys); return __bch_btree_ptr_invalid(b->c, k); } @@ -118,13 +119,14 @@ err: return true; } -static bool bch_btree_ptr_bad(struct btree *b, const struct bkey *k) +static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) { + struct btree *b = container_of(bk, struct btree, keys); unsigned i; if (!bkey_cmp(k, &ZERO_KEY) || !KEY_PTRS(k) || - bch_ptr_invalid(b, k)) + bch_ptr_invalid(bk, k)) return true; for (i = 0; i < KEY_PTRS(k); i++) @@ -209,8 +211,9 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, return NULL; } -static bool bch_extent_invalid(struct btree *b, const struct bkey *k) +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) { + struct btree *b = container_of(bk, struct btree, keys); char buf[80]; if (!KEY_SIZE(k)) @@ -259,13 +262,14 @@ err: return true; } -static bool bch_extent_bad(struct btree *b, const struct bkey *k) +static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) { + struct btree *b = container_of(bk, struct btree, keys); struct bucket *g; unsigned i, stale; if (!KEY_PTRS(k) || - bch_extent_invalid(b, k)) + bch_extent_invalid(bk, k)) return true; for (i = 0; i < KEY_PTRS(k); i++) @@ -303,8 +307,9 @@ static uint64_t merge_chksums(struct bkey *l, struct bkey *r) ~((uint64_t)1 << 63); } -static bool bch_extent_merge(struct btree *b, struct bkey *l, struct bkey *r) +static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) { + struct btree *b = container_of(bk, struct btree, keys); unsigned i; if (key_merging_disabled(b->c)) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 206c80fb27c..7e175dbc76b 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -433,7 +433,7 @@ lock_root: mutex_lock(&c->bucket_lock); list_for_each_entry(b, &c->btree_cache, list) - ret += 1 << (b->page_order + PAGE_SHIFT); + ret += 1 << (b->keys.page_order + PAGE_SHIFT); mutex_unlock(&c->bucket_lock); return ret; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 0c5cf2f63dc..7110897c3df 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -247,7 +247,7 @@ TRACE_EVENT(bcache_btree_write, TP_fast_assign( __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); __entry->block = b->written; - __entry->keys = b->sets[b->nsets].data->keys; + __entry->keys = b->keys.set[b->keys.nsets].data->keys; ), TP_printk("bucket %zu", __entry->bucket) -- cgit v1.2.3-70-g09d2 From 59158fde429fb5d18064e2734b3dd5e6048affbd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 19:03:54 -0800 Subject: bcache: Add bch_btree_keys_u64s_remaining() Helper function to explicitly check how much space is free in a btree node Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.h | 15 +++++++++++++++ drivers/md/bcache/btree.c | 28 +++++++++++++++------------- include/uapi/linux/bcache.h | 1 + 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 87da828477f..4fc40fd719d 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -260,6 +260,21 @@ static inline bool btree_keys_expensive_checks(struct btree_keys *b) #define set_blocks(i, block_bytes) \ __set_blocks(i, (i)->keys, block_bytes) +static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON((PAGE_SIZE << b->page_order) < + (bset_byte_offset(b, t->data) + set_bytes(t->data))); + + if (!b->last_set_unwritten) + return 0; + + return ((PAGE_SIZE << b->page_order) - + (bset_byte_offset(b, t->data) + set_bytes(t->data))) / + sizeof(u64); +} + static inline struct bset *bset_next_set(struct btree_keys *b, unsigned block_bytes) { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5d7dee8bb85..2c90003ff4c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -179,14 +179,6 @@ static inline struct bset *write_block(struct btree *b) return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); } -static inline bool should_split(struct btree *b) -{ - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (b->written + __set_blocks(i, i->keys + 15, block_bytes(b->c)) - > btree_blocks(b)); -} - /* Btree key manipulation */ void bkey_put(struct cache_set *c, struct bkey *k) @@ -2026,6 +2018,19 @@ merged: return true; } +static size_t insert_u64s_remaining(struct btree *b) +{ + ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys); + + /* + * Might land in the middle of an existing extent and have to split it + */ + if (b->keys.ops->is_extents) + ret -= KEY_MAX_U64S; + + return max(ret, 0L); +} + static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct keylist *insert_keys, struct bkey *replace_key) @@ -2034,12 +2039,9 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, int oldsize = bch_count_data(b); while (!bch_keylist_empty(insert_keys)) { - struct bset *i = write_block(b); struct bkey *k = insert_keys->keys; - if (b->written + - __set_blocks(i, i->keys + bkey_u64s(k), - block_bytes(b->c)) > btree_blocks(b)) + if (bkey_u64s(k) > insert_u64s_remaining(b)) break; if (bkey_cmp(k, &b->key) <= 0) { @@ -2203,7 +2205,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, { BUG_ON(b->level && replace_key); - if (should_split(b)) { + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { if (current->bio_list) { op->lock = b->c->root->level + 1; return -EAGAIN; diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index ae66311be82..22b6ad31c70 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -39,6 +39,7 @@ static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ } #define KEY_SIZE_BITS 16 +#define KEY_MAX_U64S 8 KEY_FIELD(KEY_PTRS, high, 60, 3) KEY_FIELD(HEADER_SIZE, high, 58, 2) -- cgit v1.2.3-70-g09d2 From f67342dd342d5917d94a7c0ffbde5f78e0d7a57a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 19:25:55 -0800 Subject: bcache: Refactor bset_tree sysfs stats We're in the process of turning bset.c into library code, so none of the code in that file should know about struct cache_set or struct btree - so, move the btree traversal part of the stats code to sysfs.c. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 49 ++++------------------------------------------- drivers/md/bcache/bset.h | 10 ++++++++-- drivers/md/bcache/sysfs.c | 42 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 47 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f34ef56560e..a3ffc3711b7 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1122,29 +1122,16 @@ out: } EXPORT_SYMBOL(bch_btree_sort_lazy); -/* Sysfs stuff */ - -struct bset_stats { - struct btree_op op; - size_t nodes; - size_t sets_written, sets_unwritten; - size_t bytes_written, bytes_unwritten; - size_t floats, failed; -}; - -static int btree_bset_stats(struct btree_op *op, struct btree *b) +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { - struct bset_stats *stats = container_of(op, struct bset_stats, op); unsigned i; - stats->nodes++; - - for (i = 0; i <= b->keys.nsets; i++) { - struct bset_tree *t = &b->keys.set[i]; + for (i = 0; i <= b->nsets; i++) { + struct bset_tree *t = &b->set[i]; size_t bytes = t->data->keys * sizeof(uint64_t); size_t j; - if (bset_written(&b->keys, t)) { + if (bset_written(b, t)) { stats->sets_written++; stats->bytes_written += bytes; @@ -1158,32 +1145,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b) stats->bytes_unwritten += bytes; } } - - return MAP_CONTINUE; -} - -int bch_bset_print_stats(struct cache_set *c, char *buf) -{ - struct bset_stats t; - int ret; - - memset(&t, 0, sizeof(struct bset_stats)); - bch_btree_op_init(&t.op, -1); - - ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); - if (ret < 0) - return ret; - - return snprintf(buf, PAGE_SIZE, - "btree nodes: %zu\n" - "written sets: %zu\n" - "unwritten sets: %zu\n" - "written key bytes: %zu\n" - "unwritten key bytes: %zu\n" - "floats: %zu\n" - "failed: %zu\n", - t.nodes, - t.sets_written, t.sets_unwritten, - t.bytes_written, t.bytes_unwritten, - t.floats, t.failed); } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 4fc40fd719d..49135695342 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -365,6 +365,14 @@ static inline void bch_btree_sort(struct btree *b, bch_btree_sort_partial(b, 0, state); } +struct bset_stats { + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); + /* Bkey utility code */ #define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) @@ -495,6 +503,4 @@ int __bch_keylist_realloc(struct keylist *, unsigned); struct cache_set; const char *bch_ptr_status(struct cache_set *, const struct bkey *); -int bch_bset_print_stats(struct cache_set *, char *); - #endif diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 7e175dbc76b..db2111b88e2 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -400,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = { }; KTYPE(bch_flash_dev); +struct bset_stats_op { + struct btree_op op; + size_t nodes; + struct bset_stats stats; +}; + +static int btree_bset_stats(struct btree_op *b_op, struct btree *b) +{ + struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); + + op->nodes++; + bch_btree_keys_stats(&b->keys, &op->stats); + + return MAP_CONTINUE; +} + +int bch_bset_print_stats(struct cache_set *c, char *buf) +{ + struct bset_stats_op op; + int ret; + + memset(&op, 0, sizeof(op)); + bch_btree_op_init(&op.op, -1); + + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats); + if (ret < 0) + return ret; + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "unwritten sets: %zu\n" + "written key bytes: %zu\n" + "unwritten key bytes: %zu\n" + "floats: %zu\n" + "failed: %zu\n", + op.nodes, + op.stats.sets_written, op.stats.sets_unwritten, + op.stats.bytes_written, op.stats.bytes_unwritten, + op.stats.floats, op.stats.failed); +} + SHOW(__bch_cache_set) { unsigned root_usage(struct cache_set *c) -- cgit v1.2.3-70-g09d2 From c052dd9a26f60bcf70c0c3fcc08e07abb60295cd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 17:35:24 -0800 Subject: bcache: Convert btree_iter to struct btree_keys More work to disentangle bset.c from struct btree Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 22 +++++++++++----------- drivers/md/bcache/bset.h | 19 ++++++++++++++----- drivers/md/bcache/btree.c | 22 +++++++++++----------- drivers/md/bcache/btree.h | 8 -------- drivers/md/bcache/debug.c | 6 ++++-- drivers/md/bcache/sysfs.c | 2 +- 6 files changed, 41 insertions(+), 38 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index a3ffc3711b7..097bd8d2acb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -764,7 +764,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, return (struct bset_search_iter) {l, r}; } -struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search) { struct bset_search_iter i; @@ -787,7 +787,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, if (unlikely(!t->size)) { i.l = t->data->start; i.r = bset_bkey_last(t->data); - } else if (bset_written(&b->keys, t)) { + } else if (bset_written(b, t)) { /* * Each node in the auxiliary search tree covers a certain range * of bits, and keys above and below the set it covers might @@ -803,14 +803,14 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, i = bset_search_tree(t, search); } else { - BUG_ON(!b->keys.nsets && + BUG_ON(!b->nsets && t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); i = bset_search_write_set(t, search); } - if (expensive_debug_checks(b->c)) { - BUG_ON(bset_written(&b->keys, t) && + if (btree_keys_expensive_checks(b)) { + BUG_ON(bset_written(b, t) && i.l != t->data->start && bkey_cmp(tree_to_prev_bkey(t, inorder_to_tree(bkey_to_cacheline(t, i.l), t)), @@ -853,7 +853,7 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree *b, +static struct bkey *__bch_btree_iter_init(struct btree_keys *b, struct btree_iter *iter, struct bkey *search, struct bset_tree *start) @@ -866,7 +866,7 @@ static struct bkey *__bch_btree_iter_init(struct btree *b, iter->b = b; #endif - for (; start <= bset_tree_last(&b->keys); start++) { + for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); } @@ -874,11 +874,11 @@ static struct bkey *__bch_btree_iter_init(struct btree *b, return ret; } -struct bkey *bch_btree_iter_init(struct btree *b, +struct bkey *bch_btree_iter_init(struct btree_keys *b, struct btree_iter *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->keys.set); + return __bch_btree_iter_init(b, iter, search, b->set); } EXPORT_SYMBOL(bch_btree_iter_init); @@ -1047,7 +1047,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start, struct btree_iter iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->keys.set[start]); + __bch_btree_iter_init(&b->keys, &iter, NULL, &b->keys.set[start]); if (start) { unsigned i; @@ -1080,7 +1080,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new, uint64_t start_time = local_clock(); struct btree_iter iter; - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_init(&b->keys, &iter, NULL); btree_mergesort(&b->keys, new->keys.set->data, &iter, false, true); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 49135695342..563130c2814 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -309,7 +309,7 @@ static inline bool bch_bkey_try_merge(struct btree_keys *b, struct btree_iter { size_t size, used; #ifdef CONFIG_BCACHE_DEBUG - struct btree *b; + struct btree_keys *b; #endif struct btree_iter_set { struct bkey *k, *end; @@ -323,21 +323,30 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *, struct btree_keys *, ptr_filter_fn); void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *bch_btree_iter_init(struct btree *, struct btree_iter *, +struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, struct bkey *); -struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, - const struct bkey *); +struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, + const struct bkey *); /* * Returns the first key that is strictly greater than search */ -static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, +static inline struct bkey *bch_bset_search(struct btree_keys *b, + struct bset_tree *t, const struct bkey *search) { return search ? __bch_bset_search(b, t, search) : t->data->start; } +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + /* Sorting */ struct bset_sort_state { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 2c90003ff4c..9424c8a15e3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -212,7 +212,7 @@ void bch_btree_node_read_done(struct btree *b) iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->b = &b->keys; #endif if (!i->seq) @@ -1195,7 +1195,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) gc->nodes++; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { stale = max(stale, btree_mark_key(b, k)); keys++; @@ -1386,7 +1386,7 @@ static unsigned btree_gc_count_keys(struct btree *b) struct btree_iter iter; unsigned ret = 0; - for_each_key_filter(b, k, &iter, bch_ptr_bad) + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); return ret; @@ -1406,7 +1406,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct gc_merge_info *last = r + GC_MERGE_NODES - 1; bch_keylist_init(&keys); - bch_btree_iter_init(b, &iter, &b->c->gc_done); + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); for (i = 0; i < GC_MERGE_NODES; i++) r[i].b = ERR_PTR(-EINTR); @@ -1722,7 +1722,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, struct bucket *g; struct btree_iter iter; - for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { for (i = 0; i < KEY_PTRS(k); i++) { if (!ptr_available(b->c, k, i)) continue; @@ -1745,7 +1745,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, } if (b->level) { - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_init(&b->keys, &iter, NULL); do { k = bch_btree_iter_next_filter(&iter, &b->keys, @@ -1892,7 +1892,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, * depends on us inserting a new key for the top * here. */ - top = bch_bset_search(b, + top = bch_bset_search(&b->keys, bset_tree_last(&b->keys), insert); bch_bset_insert(&b->keys, top, k); @@ -1965,7 +1965,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, * the previous key. */ prev = NULL; - m = bch_btree_iter_init(b, &iter, + m = bch_btree_iter_init(&b->keys, &iter, PRECEDING_KEY(&START_KEY(k))); if (fix_overlapping_extents(b, k, &iter, replace_key)) { @@ -2001,7 +2001,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, goto copy; } else { BUG_ON(replace_key); - m = bch_bset_search(b, bset_tree_last(&b->keys), k); + m = bch_bset_search(&b->keys, bset_tree_last(&b->keys), k); } insert: bch_bset_insert(&b->keys, m, k); @@ -2357,7 +2357,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, from); + bch_btree_iter_init(&b->keys, &iter, from); while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { @@ -2390,7 +2390,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *k; struct btree_iter iter; - bch_btree_iter_init(b, &iter, from); + bch_btree_iter_init(&b->keys, &iter, from); while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 04e81f8ab89..af065e97e55 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -201,14 +201,6 @@ void bkey_put(struct cache_set *c, struct bkey *k); iter++) \ hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), &(b)->keys, filter));) - -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) - /* Recursing down the btree */ struct btree_op { diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 8acc18af07c..3de27e2a4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -246,7 +246,7 @@ int __bch_count_data(struct btree *b) struct bkey *k; if (!b->level) - for_each_key(b, k, &iter) + for_each_key(&b->keys, k, &iter) ret += KEY_SIZE(k); return ret; } @@ -258,7 +258,7 @@ void __bch_check_keys(struct btree *b, const char *fmt, ...) struct btree_iter iter; const char *err; - for_each_key(b, k, &iter) { + for_each_key(&b->keys, k, &iter) { if (!b->level) { err = "Keys out of order"; if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) @@ -298,6 +298,7 @@ bug: void bch_btree_iter_next_check(struct btree_iter *iter) { +#if 0 struct bkey *k = iter->data->k, *next = bkey_next(k); if (next < iter->data->end && @@ -305,6 +306,7 @@ void bch_btree_iter_next_check(struct btree_iter *iter) bch_dump_bucket(iter->b); panic("Key skipped backwards\n"); } +#endif } #endif diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index db2111b88e2..c6ab69333a6 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -460,7 +460,7 @@ lock_root: rw_lock(false, b, b->level); } while (b != c->root); - for_each_key_filter(b, k, &iter, bch_ptr_bad) + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) bytes += bkey_bytes(k); rw_unlock(false, b); -- cgit v1.2.3-70-g09d2 From dc9d98d621bdce0552997200ce855659875a5c9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 23:47:33 -0800 Subject: bcache: Convert debug code to btree_keys More work to disentangle various code from struct btree Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 117 ++++++++++++++++++++++++++++- drivers/md/bcache/bset.h | 56 +++++++++++--- drivers/md/bcache/btree.c | 8 +- drivers/md/bcache/debug.c | 179 ++------------------------------------------ drivers/md/bcache/debug.h | 23 +----- drivers/md/bcache/extents.c | 93 ++++++++++++++++++++++- drivers/md/bcache/extents.h | 1 + drivers/md/bcache/super.c | 2 +- drivers/md/bcache/util.h | 2 + 9 files changed, 264 insertions(+), 217 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 097bd8d2acb..448cff8cc05 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -7,11 +7,121 @@ #include "bcache.h" #include "btree.h" -#include "debug.h" +#include #include #include +#ifdef CONFIG_BCACHE_DEBUG + +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) +{ + struct bkey *k, *next; + + for (k = i->start; k < bset_bkey_last(i); k = next) { + next = bkey_next(k); + + printk(KERN_ERR "block %u key %zi/%u: ", set, + (uint64_t *) k - i->d, i->keys); + + if (b->ops->key_dump) + b->ops->key_dump(b, k); + else + printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + + if (next < bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? + &START_KEY(next) : next) > 0) + printk(KERN_ERR "Key skipped backwards\n"); + } +} + +void bch_dump_bucket(struct btree_keys *b) +{ + unsigned i; + + console_lock(); + for (i = 0; i <= b->nsets; i++) + bch_dump_bset(b, b->set[i].data, + bset_sector_offset(b, b->set[i].data)); + console_unlock(); +} + +int __bch_count_data(struct btree_keys *b) +{ + unsigned ret = 0; + struct btree_iter iter; + struct bkey *k; + + if (b->ops->is_extents) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} + +void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) +{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + const char *err; + + for_each_key(b, k, &iter) { + if (b->ops->is_extents) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; + } + p = k; + } +#if 0 + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; +#endif + return; +bug: + bch_dump_bucket(b); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + panic("bch_check_keys error: %s:\n", err); +} + +static void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->ops->is_extents ? + &START_KEY(next) : next) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } +} + +#else + +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#endif + /* Keylists */ int __bch_keylist_realloc(struct keylist *l, unsigned u64s) @@ -1045,7 +1155,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start, { size_t order = b->keys.page_order, keys = 0; struct btree_iter iter; - int oldsize = bch_count_data(b); + int oldsize = bch_count_data(&b->keys); __bch_btree_iter_init(&b->keys, &iter, NULL, &b->keys.set[start]); @@ -1063,7 +1173,8 @@ void bch_btree_sort_partial(struct btree *b, unsigned start, __btree_sort(&b->keys, &iter, start, order, false, state); - EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); + EBUG_ON(b->written && oldsize >= 0 && + bch_count_data(&b->keys) != oldsize); } EXPORT_SYMBOL(bch_btree_sort_partial); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 563130c2814..e01e69e0065 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -193,6 +193,8 @@ struct btree_keys_ops { bool (*key_bad)(struct btree_keys *, const struct bkey *); bool (*key_merge)(struct btree_keys *, struct bkey *, struct bkey *); + void (*key_to_text)(char *, size_t, const struct bkey *); + void (*key_dump)(struct btree_keys *, const struct bkey *); /* * Only used for deciding whether to use START_KEY(k) or just the key @@ -243,15 +245,6 @@ static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) return bset_byte_offset(b, i) >> 9; } -static inline bool btree_keys_expensive_checks(struct btree_keys *b) -{ -#ifdef CONFIG_BCACHE_DEBUG - return *b->expensive_debug_checks; -#else - return false; -#endif -} - #define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) #define set_bytes(i) __set_bytes(i, i->keys) @@ -446,6 +439,12 @@ static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) return b->ops->key_bad(b, k); } +static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, + size_t size, const struct bkey *k) +{ + return b->ops->key_to_text(buf, size, k); +} + /* Keylists */ struct keylist { @@ -509,7 +508,42 @@ struct bkey *bch_keylist_pop(struct keylist *); void bch_keylist_pop_front(struct keylist *); int __bch_keylist_realloc(struct keylist *, unsigned); -struct cache_set; -const char *bch_ptr_status(struct cache_set *, const struct bkey *); +/* Debug stuff */ + +#ifdef CONFIG_BCACHE_DEBUG + +int __bch_count_data(struct btree_keys *); +void __bch_check_keys(struct btree_keys *, const char *, ...); +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); +void bch_dump_bucket(struct btree_keys *); + +#else + +static inline int __bch_count_data(struct btree_keys *b) { return -1; } +static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} +static inline void bch_dump_bucket(struct btree_keys *b) {} +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); + +#endif + +static inline bool btree_keys_expensive_checks(struct btree_keys *b) +{ +#ifdef CONFIG_BCACHE_DEBUG + return *b->expensive_debug_checks; +#else + return false; +#endif +} + +static inline int bch_count_data(struct btree_keys *b) +{ + return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; +} + +#define bch_check_keys(b, ...) \ +do { \ + if (btree_keys_expensive_checks(b)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) #endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 9424c8a15e3..2128ee1e691 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -460,7 +460,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(btree_bset_first(b)->seq != i->seq); - bch_check_keys(b, "writing"); + bch_check_keys(&b->keys, "writing"); cancel_delayed_work(&b->work); @@ -2007,7 +2007,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: bch_bset_insert(&b->keys, m, k); copy: bkey_copy(m, k); merged: - bch_check_keys(b, "%u for %s", status, + bch_check_keys(&b->keys, "%u for %s", status, replace_key ? "replace" : "insert"); if (b->level && !KEY_OFFSET(k)) @@ -2036,7 +2036,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct bkey *replace_key) { bool ret = false; - int oldsize = bch_count_data(b); + int oldsize = bch_count_data(&b->keys); while (!bch_keylist_empty(insert_keys)) { struct bkey *k = insert_keys->keys; @@ -2066,7 +2066,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, BUG_ON(!bch_keylist_empty(insert_keys) && b->level); - BUG_ON(bch_count_data(b) < oldsize); + BUG_ON(bch_count_data(&b->keys) < oldsize); return ret; } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 3de27e2a4e0..8b1f1d5c181 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -8,6 +8,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include #include @@ -17,108 +18,8 @@ static struct dentry *debug; -const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(c, k, i)) { - struct cache *ca = PTR_CACHE(c, k, i); - size_t bucket = PTR_BUCKET_NR(c, k, i); - size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); - - if (KEY_SIZE(k) + r > c->sb.bucket_size) - return "bad, length too big"; - if (bucket < ca->sb.first_bucket) - return "bad, short offset"; - if (bucket >= ca->sb.nbuckets) - return "bad, offset past end of device"; - if (ptr_stale(c, k, i)) - return "stale"; - } - - if (!bkey_cmp(k, &ZERO_KEY)) - return "bad, null key"; - if (!KEY_PTRS(k)) - return "bad, no pointers"; - if (!KEY_SIZE(k)) - return "zeroed key"; - return ""; -} - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) -{ - unsigned i = 0; - char *out = buf, *end = buf + size; - -#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) - - p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); - - for (i = 0; i < KEY_PTRS(k); i++) { - if (i) - p(", "); - - if (PTR_DEV(k, i) == PTR_CHECK_DEV) - p("check dev"); - else - p("%llu:%llu gen %llu", PTR_DEV(k, i), - PTR_OFFSET(k, i), PTR_GEN(k, i)); - } - - p("]"); - - if (KEY_DIRTY(k)) - p(" dirty"); - if (KEY_CSUM(k)) - p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); -#undef p - return out - buf; -} - #ifdef CONFIG_BCACHE_DEBUG -static void dump_bset(struct btree *b, struct bset *i, unsigned set) -{ - struct bkey *k, *next; - unsigned j; - char buf[80]; - - for (k = i->start; k < bset_bkey_last(i); k = next) { - next = bkey_next(k); - - bch_bkey_to_text(buf, sizeof(buf), k); - printk(KERN_ERR "b %u k %zi/%u: %s", set, - (uint64_t *) k - i->d, i->keys, buf); - - for (j = 0; j < KEY_PTRS(k); j++) { - size_t n = PTR_BUCKET_NR(b->c, k, j); - printk(" bucket %zu", n); - - if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) - printk(" prio %i", - PTR_BUCKET(b->c, k, j)->prio); - } - - printk(" %s\n", bch_ptr_status(b->c, k)); - - if (next < bset_bkey_last(i) && - bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) - printk(KERN_ERR "Key skipped backwards\n"); - } -} - -static void bch_dump_bucket(struct btree *b) -{ - unsigned i; - - console_lock(); - for (i = 0; i <= b->keys.nsets; i++) - dump_bset(b, b->keys.set[i].data, - bset_block_offset(b, b->keys.set[i].data)); - console_unlock(); -} - #define for_each_written_bset(b, start, i) \ for (i = (start); \ (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ @@ -171,17 +72,17 @@ void bch_btree_verify(struct btree *b) console_lock(); printk(KERN_ERR "*** in memory:\n"); - dump_bset(b, inmemory, 0); + bch_dump_bset(&b->keys, inmemory, 0); printk(KERN_ERR "*** read back in:\n"); - dump_bset(v, sorted, 0); + bch_dump_bset(&v->keys, sorted, 0); for_each_written_bset(b, ondisk, i) { unsigned block = ((void *) i - (void *) ondisk) / block_bytes(b->c); printk(KERN_ERR "*** on disk block %u:\n", block); - dump_bset(b, i, block); + bch_dump_bset(&b->keys, i, block); } printk(KERN_ERR "*** block %zu not written\n", @@ -239,76 +140,6 @@ out_put: bio_put(check); } -int __bch_count_data(struct btree *b) -{ - unsigned ret = 0; - struct btree_iter iter; - struct bkey *k; - - if (!b->level) - for_each_key(&b->keys, k, &iter) - ret += KEY_SIZE(k); - return ret; -} - -void __bch_check_keys(struct btree *b, const char *fmt, ...) -{ - va_list args; - struct bkey *k, *p = NULL; - struct btree_iter iter; - const char *err; - - for_each_key(&b->keys, k, &iter) { - if (!b->level) { - err = "Keys out of order"; - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) - goto bug; - - if (bch_ptr_invalid(&b->keys, k)) - continue; - - err = "Overlapping keys"; - if (p && bkey_cmp(p, &START_KEY(k)) > 0) - goto bug; - } else { - if (bch_ptr_bad(&b->keys, k)) - continue; - - err = "Duplicate keys"; - if (p && !bkey_cmp(p, k)) - goto bug; - } - p = k; - } - - err = "Key larger than btree node key"; - if (p && bkey_cmp(p, &b->key) > 0) - goto bug; - - return; -bug: - bch_dump_bucket(b); - - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - - panic("bcache error: %s:\n", err); -} - -void bch_btree_iter_next_check(struct btree_iter *iter) -{ -#if 0 - struct bkey *k = iter->data->k, *next = bkey_next(k); - - if (next < iter->data->end && - bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { - bch_dump_bucket(iter->b); - panic("Key skipped backwards\n"); - } -#endif -} - #endif #ifdef CONFIG_DEBUG_FS @@ -355,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (!w) break; - bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); + bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); bch_keybuf_del(&i->keys, w); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 08e116e74d3..1f63c195d24 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -1,19 +1,15 @@ #ifndef _BCACHE_DEBUG_H #define _BCACHE_DEBUG_H -/* Btree/bkey debug printing */ - -int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); +struct bio; +struct cached_dev; +struct cache_set; #ifdef CONFIG_BCACHE_DEBUG void bch_btree_verify(struct btree *); void bch_data_verify(struct cached_dev *, struct bio *); -int __bch_count_data(struct btree *); -void __bch_check_keys(struct btree *, const char *, ...); -void bch_btree_iter_next_check(struct btree_iter *); -#define EBUG_ON(cond) BUG_ON(cond) #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) #define bypass_torture_test(d) ((d)->bypass_torture_test) @@ -22,26 +18,13 @@ void bch_btree_iter_next_check(struct btree_iter *); static inline void bch_btree_verify(struct btree *b) {} static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} -static inline int __bch_count_data(struct btree *b) { return -1; } -static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} -static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} -#define EBUG_ON(cond) do { if (cond); } while (0) #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 #define bypass_torture_test(d) 0 #endif -#define bch_count_data(b) \ - (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) - -#define bch_check_keys(b, ...) \ -do { \ - if (expensive_debug_checks((b)->c)) \ - __bch_check_keys(b, __VA_ARGS__); \ -} while (0) - #ifdef CONFIG_DEBUG_FS void bch_debug_init_cache_set(struct cache_set *); #else diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index ba3021128e7..bc1c3eed07e 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -62,6 +62,87 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) return false; } +/* Common among btree and extent ptrs */ + +static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) +{ + unsigned i = 0; + char *out = buf, *end = buf + size; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + + for (i = 0; i < KEY_PTRS(k); i++) { + if (i) + p(", "); + + if (PTR_DEV(k, i) == PTR_CHECK_DEV) + p("check dev"); + else + p("%llu:%llu gen %llu", PTR_DEV(k, i), + PTR_OFFSET(k, i), PTR_GEN(k, i)); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p +} + +static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) +{ + struct btree *b = container_of(keys, struct btree, keys); + unsigned j; + char buf[80]; + + bch_extent_to_text(buf, sizeof(buf), k); + printk(" %s", buf); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + printk(" bucket %zu", n); + + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + printk(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + printk(" %s\n", bch_ptr_status(b->c, k)); +} + /* Btree ptrs */ bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) @@ -76,7 +157,7 @@ bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) return false; bad: - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); return true; } @@ -111,7 +192,7 @@ static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) return false; err: mutex_unlock(&b->c->bucket_lock); - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); btree_bug(b, "inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), @@ -145,6 +226,8 @@ const struct btree_keys_ops bch_btree_keys_ops = { .sort_cmp = bch_key_sort_cmp, .key_invalid = bch_btree_ptr_invalid, .key_bad = bch_btree_ptr_bad, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, }; /* Extents */ @@ -227,7 +310,7 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) return false; bad: - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); return true; } @@ -254,7 +337,7 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, return false; err: mutex_unlock(&b->c->bucket_lock); - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); btree_bug(b, "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), @@ -355,5 +438,7 @@ const struct btree_keys_ops bch_extent_keys_ops = { .key_invalid = bch_extent_invalid, .key_bad = bch_extent_bad, .key_merge = bch_extent_merge, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, .is_extents = true, }; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index e0c0b685d2f..e4e23409782 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -7,6 +7,7 @@ extern const struct btree_keys_ops bch_extent_keys_ops; struct bkey; struct cache_set; +void bch_extent_to_text(char *, size_t, const struct bkey *); bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); #endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6d6a7a15043..24a3a1546ca 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -384,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, break; } - bch_bkey_to_text(buf, sizeof(buf), k); + bch_extent_to_text(buf, sizeof(buf), k); pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 3ebaef5f645..ac7d0d1f70d 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -18,11 +18,13 @@ struct closure; #ifdef CONFIG_BCACHE_DEBUG +#define EBUG_ON(cond) BUG_ON(cond) #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) #else /* DEBUG */ +#define EBUG_ON(cond) do { if (cond); } while (0) #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) -- cgit v1.2.3-70-g09d2 From 89ebb4a28ba9efb5c9b18ba552e784021957b14a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 18:38:51 -0800 Subject: bcache: Convert sorting to btree_keys More work to disentangle various code from struct btree Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 50 ++++++++++++++++++++++------------------------- drivers/md/bcache/bset.h | 13 ++++++------ drivers/md/bcache/btree.c | 6 +++--- 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 448cff8cc05..2ff75f3199f 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -5,8 +5,10 @@ * Copyright 2012 Google, Inc. */ -#include "bcache.h" -#include "btree.h" +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ + +#include "util.h" +#include "bset.h" #include #include @@ -1150,31 +1152,27 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, bch_time_stats_update(&state->time, start_time); } -void bch_btree_sort_partial(struct btree *b, unsigned start, +void bch_btree_sort_partial(struct btree_keys *b, unsigned start, struct bset_sort_state *state) { - size_t order = b->keys.page_order, keys = 0; + size_t order = b->page_order, keys = 0; struct btree_iter iter; - int oldsize = bch_count_data(&b->keys); + int oldsize = bch_count_data(b); - __bch_btree_iter_init(&b->keys, &iter, NULL, &b->keys.set[start]); + __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned i; - for (i = start; i <= b->keys.nsets; i++) - keys += b->keys.set[i].data->keys; + for (i = start; i <= b->nsets; i++) + keys += b->set[i].data->keys; - order = roundup_pow_of_two(__set_bytes(b->keys.set->data, - keys)) / PAGE_SIZE; - if (order) - order = ilog2(order); + order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(&b->keys, &iter, start, order, false, state); + __btree_sort(b, &iter, start, order, false, state); - EBUG_ON(b->written && oldsize >= 0 && - bch_count_data(&b->keys) != oldsize); + EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } EXPORT_SYMBOL(bch_btree_sort_partial); @@ -1185,51 +1183,49 @@ void bch_btree_sort_and_fix_extents(struct btree_keys *b, __btree_sort(b, iter, 0, b->page_order, true, state); } -void bch_btree_sort_into(struct btree *b, struct btree *new, +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); struct btree_iter iter; - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_init(b, &iter, NULL); - btree_mergesort(&b->keys, new->keys.set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter, false, true); bch_time_stats_update(&state->time, start_time); - new->keys.set->size = 0; // XXX: why? + new->set->size = 0; // XXX: why? } #define SORT_CRIT (4096 / sizeof(uint64_t)) -void bch_btree_sort_lazy(struct btree *b, struct bset_sort_state *state) +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) { unsigned crit = SORT_CRIT; int i; - b->keys.last_set_unwritten = 0; - /* Don't sort if nothing to do */ - if (!b->keys.nsets) + if (!b->nsets) goto out; - for (i = b->keys.nsets - 1; i >= 0; --i) { + for (i = b->nsets - 1; i >= 0; --i) { crit *= state->crit_factor; - if (b->keys.set[i].data->keys < crit) { + if (b->set[i].data->keys < crit) { bch_btree_sort_partial(b, i, state); return; } } /* Sort if we'd overflow */ - if (b->keys.nsets + 1 == MAX_BSETS) { + if (b->nsets + 1 == MAX_BSETS) { bch_btree_sort(b, state); return; } out: - bch_bset_build_written_tree(&b->keys); + bch_bset_build_written_tree(b); } EXPORT_SYMBOL(bch_btree_sort_lazy); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index e01e69e0065..4aa199d0334 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,7 +1,9 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H -#include +#include +#include +#include #include "util.h" /* for time_stats */ @@ -144,7 +146,6 @@ * first key in that range of bytes again. */ -struct btree; struct btree_keys; struct btree_iter; struct btree_iter_set; @@ -353,15 +354,15 @@ struct bset_sort_state { void bch_bset_sort_state_free(struct bset_sort_state *); int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); -void bch_btree_sort_lazy(struct btree *, struct bset_sort_state *); -void bch_btree_sort_into(struct btree *, struct btree *, +void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); +void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, struct bset_sort_state *); void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, struct bset_sort_state *); -void bch_btree_sort_partial(struct btree *, unsigned, +void bch_btree_sort_partial(struct btree_keys *, unsigned, struct bset_sort_state *); -static inline void bch_btree_sort(struct btree *b, +static inline void bch_btree_sort(struct btree_keys *b, struct bset_sort_state *state) { bch_btree_sort_partial(b, 0, state); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 2128ee1e691..b14f34aa927 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -480,9 +480,9 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) /* If not a leaf node, always sort */ if (b->level && b->keys.nsets) - bch_btree_sort(b, &b->c->sort); + bch_btree_sort(&b->keys, &b->c->sort); else - bch_btree_sort_lazy(b, &b->c->sort); + bch_btree_sort_lazy(&b->keys, &b->c->sort); /* * do verify if there was more than one set initially (i.e. we did a @@ -1087,7 +1087,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) { struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); if (!IS_ERR_OR_NULL(n)) { - bch_btree_sort_into(b, n, &b->c->sort); + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bkey_copy_key(&n->key, &b->key); } -- cgit v1.2.3-70-g09d2 From 829a60b9055c319f3656a01eb8cb78b1b86232ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 17:02:31 -0800 Subject: bcache: Move insert_fixup() to btree_keys_ops Now handling overlapping extents/keys is a method that's specific to what the btree node contains. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 50 +++++++++ drivers/md/bcache/bset.h | 12 +++ drivers/md/bcache/btree.c | 246 +++----------------------------------------- drivers/md/bcache/extents.c | 178 ++++++++++++++++++++++++++++++++ 4 files changed, 257 insertions(+), 229 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 2ff75f3199f..4a711323603 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -676,6 +676,8 @@ void bch_bset_build_written_tree(struct btree_keys *b) } EXPORT_SYMBOL(bch_bset_build_written_tree); +/* Insert */ + void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) { struct bset_tree *t; @@ -791,6 +793,54 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, } EXPORT_SYMBOL(bch_bset_insert); +unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned status = BTREE_INSERT_STATUS_NO_INSERT; + struct bset *i = bset_tree_last(b)->data; + struct bkey *m, *prev = NULL; + struct btree_iter iter; + + BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); + + m = bch_btree_iter_init(b, &iter, b->ops->is_extents + ? PRECEDING_KEY(&START_KEY(k)) + : PRECEDING_KEY(k)); + + if (b->ops->insert_fixup(b, k, &iter, replace_key)) + return status; + + status = BTREE_INSERT_STATUS_INSERT; + + while (m != bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) + prev = m, m = bkey_next(m); + + /* prev is in the tree, if we merge we're done */ + status = BTREE_INSERT_STATUS_BACK_MERGE; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; +#if 0 + status = BTREE_INSERT_STATUS_OVERWROTE; + if (m != bset_bkey_last(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; +#endif + status = BTREE_INSERT_STATUS_FRONT_MERGE; + if (m != bset_bkey_last(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + + bch_bset_insert(b, m, k); +copy: bkey_copy(m, k); +merged: + return status; +} +EXPORT_SYMBOL(bch_btree_insert_key); + +/* Lookup */ + struct bset_search_iter { struct bkey *l, *r; }; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 4aa199d0334..759df830bb1 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -189,6 +189,8 @@ struct btree_keys_ops { bool (*sort_cmp)(struct btree_iter_set, struct btree_iter_set); struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); + bool (*insert_fixup)(struct btree_keys *, struct bkey *, + struct btree_iter *, struct bkey *); bool (*key_invalid)(struct btree_keys *, const struct bkey *); bool (*key_bad)(struct btree_keys *, const struct bkey *); @@ -286,6 +288,16 @@ void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); void bch_bset_build_written_tree(struct btree_keys *); void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); +unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, + struct bkey *); + +enum { + BTREE_INSERT_STATUS_NO_INSERT = 0, + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; /* * Tries to merge l and r: l should be lower than r diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b14f34aa927..463d2800a95 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -24,7 +24,6 @@ #include "btree.h" #include "debug.h" #include "extents.h" -#include "writeback.h" #include #include @@ -90,13 +89,6 @@ * Test module load/unload */ -enum { - BTREE_INSERT_STATUS_INSERT, - BTREE_INSERT_STATUS_BACK_MERGE, - BTREE_INSERT_STATUS_OVERWROTE, - BTREE_INSERT_STATUS_FRONT_MERGE, -}; - #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 @@ -1792,230 +1784,23 @@ err: /* Btree insertion */ -static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, - struct btree_iter *iter, - struct bkey *replace_key) +static bool btree_insert_key(struct btree *b, struct bkey *k, + struct bkey *replace_key) { - void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) - { - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - offset, -sectors); - } - - uint64_t old_offset; - unsigned old_size, sectors_found = 0; - - while (1) { - struct bkey *k = bch_btree_iter_next(iter); - if (!k) - break; - - if (bkey_cmp(&START_KEY(k), insert) >= 0) { - if (KEY_SIZE(k)) - break; - else - continue; - } - - if (bkey_cmp(k, &START_KEY(insert)) <= 0) - continue; - - old_offset = KEY_START(k); - old_size = KEY_SIZE(k); - - /* - * We might overlap with 0 size extents; we can't skip these - * because if they're in the set we're inserting to we have to - * adjust them so they don't overlap with the key we're - * inserting. But we don't want to check them for replace - * operations. - */ - - if (replace_key && KEY_SIZE(k)) { - /* - * k might have been split since we inserted/found the - * key we're replacing - */ - unsigned i; - uint64_t offset = KEY_START(k) - - KEY_START(replace_key); - - /* But it must be a subset of the replace key */ - if (KEY_START(k) < KEY_START(replace_key) || - KEY_OFFSET(k) > KEY_OFFSET(replace_key)) - goto check_failed; - - /* We didn't find a key that we were supposed to */ - if (KEY_START(k) > KEY_START(insert) + sectors_found) - goto check_failed; - - if (KEY_PTRS(k) != KEY_PTRS(replace_key) || - KEY_DIRTY(k) != KEY_DIRTY(replace_key)) - goto check_failed; - - /* skip past gen */ - offset <<= 8; - - BUG_ON(!KEY_PTRS(replace_key)); - - for (i = 0; i < KEY_PTRS(replace_key); i++) - if (k->ptr[i] != replace_key->ptr[i] + offset) - goto check_failed; - - sectors_found = KEY_OFFSET(k) - KEY_START(insert); - } - - if (bkey_cmp(insert, k) < 0 && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { - /* - * We overlapped in the middle of an existing key: that - * means we have to split the old key. But we have to do - * slightly different things depending on whether the - * old key has been written out yet. - */ - - struct bkey *top; - - subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); - - if (bkey_written(&b->keys, k)) { - /* - * We insert a new key to cover the top of the - * old key, and the old key is modified in place - * to represent the bottom split. - * - * It's completely arbitrary whether the new key - * is the top or the bottom, but it has to match - * up with what btree_sort_fixup() does - it - * doesn't check for this kind of overlap, it - * depends on us inserting a new key for the top - * here. - */ - top = bch_bset_search(&b->keys, - bset_tree_last(&b->keys), - insert); - bch_bset_insert(&b->keys, top, k); - } else { - BKEY_PADDED(key) temp; - bkey_copy(&temp.key, k); - bch_bset_insert(&b->keys, k, &temp.key); - top = bkey_next(k); - } - - bch_cut_front(insert, top); - bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(&b->keys, k); - return false; - } - - if (bkey_cmp(insert, k) < 0) { - bch_cut_front(insert, k); - } else { - if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) - old_offset = KEY_START(insert); - - if (bkey_written(&b->keys, k) && - bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { - /* - * Completely overwrote, so we don't have to - * invalidate the binary search tree - */ - bch_cut_front(k, k); - } else { - __bch_cut_back(&START_KEY(insert), k); - bch_bset_fix_invalidated_key(&b->keys, k); - } - } - - subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); - } - -check_failed: - if (replace_key) { - if (!sectors_found) { - return true; - } else if (sectors_found < KEY_SIZE(insert)) { - SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - - (KEY_SIZE(insert) - sectors_found)); - SET_KEY_SIZE(insert, sectors_found); - } - } - - return false; -} - -static bool btree_insert_key(struct btree *b, struct btree_op *op, - struct bkey *k, struct bkey *replace_key) -{ - struct bset *i = btree_bset_last(b); - struct bkey *m, *prev; - unsigned status = BTREE_INSERT_STATUS_INSERT; + unsigned status; BUG_ON(bkey_cmp(k, &b->key) > 0); - BUG_ON(b->level && !KEY_PTRS(k)); - BUG_ON(!b->level && !KEY_OFFSET(k)); - if (!b->level) { - struct btree_iter iter; + status = bch_btree_insert_key(&b->keys, k, replace_key); + if (status != BTREE_INSERT_STATUS_NO_INSERT) { + bch_check_keys(&b->keys, "%u for %s", status, + replace_key ? "replace" : "insert"); - /* - * bset_search() returns the first key that is strictly greater - * than the search key - but for back merging, we want to find - * the previous key. - */ - prev = NULL; - m = bch_btree_iter_init(&b->keys, &iter, - PRECEDING_KEY(&START_KEY(k))); - - if (fix_overlapping_extents(b, k, &iter, replace_key)) { - op->insert_collision = true; - return false; - } - - if (KEY_DIRTY(k)) - bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), - KEY_START(k), KEY_SIZE(k)); - - while (m != bset_bkey_last(i) && - bkey_cmp(k, &START_KEY(m)) > 0) - prev = m, m = bkey_next(m); - - if (key_merging_disabled(b->c)) - goto insert; - - /* prev is in the tree, if we merge we're done */ - status = BTREE_INSERT_STATUS_BACK_MERGE; - if (prev && - bch_bkey_try_merge(&b->keys, prev, k)) - goto merged; - - status = BTREE_INSERT_STATUS_OVERWROTE; - if (m != bset_bkey_last(i) && - KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) - goto copy; - - status = BTREE_INSERT_STATUS_FRONT_MERGE; - if (m != bset_bkey_last(i) && - bch_bkey_try_merge(&b->keys, k, m)) - goto copy; - } else { - BUG_ON(replace_key); - m = bch_bset_search(&b->keys, bset_tree_last(&b->keys), k); - } - -insert: bch_bset_insert(&b->keys, m, k); -copy: bkey_copy(m, k); -merged: - bch_check_keys(&b->keys, "%u for %s", status, - replace_key ? "replace" : "insert"); - - if (b->level && !KEY_OFFSET(k)) - btree_current_write(b)->prio_blocked++; - - trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); - - return true; + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); + return true; + } else + return false; } static size_t insert_u64s_remaining(struct btree *b) @@ -2048,7 +1833,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, if (!b->level) bkey_put(b->c, k); - ret |= btree_insert_key(b, op, k, replace_key); + ret |= btree_insert_key(b, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { BKEY_PADDED(key) temp; @@ -2057,13 +1842,16 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, bch_cut_back(&b->key, &temp.key); bch_cut_front(&b->key, insert_keys->keys); - ret |= btree_insert_key(b, op, &temp.key, replace_key); + ret |= btree_insert_key(b, &temp.key, replace_key); break; } else { break; } } + if (!ret) + op->insert_collision = true; + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); BUG_ON(bch_count_data(&b->keys) < oldsize); diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index bc1c3eed07e..d6de3c76a87 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -222,8 +222,22 @@ static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) return false; } +static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct btree *b = container_of(bk, struct btree, keys); + + if (!KEY_OFFSET(insert)) + btree_current_write(b)->prio_blocked++; + + return false; +} + const struct btree_keys_ops bch_btree_keys_ops = { .sort_cmp = bch_key_sort_cmp, + .insert_fixup = bch_btree_ptr_insert_fixup, .key_invalid = bch_btree_ptr_invalid, .key_bad = bch_btree_ptr_bad, .key_to_text = bch_extent_to_text, @@ -294,6 +308,169 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, return NULL; } +static bool bch_extent_insert_fixup(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct cache_set *c = container_of(b, struct btree, keys)->c; + + void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) + { + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), + offset, -sectors); + } + + uint64_t old_offset; + unsigned old_size, sectors_found = 0; + + BUG_ON(!KEY_OFFSET(insert)); + BUG_ON(!KEY_SIZE(insert)); + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + if (!k) + break; + + if (bkey_cmp(&START_KEY(k), insert) >= 0) { + if (KEY_SIZE(k)) + break; + else + continue; + } + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_offset = KEY_START(k); + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for replace + * operations. + */ + + if (replace_key && KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned i; + uint64_t offset = KEY_START(k) - + KEY_START(replace_key); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (KEY_PTRS(k) != KEY_PTRS(replace_key) || + KEY_DIRTY(k) != KEY_DIRTY(replace_key)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(replace_key)); + + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. But we have to do + * slightly different things depending on whether the + * old key has been written out yet. + */ + + struct bkey *top; + + subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. + */ + top = bch_bset_search(b, bset_tree_last(b), + insert); + bch_bset_insert(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + bch_bset_insert(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + goto out; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); + } + +check_failed: + if (replace_key) { + if (!sectors_found) { + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } +out: + if (KEY_DIRTY(insert)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), + KEY_START(insert), + KEY_SIZE(insert)); + + return false; +} + static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); @@ -435,6 +612,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey const struct btree_keys_ops bch_extent_keys_ops = { .sort_cmp = bch_extent_sort_cmp, .sort_fixup = bch_extent_sort_fixup, + .insert_fixup = bch_extent_insert_fixup, .key_invalid = bch_extent_invalid, .key_bad = bch_extent_bad, .key_merge = bch_extent_merge, -- cgit v1.2.3-70-g09d2 From 0f49cf3d83fbf038534c9302095b66b07b9838c3 Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Mon, 14 Oct 2013 18:53:16 -0700 Subject: bcache: update bch_bkey_try_merge Added generic header checks to bch_bkey_try_merge, which then calls the bkey specific function Removed extraneous checks from bch_extent_merge Signed-off-by: Nicholas Swenson --- drivers/md/bcache/bset.c | 27 +++++++++++++++++++++++++++ drivers/md/bcache/bset.h | 12 +----------- drivers/md/bcache/extents.c | 5 ----- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 4a711323603..7f8a7bd2150 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -773,6 +773,33 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, } } +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. + */ +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) +{ + if (!b->ops->key_merge) + return false; + + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + if (KEY_U64s(l) != KEY_U64s(r) || + KEY_DELETED(l) != KEY_DELETED(r) || + KEY_CACHED(l) != KEY_CACHED(r) || + KEY_VERSION(l) != KEY_VERSION(r) || + KEY_CSUM(l) != KEY_CSUM(r) || + bkey_cmp(l, &START_KEY(r))) + return false; + + return b->ops->key_merge(b, l, r); +} +EXPORT_SYMBOL(bch_bkey_try_merge); + void bch_bset_insert(struct btree_keys *b, struct bkey *where, struct bkey *insert) { diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 759df830bb1..487373057c0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -287,6 +287,7 @@ void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); void bch_bset_build_written_tree(struct btree_keys *); void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); +bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, struct bkey *); @@ -299,17 +300,6 @@ enum { BTREE_INSERT_STATUS_FRONT_MERGE, }; -/* - * Tries to merge l and r: l should be lower than r - * Returns true if we were able to merge. If we did merge, l will be the merged - * key, r will be untouched. - */ -static inline bool bch_bkey_try_merge(struct btree_keys *b, - struct bkey *l, struct bkey *r) -{ - return b->ops->key_merge ? b->ops->key_merge(b, l, r) : false; -} - /* Btree key iteration */ struct btree_iter { diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index d6de3c76a87..7d73d862552 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -575,11 +575,6 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey if (key_merging_disabled(b->c)) return false; - if (KEY_PTRS(l) != KEY_PTRS(r) || - KEY_DIRTY(l) != KEY_DIRTY(r) || - bkey_cmp(l, &START_KEY(r))) - return false; - for (i = 0; i < KEY_PTRS(l); i++) if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) -- cgit v1.2.3-70-g09d2 From 3bdad1e40d11aad31f2322f21e943c31ef20d9da Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Mon, 11 Nov 2013 19:36:25 -0800 Subject: bcache: Add bch_bkey_equal_header() Checks if two keys have equivalent header fields. (good enough for replacement or merging) Used in bch_bkey_try_merge, and replacing a key in the btree. Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 8 ++------ drivers/md/bcache/bset.h | 8 ++++++++ drivers/md/bcache/extents.c | 3 +-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 7f8a7bd2150..f990403c4f1 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -788,12 +788,8 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) * Assumes left and right are in order * Left and right must be exactly aligned */ - if (KEY_U64s(l) != KEY_U64s(r) || - KEY_DELETED(l) != KEY_DELETED(r) || - KEY_CACHED(l) != KEY_CACHED(r) || - KEY_VERSION(l) != KEY_VERSION(r) || - KEY_CSUM(l) != KEY_CSUM(r) || - bkey_cmp(l, &START_KEY(r))) + if (!bch_bkey_equal_header(l, r) || + bkey_cmp(l, &START_KEY(r))) return false; return b->ops->key_merge(b, l, r); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 487373057c0..003260f4ddf 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -448,6 +448,14 @@ static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, return b->ops->key_to_text(buf, size, k); } +static inline bool bch_bkey_equal_header(const struct bkey *l, + const struct bkey *r) +{ + return (KEY_DIRTY(l) == KEY_DIRTY(r) && + KEY_PTRS(l) == KEY_PTRS(r) && + KEY_CSUM(l) == KEY_CSUM(l)); +} + /* Keylists */ struct keylist { diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 7d73d862552..c3ead586dc2 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -372,8 +372,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, if (KEY_START(k) > KEY_START(insert) + sectors_found) goto check_failed; - if (KEY_PTRS(k) != KEY_PTRS(replace_key) || - KEY_DIRTY(k) != KEY_DIRTY(replace_key)) + if (!bch_bkey_equal_header(k, replace_key)) goto check_failed; /* skip past gen */ -- cgit v1.2.3-70-g09d2 From e0a985a4b1b533311ec88c85177c45d036313f75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Nov 2013 13:49:10 -0800 Subject: bcache: Improve bucket_prio() calculation When deciding what order to reuse buckets we take into account both the bucket's priority (which indicates lru order) and also the amount of live data in that bucket. The way they were scaled together wasn't as correct as it could be... this patch improves and documents it. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 17 +++++++++++++++-- drivers/md/bcache/bcache.h | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index bcfd96e2121..c0d37d08244 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -168,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b) fifo_push(&ca->free_inc, b - ca->buckets); } -#define bucket_prio(b) \ - (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) +/* + * Determines what order we're going to reuse buckets, smallest bucket_prio() + * first: we also take into account the number of sectors of live data in that + * bucket, and in order for that multiply to make sense we have to scale bucket + * + * Thus, we scale the bucket priorities so that the bucket with the smallest + * prio is worth 1/8th of what INITIAL_PRIO is worth. + */ + +#define bucket_prio(b) \ +({ \ + unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 93b84841966..0c707e4f4ea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -704,7 +704,7 @@ struct bbio { }; #define BTREE_PRIO USHRT_MAX -#define INITIAL_PRIO 32768 +#define INITIAL_PRIO 32768U #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ -- cgit v1.2.3-70-g09d2 From 3b3e9e50dd951725130645660b526c4f367dcdee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Dec 2013 03:57:58 -0800 Subject: bcache: Don't return -EINTR when insert finished We need to return -EINTR after a split because we invalidated iterators (and freed the btree node) - but if we were finished inserting, we don't want to redo the traversal. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 463d2800a95..98cc0a810a3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2002,8 +2002,10 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, return -EINTR; } else { /* Invalidated all iterators */ - return btree_split(b, op, insert_keys, replace_key) ?: - -EINTR; + int ret = btree_split(b, op, insert_keys, replace_key); + + return bch_keylist_empty(insert_keys) ? + 0 : ret ?: -EINTR; } } else { BUG_ON(write_block(b) != btree_bset_last(b)); -- cgit v1.2.3-70-g09d2 From 9dd6358a21daf4fc6a5b2b779267a62f0d1d3181 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Dec 2013 03:11:06 -0800 Subject: bcache: Fix auxiliary search trees for key size > cacheline size Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f990403c4f1..4f6b5940e60 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -511,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) return ((void *) k - (void *) t->data) / BSET_CACHELINE; } -static unsigned bkey_to_cacheline_offset(struct bkey *k) +static unsigned bkey_to_cacheline_offset(struct bset_tree *t, + unsigned cacheline, + struct bkey *k) { - return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); + return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); } static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) @@ -608,7 +610,7 @@ static void bch_bset_build_unwritten_tree(struct btree_keys *b) bset_alloc_tree(b, t); if (t->tree != b->set->tree + btree_keys_cachelines(b)) { - t->prev[0] = bkey_to_cacheline_offset(t->data->start); + t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); t->size = 1; } } @@ -632,7 +634,7 @@ EXPORT_SYMBOL(bch_bset_init_next); void bch_bset_build_written_tree(struct btree_keys *b) { struct bset_tree *t = bset_tree_last(b); - struct bkey *k = t->data->start; + struct bkey *prev = NULL, *k = t->data->start; unsigned j, cacheline = 1; b->last_set_unwritten = 0; @@ -654,13 +656,11 @@ void bch_bset_build_written_tree(struct btree_keys *b) for (j = inorder_next(0, t->size); j; j = inorder_next(j, t->size)) { - while (bkey_to_cacheline(t, k) != cacheline) - k = bkey_next(k); + while (bkey_to_cacheline(t, k) < cacheline) + prev = k, k = bkey_next(k); - t->prev[j] = bkey_u64s(k); - k = bkey_next(k); - cacheline++; - t->tree[j].m = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_u64s(prev); + t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); } while (bkey_next(k) != bset_bkey_last(t->data)) @@ -739,8 +739,8 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, * lookup table for the first key that is strictly greater than k: * it's either k's cacheline or the next one */ - if (j < t->size && - table_to_bkey(t, j) <= k) + while (j < t->size && + table_to_bkey(t, j) <= k) j++; /* Adjust all the lookup table entries, and find a new key for any that @@ -755,7 +755,7 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, while (k < cacheline_to_bkey(t, j, 0)) k = bkey_next(k); - t->prev[j] = bkey_to_cacheline_offset(k); + t->prev[j] = bkey_to_cacheline_offset(t, j, k); } } @@ -768,7 +768,7 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, k != bset_bkey_last(t->data); k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) { - t->prev[t->size] = bkey_to_cacheline_offset(k); + t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); t->size++; } } -- cgit v1.2.3-70-g09d2 From 7b7b68bba5ef23734c35ffb0d8d82079ed604d33 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 10 Jan 2014 02:08:13 +0100 Subject: floppy: bail out in open() if drive is not responding to block0 read In case reading of block 0 during open() fails, it is not the right thing to let open() succeed. Fix this by introducing FD_OPEN_SHOULD_FAIL_BIT flag, and setting it in case the bio callback encounters an error while trying to read block 0. As a bonus, this works around certain broken userspace (blkid), which is not able to properly handle read()s returning IO errors. Hence be nice to those, and bail out during open() already; if block 0 is not readable, read()s are not going to provide any meaningful data anyway. Signed-off-by: Jiri Kosina --- drivers/block/floppy.c | 36 +++++++++++++++++++++++++++--------- include/uapi/linux/fd.h | 3 ++- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 6b29c442282..2023043ce7c 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3691,9 +3691,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (!(mode & FMODE_NDELAY)) { if (mode & (FMODE_READ|FMODE_WRITE)) { UDRS->last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); check_disk_change(bdev); if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) + goto out; } res = -EROFS; if ((mode & FMODE_WRITE) && @@ -3746,17 +3749,29 @@ static unsigned int floppy_check_events(struct gendisk *disk, * a disk in the drive, and whether that disk is writable. */ -static void floppy_rb0_complete(struct bio *bio, int err) +struct rb0_cbdata { + int drive; + struct completion complete; +}; + +static void floppy_rb0_cb(struct bio *bio, int err) { - complete((struct completion *)bio->bi_private); + struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; + int drive = cbdata->drive; + + if (err) { + pr_info("floppy: error %d while reading block 0", err); + set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); + } + complete(&cbdata->complete); } -static int __floppy_read_block_0(struct block_device *bdev) +static int __floppy_read_block_0(struct block_device *bdev, int drive) { struct bio bio; struct bio_vec bio_vec; - struct completion complete; struct page *page; + struct rb0_cbdata cbdata; size_t size; page = alloc_page(GFP_NOIO); @@ -3769,6 +3784,8 @@ static int __floppy_read_block_0(struct block_device *bdev) if (!size) size = 1024; + cbdata.drive = drive; + bio_init(&bio); bio.bi_io_vec = &bio_vec; bio_vec.bv_page = page; @@ -3779,13 +3796,14 @@ static int __floppy_read_block_0(struct block_device *bdev) bio.bi_bdev = bdev; bio.bi_iter.bi_sector = 0; bio.bi_flags = (1 << BIO_QUIET); - init_completion(&complete); - bio.bi_private = &complete; - bio.bi_end_io = floppy_rb0_complete; + bio.bi_private = &cbdata; + bio.bi_end_io = floppy_rb0_cb; submit_bio(READ, &bio); process_fd_request(); - wait_for_completion(&complete); + + init_completion(&cbdata.complete); + wait_for_completion(&cbdata.complete); __free_page(page); @@ -3827,7 +3845,7 @@ static int floppy_revalidate(struct gendisk *disk) UDRS->generation++; if (drive_no_geom(drive)) { /* auto-sensing */ - res = __floppy_read_block_0(opened_bdev[drive]); + res = __floppy_read_block_0(opened_bdev[drive], drive); } else { if (cf) poll_drive(false, FD_RAW_NEED_DISK); diff --git a/include/uapi/linux/fd.h b/include/uapi/linux/fd.h index f1f3dd5981b..84c517cbce9 100644 --- a/include/uapi/linux/fd.h +++ b/include/uapi/linux/fd.h @@ -185,7 +185,8 @@ enum { * to clear media change status */ FD_UNUSED_BIT, FD_DISK_CHANGED_BIT, /* disk has been changed since last i/o */ - FD_DISK_WRITABLE_BIT /* disk is writable */ + FD_DISK_WRITABLE_BIT, /* disk is writable */ + FD_OPEN_SHOULD_FAIL_BIT }; #define FDSETDRVPRM _IOW(2, 0x90, struct floppy_drive_params) -- cgit v1.2.3-70-g09d2 From 28e6c50069ae55ff15da1e64245286cc27901b96 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Tue, 21 Jan 2014 14:39:20 -0800 Subject: drivers/block/sx8.c: use module_pci_driver() Use module_pci_driver() macro which makes the code smaller and simpler. Signed-off-by: Jingoo Han Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 3fb6ab4c8b4..7b2ac957398 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1747,17 +1747,4 @@ static void carm_remove_one (struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); } -static int __init carm_init(void) -{ - return pci_register_driver(&carm_driver); -} - -static void __exit carm_exit(void) -{ - pci_unregister_driver(&carm_driver); -} - -module_init(carm_init); -module_exit(carm_exit); - - +module_pci_driver(carm_driver); -- cgit v1.2.3-70-g09d2 From f2fc8af41c351d11dcdd863356ff31fccf86cfed Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Tue, 21 Jan 2014 14:39:21 -0800 Subject: drivers/block/sx8.c: remove unnecessary pci_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure. Thus, it is not needed to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 7b2ac957398..d5e2d12b9d9 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1744,7 +1744,6 @@ static void carm_remove_one (struct pci_dev *pdev) kfree(host); pci_release_regions(pdev); pci_disable_device(pdev); - pci_set_drvdata(pdev, NULL); } module_pci_driver(carm_driver); -- cgit v1.2.3-70-g09d2 From 3f7d758b1e3c18360f955d1d5e224865c5e0b881 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 21 Jan 2014 14:39:22 -0800 Subject: drivers/block/paride/pg.c: underflow bug in pg_write() The test here can underflow so we pass bogus lengths to the hardware. It's a static checker fix and I don't know the impact. Signed-off-by: Dan Carpenter Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/paride/pg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index 4a27b1de5fc..2ce3dfd7e6b 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -581,7 +581,7 @@ static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count, if (hdr.magic != PG_MAGIC) return -EINVAL; - if (hdr.dlen > PG_MAX_DATA) + if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA) return -EINVAL; if ((count - hs) > PG_MAX_DATA) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 4336548af4a2e73b4d786266e520e4e595718665 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jan 2014 14:39:23 -0800 Subject: drivers/block/cciss.c:cciss_init_one(): use proper errnos pci_driver.probe should return a meaningful errno, not -1. Cc: Jens Axboe Cc: Stephen M. Cameron Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index b35fc4f5237..036e8ab86c7 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -5004,7 +5004,7 @@ reinit_after_soft_reset: i = alloc_cciss_hba(pdev); if (i < 0) - return -1; + return -ENOMEM; h = hba[i]; h->pdev = pdev; @@ -5205,7 +5205,7 @@ clean_no_release_regions: */ pci_set_drvdata(pdev, NULL); free_hba(h); - return -1; + return -ENODEV; } static void cciss_shutdown(struct pci_dev *pdev) -- cgit v1.2.3-70-g09d2 From 12a64d2f5ee1108c77f4468fcccf730d08b290e0 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 21 Jan 2014 14:39:24 -0800 Subject: drivers/block/loop.c: fix comment typo in loop_config_discard Discard requests are ignored if the encryption is enabled for the given loop device. Update comment to match the code, and similar comments elsewhere in the file. Signed-off-by: Olaf Hering Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 33fde3a3975..66e8c3b94ef 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -799,7 +799,7 @@ static void loop_config_discard(struct loop_device *lo) /* * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do support discard if + * image a.k.a. discard. However we do not support discard if * encryption is enabled, because it may give an attacker * useful information. */ -- cgit v1.2.3-70-g09d2 From 188b9f49d4b156b0c07ee05bf90ab7b290077d0f Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Wed, 15 Jan 2014 10:14:57 -0800 Subject: mtip32xx: Make SGL container per-command to eliminate high order dma allocation The mtip32xx driver makes a high order dma memory allocation to store a command index table, some dedicated buffers, and a command header & SGL blob. This allocation can fail with a surprise insert under low & fragmented memory conditions. This patch breaks these regions up into separate low order allocations and increases the maximum number of segments a single command SGL can have. We wanted to allow at least 256 segments for 1 MB direct IO. Since the command header occupies the first 0x80 bytes of the SGL blob, that meant we needed two 4k pages to contain the header and SGL. The two pages allow up to 504 SGL segments. Signed-off-by: Sam Bradshaw Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 234 +++++++++++++++++++++++--------------- drivers/block/mtip32xx/mtip32xx.h | 12 +- 2 files changed, 149 insertions(+), 97 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 52b2f2a7147..966b4371a30 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -41,10 +41,31 @@ #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) -#define HW_CMD_TBL_SZ (AHCI_CMD_TBL_HDR_SZ + (MTIP_MAX_SG * 16)) -#define HW_CMD_TBL_AR_SZ (HW_CMD_TBL_SZ * MTIP_MAX_COMMAND_SLOTS) -#define HW_PORT_PRIV_DMA_SZ \ - (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ) + +/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */ +#define AHCI_RX_FIS_SZ 0x100 +#define AHCI_RX_FIS_OFFSET 0x0 +#define AHCI_IDFY_SZ ATA_SECT_SIZE +#define AHCI_IDFY_OFFSET 0x400 +#define AHCI_SECTBUF_SZ ATA_SECT_SIZE +#define AHCI_SECTBUF_OFFSET 0x800 +#define AHCI_SMARTBUF_SZ ATA_SECT_SIZE +#define AHCI_SMARTBUF_OFFSET 0xC00 +/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */ +#define BLOCK_DMA_ALLOC_SZ 4096 + +/* DMA region containing command table (should be 8192 bytes) */ +#define AHCI_CMD_SLOT_SZ sizeof(struct mtip_cmd_hdr) +#define AHCI_CMD_TBL_SZ (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ) +#define AHCI_CMD_TBL_OFFSET 0x0 + +/* DMA region per command (contains header and SGL) */ +#define AHCI_CMD_TBL_HDR_SZ 0x80 +#define AHCI_CMD_TBL_HDR_OFFSET 0x0 +#define AHCI_CMD_TBL_SGL_SZ (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg)) +#define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ +#define CMD_DMA_ALLOC_SZ (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ) + #define HOST_CAP_NZDMA (1 << 19) #define HOST_HSORG 0xFC @@ -3312,6 +3333,118 @@ st_out: return 0; } +/* + * DMA region teardown + * + * @dd Pointer to driver_data structure + * + * return value + * None + */ +static void mtip_dma_free(struct driver_data *dd) +{ + int i; + struct mtip_port *port = dd->port; + + if (port->block1) + dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + port->block1, port->block1_dma); + + if (port->command_list) { + dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, + port->command_list, port->command_list_dma); + } + + for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { + if (port->commands[i].command) + dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + port->commands[i].command, + port->commands[i].command_dma); + } +} + +/* + * DMA region setup + * + * @dd Pointer to driver_data structure + * + * return value + * -ENOMEM Not enough free DMA region space to initialize driver + */ +static int mtip_dma_alloc(struct driver_data *dd) +{ + struct mtip_port *port = dd->port; + int i, rv = 0; + u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; + + /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ + port->block1 = + dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + &port->block1_dma, GFP_KERNEL); + if (!port->block1) + return -ENOMEM; + memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ); + + /* Allocate dma memory for command list */ + port->command_list = + dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, + &port->command_list_dma, GFP_KERNEL); + if (!port->command_list) { + dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + port->block1, port->block1_dma); + port->block1 = NULL; + port->block1_dma = 0; + return -ENOMEM; + } + memset(port->command_list, 0, AHCI_CMD_TBL_SZ); + + /* Setup all pointers into first DMA region */ + port->rxfis = port->block1 + AHCI_RX_FIS_OFFSET; + port->rxfis_dma = port->block1_dma + AHCI_RX_FIS_OFFSET; + port->identify = port->block1 + AHCI_IDFY_OFFSET; + port->identify_dma = port->block1_dma + AHCI_IDFY_OFFSET; + port->log_buf = port->block1 + AHCI_SECTBUF_OFFSET; + port->log_buf_dma = port->block1_dma + AHCI_SECTBUF_OFFSET; + port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET; + port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; + + /* Setup per command SGL DMA region */ + + /* Point the command headers at the command tables */ + for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) { + port->commands[i].command = + dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + &port->commands[i].command_dma, GFP_KERNEL); + if (!port->commands[i].command) { + rv = -ENOMEM; + mtip_dma_free(dd); + return rv; + } + memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ); + + port->commands[i].command_header = port->command_list + + (sizeof(struct mtip_cmd_hdr) * i); + port->commands[i].command_header_dma = + dd->port->command_list_dma + + (sizeof(struct mtip_cmd_hdr) * i); + + if (host_cap_64) + port->commands[i].command_header->ctbau = + __force_bit2int cpu_to_le32( + (port->commands[i].command_dma >> 16) >> 16); + + port->commands[i].command_header->ctba = + __force_bit2int cpu_to_le32( + port->commands[i].command_dma & 0xFFFFFFFF); + + sg_init_table(port->commands[i].sg, MTIP_MAX_SG); + + /* Mark command as currently inactive */ + atomic_set(&dd->port->commands[i].active, 0); + } + return 0; +} + /* * Called once for each card. * @@ -3370,83 +3503,10 @@ static int mtip_hw_init(struct driver_data *dd) dd->port->mmio = dd->mmio + PORT_OFFSET; dd->port->dd = dd; - /* Allocate memory for the command list. */ - dd->port->command_list = - dmam_alloc_coherent(&dd->pdev->dev, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), - &dd->port->command_list_dma, - GFP_KERNEL); - if (!dd->port->command_list) { - dev_err(&dd->pdev->dev, - "Memory allocation: command list\n"); - rv = -ENOMEM; + /* DMA allocations */ + rv = mtip_dma_alloc(dd); + if (rv < 0) goto out1; - } - - /* Clear the memory we have allocated. */ - memset(dd->port->command_list, - 0, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4)); - - /* Setup the addresse of the RX FIS. */ - dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ; - dd->port->rxfis_dma = dd->port->command_list_dma + HW_CMD_SLOT_SZ; - - /* Setup the address of the command tables. */ - dd->port->command_table = dd->port->rxfis + AHCI_RX_FIS_SZ; - dd->port->command_tbl_dma = dd->port->rxfis_dma + AHCI_RX_FIS_SZ; - - /* Setup the address of the identify data. */ - dd->port->identify = dd->port->command_table + - HW_CMD_TBL_AR_SZ; - dd->port->identify_dma = dd->port->command_tbl_dma + - HW_CMD_TBL_AR_SZ; - - /* Setup the address of the sector buffer - for some non-ncq cmds */ - dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE; - dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE; - - /* Setup the address of the log buf - for read log command */ - dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE; - dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE; - - /* Setup the address of the smart buf - for smart read data command */ - dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE; - dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE; - - - /* Point the command headers at the command tables. */ - for (i = 0; i < num_command_slots; i++) { - dd->port->commands[i].command_header = - dd->port->command_list + - (sizeof(struct mtip_cmd_hdr) * i); - dd->port->commands[i].command_header_dma = - dd->port->command_list_dma + - (sizeof(struct mtip_cmd_hdr) * i); - - dd->port->commands[i].command = - dd->port->command_table + (HW_CMD_TBL_SZ * i); - dd->port->commands[i].command_dma = - dd->port->command_tbl_dma + (HW_CMD_TBL_SZ * i); - - if (readl(dd->mmio + HOST_CAP) & HOST_CAP_64) - dd->port->commands[i].command_header->ctbau = - __force_bit2int cpu_to_le32( - (dd->port->commands[i].command_dma >> 16) >> 16); - dd->port->commands[i].command_header->ctba = - __force_bit2int cpu_to_le32( - dd->port->commands[i].command_dma & 0xFFFFFFFF); - - /* - * If this is not done, a bug is reported by the stock - * FC11 i386. Due to the fact that it has lots of kernel - * debugging enabled. - */ - sg_init_table(dd->port->commands[i].sg, MTIP_MAX_SG); - - /* Mark all commands as currently inactive.*/ - atomic_set(&dd->port->commands[i].active, 0); - } /* Setup the pointers to the extended s_active and CI registers. */ for (i = 0; i < dd->slot_groups; i++) { @@ -3594,12 +3654,8 @@ out3: out2: mtip_deinit_port(dd->port); + mtip_dma_free(dd); - /* Free the command/command header memory. */ - dmam_free_coherent(&dd->pdev->dev, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), - dd->port->command_list, - dd->port->command_list_dma); out1: /* Free the memory allocated for the for structure. */ kfree(dd->port); @@ -3641,11 +3697,9 @@ static int mtip_hw_exit(struct driver_data *dd) irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); - /* Free the command/command header memory. */ - dmam_free_coherent(&dd->pdev->dev, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), - dd->port->command_list, - dd->port->command_list_dma); + /* Free dma regions */ + mtip_dma_free(dd); + /* Free the memory allocated for the for structure. */ kfree(dd->port); dd->port = NULL; diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 9be7a1582ad..bda35d97246 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -69,7 +69,7 @@ * Maximum number of scatter gather entries * a single command may have. */ -#define MTIP_MAX_SG 128 +#define MTIP_MAX_SG 504 /* * Maximum number of slot groups (Command Issue & s_active registers) @@ -391,15 +391,13 @@ struct mtip_port { */ dma_addr_t rxfis_dma; /* - * Pointer to the beginning of the command table memory as used - * by the driver. + * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART */ - void *command_table; + void *block1; /* - * Pointer to the beginning of the command table memory as used - * by the DMA. + * DMA address of region for RX Fis, Identify, RLE10, and SMART */ - dma_addr_t command_tbl_dma; + dma_addr_t block1_dma; /* * Pointer to the beginning of the identify data memory as used * by the driver. -- cgit v1.2.3-70-g09d2 From 26d580575b4b8c3c83137315c9a2a77e60a2f716 Mon Sep 17 00:00:00 2001 From: Sam Bradshaw Date: Thu, 3 Oct 2013 10:18:05 -0700 Subject: mtip32xx: Correctly handle security locked condition If power is removed during a secure erase, the drive will end up in a security locked condition. This patch causes the driver to identify, log, and flag the security lock state. IOs are prevented from submission to the drive until the locked state is addressed with a secure erase. Bumped version number to reflect this capability. Signed-off-by: Sam Bradshaw Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 16 ++++++++++++++-- drivers/block/mtip32xx/mtip32xx.h | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 966b4371a30..516026954be 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -920,8 +920,9 @@ static void mtip_handle_tfe(struct driver_data *dd) fail_reason = "thermal shutdown"; } if (buf[288] == 0xBF) { + set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); dev_info(&dd->pdev->dev, - "Drive indicates rebuild has failed.\n"); + "Drive indicates rebuild has failed. Secure erase required.\n"); fail_all_ncq_cmds = 1; fail_reason = "rebuild failed"; } @@ -1587,6 +1588,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) } #endif + /* Check security locked state */ + if (port->identify[128] & 0x4) + set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + else + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + #ifdef MTIP_TRIM /* Disabling TRIM support temporarily */ /* Demux ID.DRAT & ID.RZAT to determine trim support */ if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) @@ -1908,6 +1915,10 @@ static void mtip_dump_identify(struct mtip_port *port) strlcpy(cbuf, (char *)(port->identify+27), 41); dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); + dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", + port->identify[128], + port->identify[128] & 0x4 ? "(LOCKED)" : ""); + if (mtip_hw_get_capacity(port->dd, §ors)) dev_info(&port->dd->pdev->dev, "Capacity: %llu sectors (%llu MB)\n", @@ -3678,7 +3689,8 @@ static int mtip_hw_exit(struct driver_data *dd) * saves its state. */ if (!dd->sr) { - if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && + !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) if (mtip_standby_immediate(dd->port)) dev_warn(&dd->pdev->dev, "STANDBY IMMEDIATE failed\n"); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index bda35d97246..b52e9a6d6aa 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -92,7 +92,7 @@ /* Driver name and version strings */ #define MTIP_DRV_NAME "mtip32xx" -#define MTIP_DRV_VERSION "1.2.6os3" +#define MTIP_DRV_VERSION "1.3.0" /* Maximum number of minor device numbers per device. */ #define MTIP_MAX_MINORS 16 -- cgit v1.2.3-70-g09d2 From 9967d8ac5c0f3e730a21abefe4c889c724f3b212 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Tue, 21 Jan 2014 16:59:59 +0530 Subject: null_blk: Null pointer deference problem in alloc_page_buffers If we load the null_blk module with bs=8k we get following oops: [ 3819.812190] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 3819.812387] IP: [] create_empty_buffers+0x28/0xaf [ 3819.812527] PGD 219244067 PUD 215a06067 PMD 0 [ 3819.812640] Oops: 0000 [#1] SMP [ 3819.812772] Modules linked in: null_blk(+) Fix that by resetting block size to PAGE_SIZE if it is greater than PAGE_SIZE Reported-by: Sumanth Signed-off-by: Raghavendra K T Reviewed-by: Matias Bjorling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index a2e69d26266..a01d7f68fc7 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -622,6 +622,11 @@ static int __init null_init(void) irqmode = NULL_IRQ_NONE; } #endif + if (bs > PAGE_SIZE) { + pr_warn("null_blk: invalid block size\n"); + pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); + bs = PAGE_SIZE; + } if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { if (submit_queues < nr_online_nodes) { -- cgit v1.2.3-70-g09d2 From 14424be4dbfa127001ad623869f7ee4c7635e991 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 21 Jan 2014 16:15:53 +0100 Subject: mg_disk: Spelling s/finised/finished/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 7bc363f1ee8..eb59b124136 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -915,7 +915,7 @@ static int mg_probe(struct platform_device *plat_dev) /* disk reset */ if (prv_data->dev_attr == MG_STORAGE_DEV) { - /* If POR seq. not yet finised, wait */ + /* If POR seq. not yet finished, wait */ err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT); if (err) goto probe_err_3b; -- cgit v1.2.3-70-g09d2