diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-12-24 10:06:03 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-12-24 10:06:03 -0800 |
commit | c5fdd531b593205a0e8c1ab253dd85b764fd0ecb (patch) | |
tree | 9117fff70e04077117d447f3eafd9b9d0d1a1041 /drivers/md | |
parent | 70e672fa7376d45e262119bda3e1a301e519d4c3 (diff) | |
parent | fc1bc35443741e132dd0118e8dbac53f69a6f76e (diff) |
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
- fix for a memory leak on certain unplug events
- a collection of bcache fixes from Kent and Nicolas
- a few null_blk fixes and updates form Matias
- a marking of static of functions in the stec pci-e driver
* 'for-linus' of git://git.kernel.dk/linux-block:
null_blk: support submit_queues on use_per_node_hctx
null_blk: set use_per_node_hctx param to false
null_blk: corrections to documentation
null_blk: warning on ignored submit_queues param
null_blk: refactor init and init errors code paths
null_blk: documentation
null_blk: mem garbage on NUMA systems during init
drivers: block: Mark the functions as static in skd_main.c
bcache: New writeback PD controller
bcache: bugfix for race between moving_gc and bucket_invalidate
bcache: fix for gc and writeback race
bcache: bugfix - moving_gc now moves only correct buckets
bcache: fix for gc crashing when no sectors are used
bcache: Fix heap_peek() macro
bcache: Fix for can_attach_cache()
bcache: Fix dirty_data accounting
bcache: Use uninterruptible sleep in writeback
bcache: kthread don't set writeback task to INTERUPTIBLE
block: fix memory leaks on unplugging block device
bcache: fix sparse non static symbol warning
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/alloc.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 12 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 27 | ||||
-rw-r--r-- | drivers/md/bcache/movinggc.c | 21 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 50 | ||||
-rw-r--r-- | drivers/md/bcache/util.c | 8 | ||||
-rw-r--r-- | drivers/md/bcache/util.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 53 |
9 files changed, 111 insertions, 66 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 2b46bf1d7e4..4c9852d92b0 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -421,9 +421,11 @@ out: if (watermark <= WATERMARK_METADATA) { SET_GC_MARK(b, GC_MARK_METADATA); + SET_GC_MOVE(b, 0); b->prio = BTREE_PRIO; } else { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MOVE(b, 0); b->prio = INITIAL_PRIO; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4beb55a0ff3..754f4317748 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -197,7 +197,7 @@ struct bucket { uint8_t disk_gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint8_t gc_gen; - uint16_t gc_mark; + uint16_t gc_mark; /* Bitfield used by GC. See below for field */ }; /* @@ -209,7 +209,8 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_RECLAIMABLE 0 #define GC_MARK_DIRTY 1 #define GC_MARK_METADATA 2 -BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); +BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); #include "journal.h" #include "stats.h" @@ -372,14 +373,14 @@ struct cached_dev { unsigned char writeback_percent; unsigned writeback_delay; - int writeback_rate_change; - int64_t writeback_rate_derivative; uint64_t writeback_rate_target; + int64_t writeback_rate_proportional; + int64_t writeback_rate_derivative; + int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_d_term; unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_d_smooth; }; enum alloc_watermarks { @@ -445,7 +446,6 @@ struct cache { * call prio_write() to keep gens from wrapping. */ uint8_t need_save_prio; - unsigned gc_move_threshold; /* * If nonzero, we know we aren't going to find any buckets to invalidate diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5e2765aadce..31bb53fcc67 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1561,6 +1561,28 @@ size_t bch_btree_gc_finish(struct cache_set *c) SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), GC_MARK_METADATA); + /* don't reclaim buckets to which writeback keys point */ + rcu_read_lock(); + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + struct cached_dev *dc; + struct keybuf_key *w, *n; + unsigned j; + + if (!d || UUID_FLASH_ONLY(&c->uuids[i])) + continue; + dc = container_of(d, struct cached_dev, disk); + + spin_lock(&dc->writeback_keys.lock); + rbtree_postorder_for_each_entry_safe(w, n, + &dc->writeback_keys.keys, node) + for (j = 0; j < KEY_PTRS(&w->key); j++) + SET_GC_MARK(PTR_BUCKET(c, &w->key, j), + GC_MARK_DIRTY); + spin_unlock(&dc->writeback_keys.lock); + } + rcu_read_unlock(); + for_each_cache(ca, c, i) { uint64_t *i; @@ -1817,7 +1839,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, if (KEY_START(k) > KEY_START(insert) + sectors_found) goto check_failed; - if (KEY_PTRS(replace_key) != KEY_PTRS(k)) + if (KEY_PTRS(k) != KEY_PTRS(replace_key) || + KEY_DIRTY(k) != KEY_DIRTY(replace_key)) goto check_failed; /* skip past gen */ @@ -2217,7 +2240,7 @@ struct btree_insert_op { struct bkey *replace_key; }; -int btree_insert_fn(struct btree_op *b_op, struct btree *b) +static int btree_insert_fn(struct btree_op *b_op, struct btree *b) { struct btree_insert_op *op = container_of(b_op, struct btree_insert_op, op); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7c1275e6602..f2f0998c4a9 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) unsigned i; for (i = 0; i < KEY_PTRS(k); i++) { - struct cache *ca = PTR_CACHE(c, k, i); struct bucket *g = PTR_BUCKET(c, k, i); - if (GC_SECTORS_USED(g) < ca->gc_move_threshold) + if (GC_MOVE(g)) return true; } @@ -65,11 +64,16 @@ static void write_moving_finish(struct closure *cl) static void read_moving_endio(struct bio *bio, int error) { + struct bbio *b = container_of(bio, struct bbio, bio); struct moving_io *io = container_of(bio->bi_private, struct moving_io, cl); if (error) io->op.error = error; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(io->op.c, &b->key, 0)) { + io->op.error = -EINTR; + } bch_bbio_endio(io->op.c, bio, error, "reading data to move"); } @@ -141,6 +145,11 @@ static void read_moving(struct cache_set *c) if (!w) break; + if (ptr_stale(c, &w->key, 0)) { + bch_keybuf_del(&c->moving_gc_keys, w); + continue; + } + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), GFP_KERNEL); @@ -184,7 +193,8 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r) static unsigned bucket_heap_top(struct cache *ca) { - return GC_SECTORS_USED(heap_peek(&ca->heap)); + struct bucket *b; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } void bch_moving_gc(struct cache_set *c) @@ -226,9 +236,8 @@ void bch_moving_gc(struct cache_set *c) sectors_to_move -= GC_SECTORS_USED(b); } - ca->gc_move_threshold = bucket_heap_top(ca); - - pr_debug("threshold %u", ca->gc_move_threshold); + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); } mutex_unlock(&c->bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dec15cd2d79..c57bfa071a5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1676,7 +1676,7 @@ err: static bool can_attach_cache(struct cache *ca, struct cache_set *c) { return ca->sb.block_size == c->sb.block_size && - ca->sb.bucket_size == c->sb.block_size && + ca->sb.bucket_size == c->sb.bucket_size && ca->sb.nr_in_set == c->sb.nr_in_set; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 80d4c2bee18..a1f85612f0b 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -83,7 +83,6 @@ rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_d_term); rw_attribute(writeback_rate_p_term_inverse); -rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -129,31 +128,41 @@ SHOW(__bch_cached_dev) var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - sysfs_print(writeback_rate, dc->writeback_rate.rate); + sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); var_print(writeback_rate_d_term); var_print(writeback_rate_p_term_inverse); - var_print(writeback_rate_d_smooth); if (attr == &sysfs_writeback_rate_debug) { + char rate[20]; char dirty[20]; - char derivative[20]; char target[20]; - bch_hprint(dirty, - bcache_dev_sectors_dirty(&dc->disk) << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + char proportional[20]; + char derivative[20]; + char change[20]; + s64 next_io; + + bch_hprint(rate, dc->writeback_rate.rate << 9); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional,dc->writeback_rate_proportional << 9); + bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(change, dc->writeback_rate_change << 9); + + next_io = div64_s64(dc->writeback_rate.next - local_clock(), + NSEC_PER_MSEC); return sprintf(buf, - "rate:\t\t%u\n" - "change:\t\t%i\n" + "rate:\t\t%s/sec\n" "dirty:\t\t%s\n" + "target:\t\t%s\n" + "proportional:\t%s\n" "derivative:\t%s\n" - "target:\t\t%s\n", - dc->writeback_rate.rate, - dc->writeback_rate_change, - dirty, derivative, target); + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, dirty, target, proportional, + derivative, change, next_io); } sysfs_hprint(dirty_data, @@ -189,6 +198,7 @@ STORE(__cached_dev) struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) +#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) sysfs_strtoul(data_csum, dc->disk.data_csum); @@ -197,16 +207,15 @@ STORE(__cached_dev) d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, 1000000); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - d_strtoul(writeback_rate_update_seconds); + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, INT_MAX); + + d_strtoul_nonzero(writeback_rate_update_seconds); d_strtoul(writeback_rate_d_term); - d_strtoul(writeback_rate_p_term_inverse); - sysfs_strtoul_clamp(writeback_rate_p_term_inverse, - dc->writeback_rate_p_term_inverse, 1, INT_MAX); - d_strtoul(writeback_rate_d_smooth); + d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_update_seconds, &sysfs_writeback_rate_d_term, &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, &sysfs_stripe_size, diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 462214eeacb..bb37618e766 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); - d->next += div_u64(done, d->rate); + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; return time_after64(d->next, now) ? div_u64(d->next - now, NSEC_PER_SEC / HZ) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 362c4b3f8b4..1030c6020e9 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -110,7 +110,7 @@ do { \ _r; \ }) -#define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) #define heap_full(h) ((h)->used == (h)->size) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 99053b1251b..6c44fe059c2 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc) /* PD controller */ - int change = 0; - int64_t error; int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t derivative = dirty - dc->disk.sectors_dirty_last; + int64_t proportional = dirty - target; + int64_t change; dc->disk.sectors_dirty_last = dirty; - derivative *= dc->writeback_rate_d_term; - derivative = clamp(derivative, -dirty, dirty); + /* Scale to sectors per second */ - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - dc->writeback_rate_d_smooth, 0); + proportional *= dc->writeback_rate_update_seconds; + proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - /* Avoid divide by zero */ - if (!target) - goto out; + derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - error = div64_s64((dirty + derivative - target) << 8, target); + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + (dc->writeback_rate_d_term / + dc->writeback_rate_update_seconds) ?: 1, 0); + + derivative *= dc->writeback_rate_d_term; + derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - change = div_s64((dc->writeback_rate.rate * error) >> 8, - dc->writeback_rate_p_term_inverse); + change = proportional + derivative; /* Don't increase writeback rate if the device isn't keeping up */ if (change > 0 && time_after64(local_clock(), - dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) + dc->writeback_rate.next + NSEC_PER_MSEC)) change = 0; dc->writeback_rate.rate = - clamp_t(int64_t, dc->writeback_rate.rate + change, + clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, 1, NSEC_PER_MSEC); -out: + + dc->writeback_rate_proportional = proportional; dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; @@ -87,15 +89,11 @@ static void update_writeback_rate(struct work_struct *work) static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { - uint64_t ret; - if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; - ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); - - return min_t(uint64_t, ret, HZ); + return bch_next_delay(&dc->writeback_rate, sectors); } struct dirty_io { @@ -241,7 +239,7 @@ static void read_dirty(struct cached_dev *dc) if (KEY_START(&w->key) != dc->last_read || jiffies_to_msecs(delay) > 50) while (!kthread_should_stop() && delay) - delay = schedule_timeout_interruptible(delay); + delay = schedule_timeout_uninterruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -438,7 +436,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_interruptible(delay); + delay = schedule_timeout_uninterruptible(delay); } } @@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc) bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); + + dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); } int bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -490,18 +490,15 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_update_seconds = 30; - dc->writeback_rate_d_term = 16; - dc->writeback_rate_p_term_inverse = 64; - dc->writeback_rate_d_smooth = 8; + dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_d_term = 30; + dc->writeback_rate_p_term_inverse = 6000; dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) return PTR_ERR(dc->writeback_thread); - set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); |