diff options
62 files changed, 17681 insertions, 372 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-bcache b/Documentation/ABI/testing/sysfs-block-bcache new file mode 100644 index 00000000000..9e4bbc5d51f --- /dev/null +++ b/Documentation/ABI/testing/sysfs-block-bcache @@ -0,0 +1,156 @@ +What: /sys/block/<disk>/bcache/unregister +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + A write to this file causes the backing device or cache to be + unregistered. If a backing device had dirty data in the cache, + writeback mode is automatically disabled and all dirty data is + flushed before the device is unregistered. Caches unregister + all associated backing devices before unregistering themselves. + +What: /sys/block/<disk>/bcache/clear_stats +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + Writing to this file resets all the statistics for the device. + +What: /sys/block/<disk>/bcache/cache +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a backing device that has cache, a symlink to + the bcache/ dir of that cache. + +What: /sys/block/<disk>/bcache/cache_hits +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: integer number of full cache hits, + counted per bio. A partial cache hit counts as a miss. + +What: /sys/block/<disk>/bcache/cache_misses +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: integer number of cache misses. + +What: /sys/block/<disk>/bcache/cache_hit_ratio +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: cache hits as a percentage. + +What: /sys/block/<disk>/bcache/sequential_cutoff +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: Threshold past which sequential IO will + skip the cache. Read and written as bytes in human readable + units (i.e. echo 10M > sequntial_cutoff). + +What: /sys/block/<disk>/bcache/bypassed +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + Sum of all reads and writes that have bypassed the cache (due + to the sequential cutoff). Expressed as bytes in human + readable units. + +What: /sys/block/<disk>/bcache/writeback +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: When on, writeback caching is enabled and + writes will be buffered in the cache. When off, caching is in + writethrough mode; reads and writes will be added to the + cache but no write buffering will take place. + +What: /sys/block/<disk>/bcache/writeback_running +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: when off, dirty data will not be written + from the cache to the backing device. The cache will still be + used to buffer writes until it is mostly full, at which point + writes transparently revert to writethrough mode. Intended only + for benchmarking/testing. + +What: /sys/block/<disk>/bcache/writeback_delay +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: In writeback mode, when dirty data is + written to the cache and the cache held no dirty data for that + backing device, writeback from cache to backing device starts + after this delay, expressed as an integer number of seconds. + +What: /sys/block/<disk>/bcache/writeback_percent +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For backing devices: If nonzero, writeback from cache to + backing device only takes place when more than this percentage + of the cache is used, allowing more write coalescing to take + place and reducing total number of writes sent to the backing + device. Integer between 0 and 40. + +What: /sys/block/<disk>/bcache/synchronous +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a cache, a boolean that allows synchronous mode to be + switched on and off. In synchronous mode all writes are ordered + such that the cache can reliably recover from unclean shutdown; + if disabled bcache will not generally wait for writes to + complete but if the cache is not shut down cleanly all data + will be discarded from the cache. Should not be turned off with + writeback caching enabled. + +What: /sys/block/<disk>/bcache/discard +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a cache, a boolean allowing discard/TRIM to be turned off + or back on if the device supports it. + +What: /sys/block/<disk>/bcache/bucket_size +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a cache, bucket size in human readable units, as set at + cache creation time; should match the erase block size of the + SSD for optimal performance. + +What: /sys/block/<disk>/bcache/nbuckets +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a cache, the number of usable buckets. + +What: /sys/block/<disk>/bcache/tree_depth +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a cache, height of the btree excluding leaf nodes (i.e. a + one node tree will have a depth of 0). + +What: /sys/block/<disk>/bcache/btree_cache_size +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + Number of btree buckets/nodes that are currently cached in + memory; cache dynamically grows and shrinks in response to + memory pressure from the rest of the system. + +What: /sys/block/<disk>/bcache/written +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a cache, total amount of data in human readable units + written to the cache, excluding all metadata. + +What: /sys/block/<disk>/bcache/btree_written +Date: November 2010 +Contact: Kent Overstreet <kent.overstreet@gmail.com> +Description: + For a cache, sum of all btree writes in human readable units. diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt new file mode 100644 index 00000000000..77db8809bd9 --- /dev/null +++ b/Documentation/bcache.txt @@ -0,0 +1,431 @@ +Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be +nice if you could use them as cache... Hence bcache. + +Wiki and git repositories are at: + http://bcache.evilpiepirate.org + http://evilpiepirate.org/git/linux-bcache.git + http://evilpiepirate.org/git/bcache-tools.git + +It's designed around the performance characteristics of SSDs - it only allocates +in erase block sized buckets, and it uses a hybrid btree/log to track cached +extants (which can be anywhere from a single sector to the bucket size). It's +designed to avoid random writes at all costs; it fills up an erase block +sequentially, then issues a discard before reusing it. + +Both writethrough and writeback caching are supported. Writeback defaults to +off, but can be switched on and off arbitrarily at runtime. Bcache goes to +great lengths to protect your data - it reliably handles unclean shutdown. (It +doesn't even have a notion of a clean shutdown; bcache simply doesn't return +writes as completed until they're on stable storage). + +Writeback caching can use most of the cache for buffering writes - writing +dirty data to the backing device is always done sequentially, scanning from the +start to the end of the index. + +Since random IO is what SSDs excel at, there generally won't be much benefit +to caching large sequential IO. Bcache detects sequential IO and skips it; +it also keeps a rolling average of the IO sizes per task, and as long as the +average is above the cutoff it will skip all IO from that task - instead of +caching the first 512k after every seek. Backups and large file copies should +thus entirely bypass the cache. + +In the event of a data IO error on the flash it will try to recover by reading +from disk or invalidating cache entries. For unrecoverable errors (meta data +or dirty data), caching is automatically disabled; if dirty data was present +in the cache it first disables writeback caching and waits for all dirty data +to be flushed. + +Getting started: +You'll need make-bcache from the bcache-tools repository. Both the cache device +and backing device must be formatted before use. + make-bcache -B /dev/sdb + make-bcache -C /dev/sdc + +make-bcache has the ability to format multiple devices at the same time - if +you format your backing devices and cache device at the same time, you won't +have to manually attach: + make-bcache -B /dev/sda /dev/sdb -C /dev/sdc + +To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: + + echo /dev/sdb > /sys/fs/bcache/register + echo /dev/sdc > /sys/fs/bcache/register + +To register your bcache devices automatically, you could add something like +this to an init script: + + echo /dev/sd* > /sys/fs/bcache/register_quiet + +It'll look for bcache superblocks and ignore everything that doesn't have one. + +Registering the backing device makes the bcache show up in /dev; you can now +format it and use it as normal. But the first time using a new bcache device, +it'll be running in passthrough mode until you attach it to a cache. See the +section on attaching. + +The devices show up at /dev/bcacheN, and can be controlled via sysfs from +/sys/block/bcacheN/bcache: + + mkfs.ext4 /dev/bcache0 + mount /dev/bcache0 /mnt + +Cache devices are managed as sets; multiple caches per set isn't supported yet +but will allow for mirroring of metadata and dirty data in the future. Your new +cache set shows up as /sys/fs/bcache/<UUID> + +ATTACHING: + +After your cache device and backing device are registered, the backing device +must be attached to your cache set to enable caching. Attaching a backing +device to a cache set is done thusly, with the UUID of the cache set in +/sys/fs/bcache: + + echo <UUID> > /sys/block/bcache0/bcache/attach + +This only has to be done once. The next time you reboot, just reregister all +your bcache devices. If a backing device has data in a cache somewhere, the +/dev/bcache# device won't be created until the cache shows up - particularly +important if you have writeback caching turned on. + +If you're booting up and your cache device is gone and never coming back, you +can force run the backing device: + + echo 1 > /sys/block/sdb/bcache/running + +(You need to use /sys/block/sdb (or whatever your backing device is called), not +/sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a +partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache) + +The backing device will still use that cache set if it shows up in the future, +but all the cached data will be invalidated. If there was dirty data in the +cache, don't expect the filesystem to be recoverable - you will have massive +filesystem corruption, though ext4's fsck does work miracles. + +ERROR HANDLING: + +Bcache tries to transparently handle IO errors to/from the cache device without +affecting normal operation; if it sees too many errors (the threshold is +configurable, and defaults to 0) it shuts down the cache device and switches all +the backing devices to passthrough mode. + + - For reads from the cache, if they error we just retry the read from the + backing device. + + - For writethrough writes, if the write to the cache errors we just switch to + invalidating the data at that lba in the cache (i.e. the same thing we do for + a write that bypasses the cache) + + - For writeback writes, we currently pass that error back up to the + filesystem/userspace. This could be improved - we could retry it as a write + that skips the cache so we don't have to error the write. + + - When we detach, we first try to flush any dirty data (if we were running in + writeback mode). It currently doesn't do anything intelligent if it fails to + read some of the dirty data, though. + +TROUBLESHOOTING PERFORMANCE: + +Bcache has a bunch of config options and tunables. The defaults are intended to +be reasonable for typical desktop and server workloads, but they're not what you +want for getting the best possible numbers when benchmarking. + + - Bad write performance + + If write performance is not what you expected, you probably wanted to be + running in writeback mode, which isn't the default (not due to a lack of + maturity, but simply because in writeback mode you'll lose data if something + happens to your SSD) + + # echo writeback > /sys/block/bcache0/cache_mode + + - Bad performance, or traffic not going to the SSD that you'd expect + + By default, bcache doesn't cache everything. It tries to skip sequential IO - + because you really want to be caching the random IO, and if you copy a 10 + gigabyte file you probably don't want that pushing 10 gigabytes of randomly + accessed data out of your cache. + + But if you want to benchmark reads from cache, and you start out with fio + writing an 8 gigabyte test file - so you want to disable that. + + # echo 0 > /sys/block/bcache0/bcache/sequential_cutoff + + To set it back to the default (4 mb), do + + # echo 4M > /sys/block/bcache0/bcache/sequential_cutoff + + - Traffic's still going to the spindle/still getting cache misses + + In the real world, SSDs don't always keep up with disks - particularly with + slower SSDs, many disks being cached by one SSD, or mostly sequential IO. So + you want to avoid being bottlenecked by the SSD and having it slow everything + down. + + To avoid that bcache tracks latency to the cache device, and gradually + throttles traffic if the latency exceeds a threshold (it does this by + cranking down the sequential bypass). + + You can disable this if you need to by setting the thresholds to 0: + + # echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us + # echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us + + The default is 2000 us (2 milliseconds) for reads, and 20000 for writes. + + - Still getting cache misses, of the same data + + One last issue that sometimes trips people up is actually an old bug, due to + the way cache coherency is handled for cache misses. If a btree node is full, + a cache miss won't be able to insert a key for the new data and the data + won't be written to the cache. + + In practice this isn't an issue because as soon as a write comes along it'll + cause the btree node to be split, and you need almost no write traffic for + this to not show up enough to be noticable (especially since bcache's btree + nodes are huge and index large regions of the device). But when you're + benchmarking, if you're trying to warm the cache by reading a bunch of data + and there's no other traffic - that can be a problem. + + Solution: warm the cache by doing writes, or use the testing branch (there's + a fix for the issue there). + +SYSFS - BACKING DEVICE: + +attach + Echo the UUID of a cache set to this file to enable caching. + +cache_mode + Can be one of either writethrough, writeback, writearound or none. + +clear_stats + Writing to this file resets the running total stats (not the day/hour/5 minute + decaying versions). + +detach + Write to this file to detach from a cache set. If there is dirty data in the + cache, it will be flushed first. + +dirty_data + Amount of dirty data for this backing device in the cache. Continuously + updated unlike the cache set's version, but may be slightly off. + +label + Name of underlying device. + +readahead + Size of readahead that should be performed. Defaults to 0. If set to e.g. + 1M, it will round cache miss reads up to that size, but without overlapping + existing cache entries. + +running + 1 if bcache is running (i.e. whether the /dev/bcache device exists, whether + it's in passthrough mode or caching). + +sequential_cutoff + A sequential IO will bypass the cache once it passes this threshhold; the + most recent 128 IOs are tracked so sequential IO can be detected even when + it isn't all done at once. + +sequential_merge + If non zero, bcache keeps a list of the last 128 requests submitted to compare + against all new requests to determine which new requests are sequential + continuations of previous requests for the purpose of determining sequential + cutoff. This is necessary if the sequential cutoff value is greater than the + maximum acceptable sequential size for any single request. + +state + The backing device can be in one of four different states: + + no cache: Has never been attached to a cache set. + + clean: Part of a cache set, and there is no cached dirty data. + + dirty: Part of a cache set, and there is cached dirty data. + + inconsistent: The backing device was forcibly run by the user when there was + dirty data cached but the cache set was unavailable; whatever data was on the + backing device has likely been corrupted. + +stop + Write to this file to shut down the bcache device and close the backing + device. + +writeback_delay + When dirty data is written to the cache and it previously did not contain + any, waits some number of seconds before initiating writeback. Defaults to + 30. + +writeback_percent + If nonzero, bcache tries to keep around this percentage of the cache dirty by + throttling background writeback and using a PD controller to smoothly adjust + the rate. + +writeback_rate + Rate in sectors per second - if writeback_percent is nonzero, background + writeback is throttled to this rate. Continuously adjusted by bcache but may + also be set by the user. + +writeback_running + If off, writeback of dirty data will not take place at all. Dirty data will + still be added to the cache until it is mostly full; only meant for + benchmarking. Defaults to on. + +SYSFS - BACKING DEVICE STATS: + +There are directories with these numbers for a running total, as well as +versions that decay over the past day, hour and 5 minutes; they're also +aggregated in the cache set directory as well. + +bypassed + Amount of IO (both reads and writes) that has bypassed the cache + +cache_hits +cache_misses +cache_hit_ratio + Hits and misses are counted per individual IO as bcache sees them; a + partial hit is counted as a miss. + +cache_bypass_hits +cache_bypass_misses + Hits and misses for IO that is intended to skip the cache are still counted, + but broken out here. + +cache_miss_collisions + Counts instances where data was going to be inserted into the cache from a + cache miss, but raced with a write and data was already present (usually 0 + since the synchronization for cache misses was rewritten) + +cache_readaheads + Count of times readahead occured. + +SYSFS - CACHE SET: + +average_key_size + Average data per key in the btree. + +bdev<0..n> + Symlink to each of the attached backing devices. + +block_size + Block size of the cache devices. + +btree_cache_size + Amount of memory currently used by the btree cache + +bucket_size + Size of buckets + +cache<0..n> + Symlink to each of the cache devices comprising this cache set. + +cache_available_percent + Percentage of cache device free. + +clear_stats + Clears the statistics associated with this cache + +dirty_data + Amount of dirty data is in the cache (updated when garbage collection runs). + +flash_vol_create + Echoing a size to this file (in human readable units, k/M/G) creates a thinly + provisioned volume backed by the cache set. + +io_error_halflife +io_error_limit + These determines how many errors we accept before disabling the cache. + Each error is decayed by the half life (in # ios). If the decaying count + reaches io_error_limit dirty data is written out and the cache is disabled. + +journal_delay_ms + Journal writes will delay for up to this many milliseconds, unless a cache + flush happens sooner. Defaults to 100. + +root_usage_percent + Percentage of the root btree node in use. If this gets too high the node + will split, increasing the tree depth. + +stop + Write to this file to shut down the cache set - waits until all attached + backing devices have been shut down. + +tree_depth + Depth of the btree (A single node btree has depth 0). + +unregister + Detaches all backing devices and closes the cache devices; if dirty data is + present it will disable writeback caching and wait for it to be flushed. + +SYSFS - CACHE SET INTERNAL: + +This directory also exposes timings for a number of internal operations, with +separate files for average duration, average frequency, last occurence and max +duration: garbage collection, btree read, btree node sorts and btree splits. + +active_journal_entries + Number of journal entries that are newer than the index. + +btree_nodes + Total nodes in the btree. + +btree_used_percent + Average fraction of btree in use. + +bset_tree_stats + Statistics about the auxiliary search trees + +btree_cache_max_chain + Longest chain in the btree node cache's hash table + +cache_read_races + Counts instances where while data was being read from the cache, the bucket + was reused and invalidated - i.e. where the pointer was stale after the read + completed. When this occurs the data is reread from the backing device. + +trigger_gc + Writing to this file forces garbage collection to run. + +SYSFS - CACHE DEVICE: + +block_size + Minimum granularity of writes - should match hardware sector size. + +btree_written + Sum of all btree writes, in (kilo/mega/giga) bytes + +bucket_size + Size of buckets + +cache_replacement_policy + One of either lru, fifo or random. + +discard + Boolean; if on a discard/TRIM will be issued to each bucket before it is + reused. Defaults to off, since SATA TRIM is an unqueued command (and thus + slow). + +freelist_percent + Size of the freelist as a percentage of nbuckets. Can be written to to + increase the number of buckets kept on the freelist, which lets you + artificially reduce the size of the cache at runtime. Mostly for testing + purposes (i.e. testing how different size caches affect your hit rate), but + since buckets are discarded when they move on to the freelist will also make + the SSD's garbage collection easier by effectively giving it more reserved + space. + +io_errors + Number of errors that have occured, decayed by io_error_halflife. + +metadata_written + Sum of all non data writes (btree writes and all other metadata). + +nbuckets + Total buckets in this cache + +priority_stats + Statistics about how recently data in the cache has been accessed. This can + reveal your working set size. + +written + Sum of all data that has been written to the cache; comparison with + btree_written gives the amount of write inflation in bcache. diff --git a/MAINTAINERS b/MAINTAINERS index e73c374483c..5f5c895e662 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1620,6 +1620,13 @@ W: http://www.baycom.org/~tom/ham/ham.html S: Maintained F: drivers/net/hamradio/baycom* +BCACHE (BLOCK LAYER CACHE) +M: Kent Overstreet <koverstreet@google.com> +L: linux-bcache@vger.kernel.org +W: http://bcache.evilpiepirate.org +S: Maintained: +F: drivers/md/bcache/ + BEFS FILE SYSTEM S: Orphan F: Documentation/filesystems/befs.txt diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 5efed089a70..fc803ecbbce 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -920,16 +920,14 @@ bio_pagedec(struct bio *bio) static void bufinit(struct buf *buf, struct request *rq, struct bio *bio) { - struct bio_vec *bv; - memset(buf, 0, sizeof(*buf)); buf->rq = rq; buf->bio = bio; buf->resid = bio->bi_size; buf->sector = bio->bi_sector; bio_pageinc(bio); - buf->bv = bv = bio_iovec(bio); - buf->bv_resid = bv->bv_len; + buf->bv = bio_iovec(bio); + buf->bv_resid = buf->bv->bv_len; WARN_ON(buf->bv_resid == 0); } diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 94b51c5e067..6374dc10352 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -75,6 +75,12 @@ module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(cciss_simple_mode, "Use 'simple mode' rather than 'performant mode'"); +static int cciss_allow_hpsa; +module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cciss_allow_hpsa, + "Prevent cciss driver from accessing hardware known to be " + " supported by the hpsa driver"); + static DEFINE_MUTEX(cciss_mutex); static struct proc_dir_entry *proc_cciss; @@ -4115,9 +4121,13 @@ static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id) *board_id = ((subsystem_device_id << 16) & 0xffff0000) | subsystem_vendor_id; - for (i = 0; i < ARRAY_SIZE(products); i++) + for (i = 0; i < ARRAY_SIZE(products); i++) { + /* Stand aside for hpsa driver on request */ + if (cciss_allow_hpsa) + return -ENODEV; if (*board_id == products[i].board_id) return i; + } dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", *board_id); return -ENODEV; @@ -4959,6 +4969,16 @@ static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) ctlr_info_t *h; unsigned long flags; + /* + * By default the cciss driver is used for all older HP Smart Array + * controllers. There are module paramaters that allow a user to + * override this behavior and instead use the hpsa SCSI driver. If + * this is the case cciss may be loaded first from the kdump initrd + * image and cause a kernel panic. So if reset_devices is true and + * cciss_allow_hpsa is set just bail. + */ + if ((reset_devices) && (cciss_allow_hpsa == 1)) + return -ENODEV; rc = cciss_init_reset_devices(pdev); if (rc) { if (rc != -ENOTSUPP) diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 92510f8ad01..6608076dc39 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -104,7 +104,6 @@ struct update_al_work { int err; }; -static int al_write_transaction(struct drbd_conf *mdev); void *drbd_md_get_buffer(struct drbd_conf *mdev) { @@ -168,7 +167,11 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; - if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + if (!(rw & WRITE) && mdev->state.disk == D_DISKLESS && mdev->ldev == NULL) + /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ + ; + else if (!get_ldev_if_state(mdev, D_ATTACHING)) { + /* Corresponding put_ldev in drbd_md_io_complete() */ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); err = -ENODEV; goto out; @@ -199,9 +202,10 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, BUG_ON(!bdev->md_bdev); - dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", + dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", + (void*)_RET_IP_ ); if (sector < drbd_md_first_sector(bdev) || sector + 7 > drbd_md_last_sector(bdev)) @@ -209,7 +213,8 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, current->comm, current->pid, __func__, (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); - err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); + /* we do all our meta data IO in aligned 4k blocks. */ + err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, 4096); if (err) { dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); @@ -217,44 +222,99 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, return err; } -static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) +static struct bm_extent *find_active_resync_extent(struct drbd_conf *mdev, unsigned int enr) { - struct lc_element *al_ext; struct lc_element *tmp; - int wake; - - spin_lock_irq(&mdev->al_lock); tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); if (unlikely(tmp != NULL)) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); - if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { - wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); - spin_unlock_irq(&mdev->al_lock); - if (wake) - wake_up(&mdev->al_wait); - return NULL; - } + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) + return bm_ext; + } + return NULL; +} + +static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr, bool nonblock) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + int wake; + + spin_lock_irq(&mdev->al_lock); + bm_ext = find_active_resync_extent(mdev, enr); + if (bm_ext) { + wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); + spin_unlock_irq(&mdev->al_lock); + if (wake) + wake_up(&mdev->al_wait); + return NULL; } - al_ext = lc_get(mdev->act_log, enr); + if (nonblock) + al_ext = lc_try_get(mdev->act_log, enr); + else + al_ext = lc_get(mdev->act_log, enr); spin_unlock_irq(&mdev->al_lock); return al_ext; } -void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) +bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, * we may need to activate two extents in one go */ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - unsigned enr; - bool locked = false; + D_ASSERT((unsigned)(last - first) <= 1); + D_ASSERT(atomic_read(&mdev->local_cnt) > 0); + + /* FIXME figure out a fast path for bios crossing AL extent boundaries */ + if (first != last) + return false; + + return _al_get(mdev, first, true); +} + +bool drbd_al_begin_io_prepare(struct drbd_conf *mdev, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + bool need_transaction = false; D_ASSERT(first <= last); D_ASSERT(atomic_read(&mdev->local_cnt) > 0); - for (enr = first; enr <= last; enr++) - wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + wait_event(mdev->al_wait, + (al_ext = _al_get(mdev, enr, false)) != NULL); + if (al_ext->lc_number != enr) + need_transaction = true; + } + return need_transaction; +} + +static int al_write_transaction(struct drbd_conf *mdev, bool delegate); + +/* When called through generic_make_request(), we must delegate + * activity log I/O to the worker thread: a further request + * submitted via generic_make_request() within the same task + * would be queued on current->bio_list, and would only start + * after this function returns (see generic_make_request()). + * + * However, if we *are* the worker, we must not delegate to ourselves. + */ + +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate) +{ + bool locked = false; + + BUG_ON(delegate && current == mdev->tconn->worker.task); /* Serialize multiple transactions. * This uses test_and_set_bit, memory barrier is implicit. @@ -264,13 +324,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) (locked = lc_try_lock_for_transaction(mdev->act_log))); if (locked) { - /* drbd_al_write_transaction(mdev,al_ext,enr); - * recurses into generic_make_request(), which - * disallows recursion, bios being serialized on the - * current->bio_tail list now. - * we have to delegate updates to the activity log - * to the worker thread. */ - /* Double check: it may have been committed by someone else, * while we have been waiting for the lock. */ if (mdev->act_log->pending_changes) { @@ -280,11 +333,8 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; rcu_read_unlock(); - if (write_al_updates) { - al_write_transaction(mdev); - mdev->al_writ_cnt++; - } - + if (write_al_updates) + al_write_transaction(mdev, delegate); spin_lock_irq(&mdev->al_lock); /* FIXME if (err) @@ -298,6 +348,66 @@ void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) } } +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate) +{ + BUG_ON(delegate && current == mdev->tconn->worker.task); + + if (drbd_al_begin_io_prepare(mdev, i)) + drbd_al_begin_io_commit(mdev, delegate); +} + +int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i) +{ + struct lru_cache *al = mdev->act_log; + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned nr_al_extents; + unsigned available_update_slots; + unsigned enr; + + D_ASSERT(first <= last); + + nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ + available_update_slots = min(al->nr_elements - al->used, + al->max_pending_changes - al->pending_changes); + + /* We want all necessary updates for a given request within the same transaction + * We could first check how many updates are *actually* needed, + * and use that instead of the worst-case nr_al_extents */ + if (available_update_slots < nr_al_extents) + return -EWOULDBLOCK; + + /* Is resync active in this area? */ + for (enr = first; enr <= last; enr++) { + struct lc_element *tmp; + tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) + return -EBUSY; + return -EWOULDBLOCK; + } + } + } + + /* Checkout the refcounts. + * Given that we checked for available elements and update slots above, + * this has to be successful. */ + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + al_ext = lc_get_cumulative(mdev->act_log, enr); + if (!al_ext) + dev_info(DEV, "LOGIC BUG for enr=%u\n", enr); + } + return 0; +} + void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) { /* for bios crossing activity log extent boundaries, @@ -350,6 +460,24 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); } +static sector_t al_tr_number_to_on_disk_sector(struct drbd_conf *mdev) +{ + const unsigned int stripes = mdev->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = mdev->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = mdev->al_tr_number % (mdev->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return mdev->ldev->md.md_offset + mdev->ldev->md.al_offset + t; +} + static int _al_write_transaction(struct drbd_conf *mdev) { @@ -432,23 +560,27 @@ _al_write_transaction(struct drbd_conf *mdev) if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle = 0; - sector = mdev->ldev->md.md_offset - + mdev->ldev->md.al_offset - + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); + sector = al_tr_number_to_on_disk_sector(mdev); crc = crc32c(0, buffer, 4096); buffer->crc32c = cpu_to_be32(crc); if (drbd_bm_write_hinted(mdev)) err = -EIO; - /* drbd_chk_io_error done already */ - else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { - err = -EIO; - drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); - } else { - /* advance ringbuffer position and transaction counter */ - mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); - mdev->al_tr_number++; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); + } else { + mdev->al_tr_number++; + mdev->al_writ_cnt++; + } + } } drbd_md_put_buffer(mdev); @@ -474,20 +606,18 @@ static int w_al_write_transaction(struct drbd_work *w, int unused) /* Calls from worker context (see w_restart_disk_io()) need to write the transaction directly. Others came through generic_make_request(), those need to delegate it to the worker. */ -static int al_write_transaction(struct drbd_conf *mdev) +static int al_write_transaction(struct drbd_conf *mdev, bool delegate) { - struct update_al_work al_work; - - if (current == mdev->tconn->worker.task) + if (delegate) { + struct update_al_work al_work; + init_completion(&al_work.event); + al_work.w.cb = w_al_write_transaction; + al_work.w.mdev = mdev; + drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); + wait_for_completion(&al_work.event); + return al_work.err; + } else return _al_write_transaction(mdev); - - init_completion(&al_work.event); - al_work.w.cb = w_al_write_transaction; - al_work.w.mdev = mdev; - drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); - wait_for_completion(&al_work.event); - - return al_work.err; } static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 8dc29502dc0..64fbb8385cd 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -612,6 +612,17 @@ static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) } } +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) +{ + u64 bitmap_sectors; + if (ldev->md.al_offset == 8) + bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; + else + bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; + return bitmap_sectors << (9 + 3); +} + /* * make sure the bitmap has enough room for the attached storage, * if necessary, resize. @@ -668,7 +679,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits) words = ALIGN(bits, 64) >> LN2_BPL; if (get_ldev(mdev)) { - u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12; + u64 bits_on_disk = drbd_md_on_disk_bits(mdev->ldev); put_ldev(mdev); if (bits > bits_on_disk) { dev_info(DEV, "bits = %lu\n", bits); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6b51afa1aae..f943aacfdad 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -753,13 +753,16 @@ struct drbd_md { u32 flags; u32 md_size_sect; - s32 al_offset; /* signed relative sector offset to al area */ + s32 al_offset; /* signed relative sector offset to activity log */ s32 bm_offset; /* signed relative sector offset to bitmap */ - /* u32 al_nr_extents; important for restoring the AL - * is stored into ldev->dc.al_extents, which in turn - * gets applied to act_log->nr_elements - */ + /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ + s32 meta_dev_idx; + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + u32 al_size_4k; /* cached product of the above */ }; struct drbd_backing_dev { @@ -891,6 +894,14 @@ struct drbd_tconn { /* is a resource from the config file */ } send; }; +struct submit_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t lock; + struct list_head writes; +}; + struct drbd_conf { struct drbd_tconn *tconn; int vnr; /* volume number within the connection */ @@ -1009,7 +1020,6 @@ struct drbd_conf { struct lru_cache *act_log; /* activity log */ unsigned int al_tr_number; int al_tr_cycle; - int al_tr_pos; /* position of the next transaction in the journal */ wait_queue_head_t seq_wait; atomic_t packet_seq; unsigned int peer_seq; @@ -1032,6 +1042,10 @@ struct drbd_conf { atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ unsigned int peer_max_bio_size; unsigned int local_max_bio_size; + + /* any requests that would block in drbd_make_request() + * are deferred to this single-threaded work queue */ + struct submit_worker submit; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1148,25 +1162,44 @@ extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev, char *why, enum bm_flag flags); extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); -extern void drbd_go_diskless(struct drbd_conf *mdev); extern void drbd_ldev_destroy(struct drbd_conf *mdev); /* Meta data layout - We reserve a 128MB Block (4k aligned) - * either at the end of the backing device - * or on a separate meta data device. */ + * + * We currently have two possible layouts. + * Offsets in (512 byte) sectors. + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * Variants: + * old, indexed fixed size meta data: + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * [padding*] are zero or up to 7 unused 512 Byte sectors to the + * end of the device, so that the [4k superblock] will be 4k aligned. + * + * The activity log consists of 4k transaction blocks, + * which are written in a ring-buffer, or striped ring-buffer like fashion, + * which are writtensize used to be fixed 32kB, + * but is about to become configurable. + */ -/* The following numbers are sectors */ -/* Allows up to about 3.8TB, so if you want more, +/* Our old fixed size meta data layout + * allows up to about 3.8TB, so if you want more, * you need to use the "flexible" meta data format. */ -#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ -#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ -#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */ -#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS) - -/* we do all meta data IO in 4k blocks */ -#define MD_BLOCK_SHIFT 12 -#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT) +#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */ +#define MD_4kB_SECT 8 +#define MD_32kB_SECT 64 /* One activity log extent represents 4M of storage */ #define AL_EXTENT_SHIFT 22 @@ -1256,7 +1289,6 @@ struct bm_extent { /* in one sector of the bitmap, we have this many activity_log extents. */ #define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) -#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) #define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) #define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1) @@ -1276,16 +1308,18 @@ struct bm_extent { */ #define DRBD_MAX_SECTORS_32 (0xffffffffLU) -#define DRBD_MAX_SECTORS_BM \ - ((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9))) -#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 -#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM -#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM -#elif !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 +/* we have a certain meta data variant that has a fixed on-disk size of 128 + * MiB, of which 4k are our "superblock", and 32k are the fixed size activity + * log, leaving this many sectors for the bitmap. + */ + +#define DRBD_MAX_SECTORS_FIXED_BM \ + ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) +#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 #else -#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM /* 16 TB in units of sectors */ #if BITS_PER_LONG == 32 /* adjust by one page worth of bitmap, @@ -1418,6 +1452,7 @@ extern void conn_free_crypto(struct drbd_tconn *tconn); extern int proc_details; /* drbd_req */ +extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long); extern void drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req); @@ -1576,7 +1611,10 @@ extern const char *drbd_conn_str(enum drbd_conns s); extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ -extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i); +extern int drbd_al_begin_io_nonblock(struct drbd_conf *mdev, struct drbd_interval *i); +extern void drbd_al_begin_io_commit(struct drbd_conf *mdev, bool delegate); +extern bool drbd_al_begin_io_fastpath(struct drbd_conf *mdev, struct drbd_interval *i); +extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i, bool delegate); extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i); extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); @@ -1755,9 +1793,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev, * BTW, for internal meta data, this happens to be the maximum capacity * we could agree upon with our peer node. */ -static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev) +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) { - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + bdev->md.bm_offset; @@ -1767,36 +1805,19 @@ static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backi } } -static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) -{ - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - return _drbd_md_first_sector(meta_dev_idx, bdev); -} - /** * drbd_md_last_sector() - Return the last sector number of the meta data area * @bdev: Meta data block device. */ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) { - int meta_dev_idx; - - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: - return bdev->md.md_offset + MD_AL_OFFSET - 1; + return bdev->md.md_offset + MD_4kB_SECT -1; case DRBD_MD_INDEX_FLEX_EXT: default: - return bdev->md.md_offset + bdev->md.md_size_sect; + return bdev->md.md_offset + bdev->md.md_size_sect -1; } } @@ -1818,18 +1839,13 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev) static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) { sector_t s; - int meta_dev_idx; - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: s = drbd_get_capacity(bdev->backing_bdev) ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, - _drbd_md_first_sector(meta_dev_idx, bdev)) + drbd_md_first_sector(bdev)) : 0; break; case DRBD_MD_INDEX_FLEX_EXT: @@ -1848,39 +1864,24 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) } /** - * drbd_md_ss__() - Return the sector number of our meta data super block - * @mdev: DRBD device. + * drbd_md_ss() - Return the sector number of our meta data super block * @bdev: Meta data block device. */ -static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev) +static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) { - int meta_dev_idx; + const int meta_dev_idx = bdev->md.meta_dev_idx; - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; - rcu_read_unlock(); - - switch (meta_dev_idx) { - default: /* external, some index */ - return MD_RESERVED_SECT * meta_dev_idx; - case DRBD_MD_INDEX_INTERNAL: - /* with drbd08, internal meta data is always "flexible" */ - case DRBD_MD_INDEX_FLEX_INT: - /* sizeof(struct md_on_disk_07) == 4k - * position: last 4k aligned block of 4k size */ - if (!bdev->backing_bdev) { - if (__ratelimit(&drbd_ratelimit_state)) { - dev_err(DEV, "bdev->backing_bdev==NULL\n"); - dump_stack(); - } - return 0; - } - return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - - MD_AL_OFFSET; - case DRBD_MD_INDEX_FLEX_EXT: + if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) return 0; - } + + /* Since drbd08, internal meta data is always "flexible". + * position: last 4k aligned block of 4k size */ + if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; + + /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * bdev->md.meta_dev_idx; } static inline void @@ -2053,9 +2054,11 @@ static inline void put_ldev(struct drbd_conf *mdev) if (mdev->state.disk == D_DISKLESS) /* even internal references gone, safe to destroy */ drbd_ldev_destroy(mdev); - if (mdev->state.disk == D_FAILED) + if (mdev->state.disk == D_FAILED) { /* all application IO references gone. */ - drbd_go_diskless(mdev); + if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) + drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); + } wake_up(&mdev->misc_wait); } } diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 298b868910d..a5dca6affcb 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -45,7 +45,7 @@ #include <linux/reboot.h> #include <linux/notifier.h> #include <linux/kthread.h> - +#include <linux/workqueue.h> #define __KERNEL_SYSCALLS__ #include <linux/unistd.h> #include <linux/vmalloc.h> @@ -2299,6 +2299,7 @@ static void drbd_cleanup(void) idr_for_each_entry(&minors, mdev, i) { idr_remove(&minors, mdev_to_minor(mdev)); idr_remove(&mdev->tconn->volumes, mdev->vnr); + destroy_workqueue(mdev->submit.wq); del_gendisk(mdev->vdisk); /* synchronize_rcu(); No other threads running at this point */ kref_put(&mdev->kref, &drbd_minor_destroy); @@ -2588,6 +2589,21 @@ void conn_destroy(struct kref *kref) kfree(tconn); } +int init_submitter(struct drbd_conf *mdev) +{ + /* opencoded create_singlethread_workqueue(), + * to be able to say "drbd%d", ..., minor */ + mdev->submit.wq = alloc_workqueue("drbd%u_submit", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1, mdev->minor); + if (!mdev->submit.wq) + return -ENOMEM; + + INIT_WORK(&mdev->submit.worker, do_submit); + spin_lock_init(&mdev->submit.lock); + INIT_LIST_HEAD(&mdev->submit.writes); + return 0; +} + enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr) { struct drbd_conf *mdev; @@ -2677,6 +2693,12 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, goto out_idr_remove_minor; } + if (init_submitter(mdev)) { + err = ERR_NOMEM; + drbd_msg_put_info("unable to create submit workqueue"); + goto out_idr_remove_vol; + } + add_disk(disk); kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */ @@ -2687,6 +2709,8 @@ enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, return NO_ERROR; +out_idr_remove_vol: + idr_remove(&tconn->volumes, vnr_got); out_idr_remove_minor: idr_remove(&minors, minor_got); synchronize_rcu(); @@ -2794,6 +2818,7 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + kfree(ldev->disk_conf); kfree(ldev); } @@ -2833,8 +2858,9 @@ void conn_md_sync(struct drbd_tconn *tconn) rcu_read_unlock(); } +/* aligned 4kByte */ struct meta_data_on_disk { - u64 la_size; /* last agreed size. */ + u64 la_size_sect; /* last agreed size. */ u64 uuid[UI_SIZE]; /* UUIDs. */ u64 device_uuid; u64 reserved_u64_1; @@ -2842,13 +2868,17 @@ struct meta_data_on_disk { u32 magic; u32 md_size_sect; u32 al_offset; /* offset to this block */ - u32 al_nr_extents; /* important for restoring the AL */ + u32 al_nr_extents; /* important for restoring the AL (userspace) */ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ u32 la_peer_max_bio_size; /* last peer max_bio_size */ - u32 reserved_u32[3]; + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + + u8 reserved_u8[4096 - (7*8 + 10*4)]; } __packed; /** @@ -2861,6 +2891,10 @@ void drbd_md_sync(struct drbd_conf *mdev) sector_t sector; int i; + /* Don't accidentally change the DRBD meta data layout. */ + BUILD_BUG_ON(UI_SIZE != 4); + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); + del_timer(&mdev->md_sync_timer); /* timer may be rearmed by drbd_md_mark_dirty() now. */ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) @@ -2875,9 +2909,9 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!buffer) goto out; - memset(buffer, 0, 512); + memset(buffer, 0, sizeof(*buffer)); - buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); for (i = UI_CURRENT; i < UI_SIZE; i++) buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); buffer->flags = cpu_to_be32(mdev->ldev->md.flags); @@ -2892,7 +2926,10 @@ void drbd_md_sync(struct drbd_conf *mdev) buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); - D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); + buffer->al_stripes = cpu_to_be32(mdev->ldev->md.al_stripes); + buffer->al_stripe_size_4k = cpu_to_be32(mdev->ldev->md.al_stripe_size_4k); + + D_ASSERT(drbd_md_ss(mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { @@ -2910,13 +2947,141 @@ out: put_ldev(mdev); } +static int check_activity_log_stripe_size(struct drbd_conf *mdev, + struct meta_data_on_disk *on_disk, + struct drbd_md *in_core) +{ + u32 al_stripes = be32_to_cpu(on_disk->al_stripes); + u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); + u64 al_size_4k; + + /* both not set: default to old fixed size activity log */ + if (al_stripes == 0 && al_stripe_size_4k == 0) { + al_stripes = 1; + al_stripe_size_4k = MD_32kB_SECT/8; + } + + /* some paranoia plausibility checks */ + + /* we need both values to be set */ + if (al_stripes == 0 || al_stripe_size_4k == 0) + goto err; + + al_size_4k = (u64)al_stripes * al_stripe_size_4k; + + /* Upper limit of activity log area, to avoid potential overflow + * problems in al_tr_number_to_on_disk_sector(). As right now, more + * than 72 * 4k blocks total only increases the amount of history, + * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ + if (al_size_4k > (16 * 1024 * 1024/4)) + goto err; + + /* Lower limit: we need at least 8 transaction slots (32kB) + * to not break existing setups */ + if (al_size_4k < MD_32kB_SECT/8) + goto err; + + in_core->al_stripe_size_4k = al_stripe_size_4k; + in_core->al_stripes = al_stripes; + in_core->al_size_4k = al_size_4k; + + return 0; +err: + dev_err(DEV, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", + al_stripes, al_stripe_size_4k); + return -EINVAL; +} + +static int check_offsets_and_sizes(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + sector_t capacity = drbd_get_capacity(bdev->md_bdev); + struct drbd_md *in_core = &bdev->md; + s32 on_disk_al_sect; + s32 on_disk_bm_sect; + + /* The on-disk size of the activity log, calculated from offsets, and + * the size of the activity log calculated from the stripe settings, + * should match. + * Though we could relax this a bit: it is ok, if the striped activity log + * fits in the available on-disk activity log size. + * Right now, that would break how resize is implemented. + * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware + * of possible unused padding space in the on disk layout. */ + if (in_core->al_offset < 0) { + if (in_core->bm_offset > in_core->al_offset) + goto err; + on_disk_al_sect = -in_core->al_offset; + on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; + } else { + if (in_core->al_offset != MD_4kB_SECT) + goto err; + if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) + goto err; + + on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; + on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; + } + + /* old fixed size meta data is exactly that: fixed. */ + if (in_core->meta_dev_idx >= 0) { + if (in_core->md_size_sect != MD_128MB_SECT + || in_core->al_offset != MD_4kB_SECT + || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT + || in_core->al_stripes != 1 + || in_core->al_stripe_size_4k != MD_32kB_SECT/8) + goto err; + } + + if (capacity < in_core->md_size_sect) + goto err; + if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) + goto err; + + /* should be aligned, and at least 32k */ + if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) + goto err; + + /* should fit (for now: exactly) into the available on-disk space; + * overflow prevention is in check_activity_log_stripe_size() above. */ + if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) + goto err; + + /* again, should be aligned */ + if (in_core->bm_offset & 7) + goto err; + + /* FIXME check for device grow with flex external meta data? */ + + /* can the available bitmap space cover the last agreed device size? */ + if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) + goto err; + + return 0; + +err: + dev_err(DEV, "meta data offsets don't make sense: idx=%d " + "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " + "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", + in_core->meta_dev_idx, + in_core->al_stripes, in_core->al_stripe_size_4k, + in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, + (unsigned long long)in_core->la_size_sect, + (unsigned long long)capacity); + + return -EINVAL; +} + + /** * drbd_md_read() - Reads in the meta data super block * @mdev: DRBD device. * @bdev: Device from which the meta data should be read in. * - * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case + * Return NO_ERROR on success, and an enum drbd_ret_code in case * something goes wrong. + * + * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, + * even before @bdev is assigned to @mdev->ldev. */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { @@ -2924,12 +3089,17 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) u32 magic, flags; int i, rv = NO_ERROR; - if (!get_ldev_if_state(mdev, D_ATTACHING)) - return ERR_IO_MD_DISK; + if (mdev->state.disk != D_DISKLESS) + return ERR_DISK_CONFIGURED; buffer = drbd_md_get_buffer(mdev); if (!buffer) - goto out; + return ERR_NOMEM; + + /* First, figure out where our meta data superblock is located, + * and read it. */ + bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -2948,45 +3118,51 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = ERR_MD_UNCLEAN; goto err; } + + rv = ERR_MD_INVALID; if (magic != DRBD_MD_MAGIC_08) { if (magic == DRBD_MD_MAGIC_07) dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); else dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); - rv = ERR_MD_INVALID; goto err; } - if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { - dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", - be32_to_cpu(buffer->al_offset), bdev->md.al_offset); - rv = ERR_MD_INVALID; + + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); goto err; } + + + /* convert to in_core endian */ + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); + bdev->md.al_offset = be32_to_cpu(buffer->al_offset); + bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); + + if (check_activity_log_stripe_size(mdev, buffer, &bdev->md)) + goto err; + if (check_offsets_and_sizes(mdev, bdev)) + goto err; + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); - rv = ERR_MD_INVALID; goto err; } if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { dev_err(DEV, "unexpected md_size: %u (expected %u)\n", be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); - rv = ERR_MD_INVALID; goto err; } - if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { - dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", - be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); - rv = ERR_MD_INVALID; - goto err; - } - - bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); - for (i = UI_CURRENT; i < UI_SIZE; i++) - bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); - bdev->md.flags = be32_to_cpu(buffer->flags); - bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + rv = NO_ERROR; spin_lock_irq(&mdev->tconn->req_lock); if (mdev->state.conn < C_CONNECTED) { @@ -2999,8 +3175,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) err: drbd_md_put_buffer(mdev); - out: - put_ldev(mdev); return rv; } @@ -3238,8 +3412,12 @@ static int w_go_diskless(struct drbd_work *w, int unused) * end up here after a failed attach, before ldev was even assigned. */ if (mdev->bitmap && mdev->ldev) { + /* An interrupted resync or similar is allowed to recounts bits + * while we detach. + * Any modifications would not be expected anymore, though. + */ if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write, - "detach", BM_LOCKED_MASK)) { + "detach", BM_LOCKED_TEST_ALLOWED)) { if (test_bit(WAS_READ_ERROR, &mdev->flags)) { drbd_md_set_flag(mdev, MDF_FULL_SYNC); drbd_md_sync(mdev); @@ -3251,13 +3429,6 @@ static int w_go_diskless(struct drbd_work *w, int unused) return 0; } -void drbd_go_diskless(struct drbd_conf *mdev) -{ - D_ASSERT(mdev->state.disk == D_FAILED); - if (!test_and_set_bit(GO_DISKLESS, &mdev->flags)) - drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless); -} - /** * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap * @mdev: DRBD device. diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 2af26fc9528..9e3f441e7e8 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -696,37 +696,52 @@ out: return 0; } -/* initializes the md.*_offset members, so we are able to find - * the on disk meta data */ +/* Initializes the md.*_offset members, so we are able to find + * the on disk meta data. + * + * We currently have two possible layouts: + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * Activity log size used to be fixed 32kB, + * but is about to become configurable. + */ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) { sector_t md_size_sect = 0; - int meta_dev_idx; + unsigned int al_size_sect = bdev->md.al_size_4k * 8; - rcu_read_lock(); - meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); - switch (meta_dev_idx) { + switch (bdev->md.meta_dev_idx) { default: /* v07 style fixed size indexed meta data */ - bdev->md.md_size_sect = MD_RESERVED_SECT; - bdev->md.md_offset = drbd_md_ss__(mdev, bdev); - bdev->md.al_offset = MD_AL_OFFSET; - bdev->md.bm_offset = MD_BM_OFFSET; + bdev->md.md_size_sect = MD_128MB_SECT; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_FLEX_EXT: /* just occupy the full device; unit: sectors */ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); - bdev->md.md_offset = 0; - bdev->md.al_offset = MD_AL_OFFSET; - bdev->md.bm_offset = MD_BM_OFFSET; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; break; case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: - bdev->md.md_offset = drbd_md_ss__(mdev, bdev); /* al size is still fixed */ - bdev->md.al_offset = -MD_AL_SECTORS; + bdev->md.al_offset = -al_size_sect; /* we need (slightly less than) ~ this much bitmap sectors: */ md_size_sect = drbd_get_capacity(bdev->backing_bdev); md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); @@ -735,14 +750,13 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, /* plus the "drbd meta data super block", * and the activity log; */ - md_size_sect += MD_BM_OFFSET; + md_size_sect += MD_4kB_SECT + al_size_sect; bdev->md.md_size_sect = md_size_sect; /* bitmap offset is adjusted by 'super' block size */ - bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; + bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; break; } - rcu_read_unlock(); } /* input size is expected to be in KB */ @@ -805,7 +819,7 @@ void drbd_resume_io(struct drbd_conf *mdev) enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size, u_size; + sector_t la_size_sect, u_size; sector_t size; char ppb[10]; @@ -828,7 +842,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds prev_first_sect = drbd_md_first_sector(mdev->ldev); prev_size = mdev->ldev->md.md_size_sect; - la_size = mdev->ldev->md.la_size_sect; + la_size_sect = mdev->ldev->md.la_size_sect; /* TODO: should only be some assert here, not (re)init... */ drbd_md_set_sector_offsets(mdev, mdev->ldev); @@ -864,7 +878,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds if (rv == dev_size_error) goto out; - la_size_changed = (la_size != mdev->ldev->md.la_size_sect); + la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) || prev_size != mdev->ldev->md.md_size_sect; @@ -886,9 +900,9 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds drbd_md_mark_dirty(mdev); } - if (size > la_size) + if (size > la_size_sect) rv = grew; - if (size < la_size) + if (size < la_size_sect) rv = shrunk; out: lc_unlock(mdev->act_log); @@ -903,7 +917,7 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, sector_t u_size, int assume_peer_has_space) { sector_t p_size = mdev->p_size; /* partner's disk size. */ - sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ + sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ sector_t m_size; /* my size */ sector_t size = 0; @@ -917,8 +931,8 @@ drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, if (p_size && m_size) { size = min_t(sector_t, p_size, m_size); } else { - if (la_size) { - size = la_size; + if (la_size_sect) { + size = la_size_sect; if (m_size && m_size < size) size = m_size; if (p_size && p_size < size) @@ -1127,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info) return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); } -static void enforce_disk_conf_limits(struct disk_conf *dc) +static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) { - if (dc->al_extents < DRBD_AL_EXTENTS_MIN) - dc->al_extents = DRBD_AL_EXTENTS_MIN; - if (dc->al_extents > DRBD_AL_EXTENTS_MAX) - dc->al_extents = DRBD_AL_EXTENTS_MAX; + /* This is limited by 16 bit "slot" numbers, + * and by available on-disk context storage. + * + * Also (u16)~0 is special (denotes a "free" extent). + * + * One transaction occupies one 4kB on-disk block, + * we have n such blocks in the on disk ring buffer, + * the "current" transaction may fail (n-1), + * and there is 919 slot numbers context information per transaction. + * + * 72 transaction blocks amounts to more than 2**16 context slots, + * so cap there first. + */ + const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; + const unsigned int sufficient_on_disk = + (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) + /AL_CONTEXT_PER_TRANSACTION; + + unsigned int al_size_4k = bdev->md.al_size_4k; + + if (al_size_4k > sufficient_on_disk) + return max_al_nr; - if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) - dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; } int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) @@ -1182,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (!expect(new_disk_conf->resync_rate >= 1)) new_disk_conf->resync_rate = 1; - enforce_disk_conf_limits(new_disk_conf); + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev)) + new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev); + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; if (fifo_size != mdev->rs_plan_s->size) { @@ -1330,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - enforce_disk_conf_limits(new_disk_conf); + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); if (!new_plan) { @@ -1343,6 +1381,12 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after); + write_unlock_irq(&global_state_lock); + if (retcode != NO_ERROR) + goto fail; + rcu_read_lock(); nc = rcu_dereference(mdev->tconn->net_conf); if (nc) { @@ -1399,8 +1443,16 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ - drbd_md_set_sector_offsets(mdev, nbc); + /* Read our meta data super block early. + * This also sets other on-disk offsets. */ + retcode = drbd_md_read(mdev, nbc); + if (retcode != NO_ERROR) + goto fail; + + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) + new_disk_conf->al_extents = drbd_al_extents_max(nbc); if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", @@ -1416,7 +1468,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) min_md_device_sectors = (2<<10); } else { max_possible_sectors = DRBD_MAX_SECTORS; - min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1); + min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { @@ -1467,8 +1519,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (!get_ldev_if_state(mdev, D_ATTACHING)) goto force_diskless; - drbd_md_set_sector_offsets(mdev, nbc); - if (!mdev->bitmap) { if (drbd_bm_init(mdev)) { retcode = ERR_NOMEM; @@ -1476,10 +1526,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } } - retcode = drbd_md_read(mdev, nbc); - if (retcode != NO_ERROR) - goto force_diskless_dec; - if (mdev->state.conn < C_CONNECTED && mdev->state.role == R_PRIMARY && (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { @@ -2158,8 +2204,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for return SS_SUCCESS; case SS_PRIMARY_NOP: /* Our state checking code wants to see the peer outdated. */ - rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, - pdsk, D_OUTDATED), CS_VERBOSE); + rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); + + if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ + rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE); + break; case SS_CW_FAILED_BY_PEER: /* The peer probably wants to see us outdated. */ @@ -2406,22 +2455,19 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); drbd_flush_workqueue(mdev); - retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); - - if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) - retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); - - while (retcode == SS_NEED_CONNECTION) { - spin_lock_irq(&mdev->tconn->req_lock); - if (mdev->state.conn < C_CONNECTED) - retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); - spin_unlock_irq(&mdev->tconn->req_lock); - - if (retcode != SS_NEED_CONNECTION) - break; - + /* If we happen to be C_STANDALONE R_SECONDARY, just change to + * D_INCONSISTENT, and set all bits in the bitmap. Otherwise, + * try to start a resync handshake as sync target for full sync. + */ + if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_SECONDARY) { + retcode = drbd_request_state(mdev, NS(disk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, + "set_n_write from invalidate", BM_LOCKED_MASK)) + retcode = ERR_IO_MD_DISK; + } + } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); - } drbd_resume_io(mdev); out: @@ -2475,21 +2521,22 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); drbd_flush_workqueue(mdev); - retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); - if (retcode < SS_SUCCESS) { - if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) { - /* The peer will get a resync upon connect anyways. - * Just make that into a full resync. */ - retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); - if (retcode >= SS_SUCCESS) { - if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, - "set_n_write from invalidate_peer", - BM_LOCKED_SET_ALLOWED)) - retcode = ERR_IO_MD_DISK; - } - } else - retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); - } + /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits + * in the bitmap. Otherwise, try to start a resync handshake + * as sync source for full sync. + */ + if (mdev->state.conn == C_STANDALONE && mdev->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. */ + retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al, + "set_n_write from invalidate_peer", + BM_LOCKED_SET_ALLOWED)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(mdev); out: @@ -3162,6 +3209,7 @@ static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev) CS_VERBOSE + CS_WAIT_COMPLETE); idr_remove(&mdev->tconn->volumes, mdev->vnr); idr_remove(&minors, mdev_to_minor(mdev)); + destroy_workqueue(mdev->submit.wq); del_gendisk(mdev->vdisk); synchronize_rcu(); kref_put(&mdev->kref, &drbd_minor_destroy); diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 928adb815b0..bf31d41dbaa 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -313,8 +313,14 @@ static int drbd_seq_show(struct seq_file *seq, void *v) static int drbd_proc_open(struct inode *inode, struct file *file) { - if (try_module_get(THIS_MODULE)) - return single_open(file, drbd_seq_show, PDE_DATA(inode)); + int err; + + if (try_module_get(THIS_MODULE)) { + err = single_open(file, drbd_seq_show, PDE_DATA(inode)); + if (err) + module_put(THIS_MODULE); + return err; + } return -ENODEV; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 83c5ae0ed56..4222affff48 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -850,6 +850,7 @@ int drbd_connected(struct drbd_conf *mdev) err = drbd_send_current_state(mdev); clear_bit(USE_DEGR_WFC_T, &mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); + atomic_set(&mdev->ap_in_flight, 0); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return err; } @@ -2266,7 +2267,7 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size); peer_req->flags |= EE_CALL_AL_COMPLETE_IO; peer_req->flags &= ~EE_MAY_SET_IN_SYNC; - drbd_al_begin_io(mdev, &peer_req->i); + drbd_al_begin_io(mdev, &peer_req->i, true); } err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR); @@ -2662,7 +2663,6 @@ static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) if (hg == -1 && mdev->state.role == R_PRIMARY) { enum drbd_state_rv rv2; - drbd_set_role(mdev, R_SECONDARY, 0); /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, * we might be here in C_WF_REPORT_PARAMS which is transient. * we do not need to wait for the after state change work either. */ @@ -3993,7 +3993,7 @@ static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi) clear_bit(DISCARD_MY_DATA, &mdev->flags); - drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ + drbd_md_sync(mdev); /* update connected indicator, la_size_sect, ... */ return 0; } @@ -4660,8 +4660,8 @@ static int drbd_do_features(struct drbd_tconn *tconn) #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) static int drbd_do_auth(struct drbd_tconn *tconn) { - dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); - dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + conn_err(tconn, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + conn_err(tconn, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); return -1; } #else @@ -5258,9 +5258,11 @@ int drbd_asender(struct drbd_thread *thi) bool ping_timeout_active = false; struct net_conf *nc; int ping_timeo, tcp_cork, ping_int; + struct sched_param param = { .sched_priority = 2 }; - current->policy = SCHED_RR; /* Make this a realtime task! */ - current->rt_priority = 2; /* more important than all other tasks */ + rv = sched_setscheduler(current, SCHED_RR, ¶m); + if (rv < 0) + conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2b8303ad63c..c24379ffd4e 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -34,14 +34,14 @@ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size); /* Update disk stats at start of I/O request */ -static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) +static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req) { - const int rw = bio_data_dir(bio); + const int rw = bio_data_dir(req->master_bio); int cpu; cpu = part_stat_lock(); part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); - part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); + part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], req->i.size >> 9); (void) cpu; /* The macro invocations above want the cpu argument, I do not like the compiler warning about cpu only assigned but never used... */ part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -263,8 +263,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) else root = &mdev->read_requests; drbd_remove_request_interval(root, req); - } else if (!(s & RQ_POSTPONED)) - D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); + } /* Before we can signal completion to the upper layers, * we may need to close the current transfer log epoch. @@ -755,6 +754,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(req->rq_state & RQ_NET_PENDING); mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); break; + + case QUEUE_AS_DRBD_BARRIER: + start_new_tl_epoch(mdev->tconn); + mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); + break; }; return rv; @@ -861,8 +865,10 @@ static void maybe_pull_ahead(struct drbd_conf *mdev) bool congested = false; enum drbd_on_congestion on_congestion; + rcu_read_lock(); nc = rcu_dereference(tconn->net_conf); on_congestion = nc ? nc->on_congestion : OC_BLOCK; + rcu_read_unlock(); if (on_congestion == OC_BLOCK || tconn->agreed_pro_version < 96) return; @@ -956,14 +962,8 @@ static int drbd_process_write_request(struct drbd_request *req) struct drbd_conf *mdev = req->w.mdev; int remote, send_oos; - rcu_read_lock(); remote = drbd_should_do_remote(mdev->state); - if (remote) { - maybe_pull_ahead(mdev); - remote = drbd_should_do_remote(mdev->state); - } send_oos = drbd_should_send_out_of_sync(mdev->state); - rcu_read_unlock(); /* Need to replicate writes. Unless it is an empty flush, * which is better mapped to a DRBD P_BARRIER packet, @@ -975,8 +975,8 @@ static int drbd_process_write_request(struct drbd_request *req) /* The only size==0 bios we expect are empty flushes. */ D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH); if (remote) - start_new_tl_epoch(mdev->tconn); - return 0; + _req_mod(req, QUEUE_AS_DRBD_BARRIER); + return remote; } if (!remote && !send_oos) @@ -1020,12 +1020,24 @@ drbd_submit_req_private_bio(struct drbd_request *req) bio_endio(bio, -EIO); } -void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +static void drbd_queue_write(struct drbd_conf *mdev, struct drbd_request *req) { - const int rw = bio_rw(bio); - struct bio_and_error m = { NULL, }; + spin_lock(&mdev->submit.lock); + list_add_tail(&req->tl_requests, &mdev->submit.writes); + spin_unlock(&mdev->submit.lock); + queue_work(mdev->submit.wq, &mdev->submit.worker); +} + +/* returns the new drbd_request pointer, if the caller is expected to + * drbd_send_and_submit() it (to save latency), or NULL if we queued the + * request on the submitter thread. + * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. + */ +struct drbd_request * +drbd_request_prepare(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +{ + const int rw = bio_data_dir(bio); struct drbd_request *req; - bool no_remote = false; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -1035,7 +1047,7 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long * if user cannot handle io errors, that's not our business. */ dev_err(DEV, "could not kmalloc() req\n"); bio_endio(bio, -ENOMEM); - return; + return ERR_PTR(-ENOMEM); } req->start_time = start_time; @@ -1044,28 +1056,40 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long req->private_bio = NULL; } - /* For WRITES going to the local disk, grab a reference on the target - * extent. This waits for any resync activity in the corresponding - * resync extent to finish, and, if necessary, pulls in the target - * extent into the activity log, which involves further disk io because - * of transactional on-disk meta data updates. - * Empty flushes don't need to go into the activity log, they can only - * flush data for pending writes which are already in there. */ + /* Update disk stats */ + _drbd_start_io_acct(mdev, req); + if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &mdev->flags)) { + if (!drbd_al_begin_io_fastpath(mdev, &req->i)) { + drbd_queue_write(mdev, req); + return NULL; + } req->rq_state |= RQ_IN_ACT_LOG; - drbd_al_begin_io(mdev, &req->i); } + return req; +} + +static void drbd_send_and_submit(struct drbd_conf *mdev, struct drbd_request *req) +{ + const int rw = bio_rw(req->master_bio); + struct bio_and_error m = { NULL, }; + bool no_remote = false; + spin_lock_irq(&mdev->tconn->req_lock); if (rw == WRITE) { /* This may temporarily give up the req_lock, * but will re-aquire it before it returns here. * Needs to be before the check on drbd_suspended() */ complete_conflicting_writes(req); + /* no more giving up req_lock from now on! */ + + /* check for congestion, and potentially stop sending + * full data updates, but start sending "dirty bits" only. */ + maybe_pull_ahead(mdev); } - /* no more giving up req_lock from now on! */ if (drbd_suspended(mdev)) { /* push back and retry: */ @@ -1078,9 +1102,6 @@ void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long goto out; } - /* Update disk stats */ - _drbd_start_io_acct(mdev, req, bio); - /* We fail READ/READA early, if we can not serve it. * We must do this before req is registered on any lists. * Otherwise, drbd_req_complete() will queue failed READ for retry. */ @@ -1137,7 +1158,116 @@ out: if (m.bio) complete_master_bio(mdev, &m); - return; +} + +void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) +{ + struct drbd_request *req = drbd_request_prepare(mdev, bio, start_time); + if (IS_ERR_OR_NULL(req)) + return; + drbd_send_and_submit(mdev, req); +} + +static void submit_fast_path(struct drbd_conf *mdev, struct list_head *incoming) +{ + struct drbd_request *req, *tmp; + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + const int rw = bio_data_dir(req->master_bio); + + if (rw == WRITE /* rw != WRITE should not even end up here! */ + && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &mdev->flags)) { + if (!drbd_al_begin_io_fastpath(mdev, &req->i)) + continue; + + req->rq_state |= RQ_IN_ACT_LOG; + } + + list_del_init(&req->tl_requests); + drbd_send_and_submit(mdev, req); + } +} + +static bool prepare_al_transaction_nonblock(struct drbd_conf *mdev, + struct list_head *incoming, + struct list_head *pending) +{ + struct drbd_request *req, *tmp; + int wake = 0; + int err; + + spin_lock_irq(&mdev->al_lock); + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + err = drbd_al_begin_io_nonblock(mdev, &req->i); + if (err == -EBUSY) + wake = 1; + if (err) + continue; + req->rq_state |= RQ_IN_ACT_LOG; + list_move_tail(&req->tl_requests, pending); + } + spin_unlock_irq(&mdev->al_lock); + if (wake) + wake_up(&mdev->al_wait); + + return !list_empty(pending); +} + +void do_submit(struct work_struct *ws) +{ + struct drbd_conf *mdev = container_of(ws, struct drbd_conf, submit.worker); + LIST_HEAD(incoming); + LIST_HEAD(pending); + struct drbd_request *req, *tmp; + + for (;;) { + spin_lock(&mdev->submit.lock); + list_splice_tail_init(&mdev->submit.writes, &incoming); + spin_unlock(&mdev->submit.lock); + + submit_fast_path(mdev, &incoming); + if (list_empty(&incoming)) + break; + + wait_event(mdev->al_wait, prepare_al_transaction_nonblock(mdev, &incoming, &pending)); + /* Maybe more was queued, while we prepared the transaction? + * Try to stuff them into this transaction as well. + * Be strictly non-blocking here, no wait_event, we already + * have something to commit. + * Stop if we don't make any more progres. + */ + for (;;) { + LIST_HEAD(more_pending); + LIST_HEAD(more_incoming); + bool made_progress; + + /* It is ok to look outside the lock, + * it's only an optimization anyways */ + if (list_empty(&mdev->submit.writes)) + break; + + spin_lock(&mdev->submit.lock); + list_splice_tail_init(&mdev->submit.writes, &more_incoming); + spin_unlock(&mdev->submit.lock); + + if (list_empty(&more_incoming)) + break; + + made_progress = prepare_al_transaction_nonblock(mdev, &more_incoming, &more_pending); + + list_splice_tail_init(&more_pending, &pending); + list_splice_tail_init(&more_incoming, &incoming); + + if (!made_progress) + break; + } + drbd_al_begin_io_commit(mdev, false); + + list_for_each_entry_safe(req, tmp, &pending, tl_requests) { + list_del_init(&req->tl_requests); + drbd_send_and_submit(mdev, req); + } + } } void drbd_make_request(struct request_queue *q, struct bio *bio) diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index c08d22964d0..978cb1addc9 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -88,6 +88,14 @@ enum drbd_req_event { QUEUE_FOR_NET_READ, QUEUE_FOR_SEND_OOS, + /* An empty flush is queued as P_BARRIER, + * which will cause it to complete "successfully", + * even if the local disk flush failed. + * + * Just like "real" requests, empty flushes (blkdev_issue_flush()) will + * only see an error if neither local nor remote data is reachable. */ + QUEUE_AS_DRBD_BARRIER, + SEND_CANCELED, SEND_FAILED, HANDED_OVER_TO_NETWORK, diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 0fe220cfb9e..90c5be2b1d3 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -570,6 +570,13 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns) mdev->tconn->agreed_pro_version < 88) rv = SS_NOT_SUPPORTED; + else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + ns.pdsk == D_UNKNOWN) + rv = SS_NEED_CONNECTION; + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) rv = SS_CONNECTED_OUTDATES; @@ -635,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t && os.conn < C_WF_REPORT_PARAMS) rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && + os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) + rv = SS_OUTDATE_WO_CONN; + return rv; } @@ -1377,13 +1388,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); - /* We are invalidating our self... */ - if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && - os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) - /* other bitmap operation expected during this phase */ - drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, - "set_n_write from invalidate", BM_LOCKED_MASK); - /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { @@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) return SS_CW_FAILED_BY_PEER; - rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; - - if (rv == SS_UNKNOWN_ERROR) - rv = conn_is_valid_transition(tconn, mask, val, 0); - - if (rv == SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + rv = conn_is_valid_transition(tconn, mask, val, 0); + if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS) + rv = SS_UNKNOWN_ERROR; /* continue waiting */ return rv; } diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 9a664bd2740..58e08ff2b2c 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", + [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", }; diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 424dc7bdf9b..891c0ecaa29 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -89,7 +89,8 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io->done = 1; wake_up(&mdev->misc_wait); bio_put(bio); - put_ldev(mdev); + if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ + put_ldev(mdev); } /* reads on behalf of the partner, @@ -1410,7 +1411,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) struct drbd_conf *mdev = w->mdev; if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) - drbd_al_begin_io(mdev, &req->i); + drbd_al_begin_io(mdev, &req->i, false); drbd_req_make_private_bio(req, req->master_bio); req->private_bio->bi_bdev = mdev->ldev->backing_bdev; @@ -1425,7 +1426,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) int resync_after; while (1) { - if (!odev->ldev) + if (!odev->ldev || odev->state.disk == D_DISKLESS) return 1; rcu_read_lock(); resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; @@ -1433,7 +1434,7 @@ static int _drbd_may_sync_now(struct drbd_conf *mdev) if (resync_after == -1) return 1; odev = minor_to_mdev(resync_after); - if (!expect(odev)) + if (!odev) return 1; if ((odev->state.conn >= C_SYNC_SOURCE && odev->state.conn <= C_PAUSED_SYNC_T) || @@ -1515,7 +1516,7 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) if (o_minor == -1) return NO_ERROR; - if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) + if (o_minor < -1 || o_minor > MINORMASK) return ERR_RESYNC_AFTER; /* check for loops */ @@ -1524,6 +1525,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor) if (odev == mdev) return ERR_RESYNC_AFTER_CYCLE; + /* You are free to depend on diskless, non-existing, + * or not yet/no longer existing minors. + * We only reject dependency loops. + * We cannot follow the dependency chain beyond a detached or + * missing minor. + */ + if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS) + return NO_ERROR; + rcu_read_lock(); resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; rcu_read_unlock(); @@ -1652,7 +1662,9 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) clear_bit(B_RS_H_DONE, &mdev->flags); write_lock_irq(&global_state_lock); - if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + /* Did some connection breakage or IO error race with us? */ + if (mdev->state.conn < C_CONNECTED + || !get_ldev_if_state(mdev, D_NEGOTIATING)) { write_unlock_irq(&global_state_lock); mutex_unlock(mdev->state_mutex); return; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 076ae7f1b78..a56cfcd5d64 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -780,6 +780,7 @@ static const struct block_device_operations mg_disk_ops = { .getgeo = mg_getgeo }; +#ifdef CONFIG_PM_SLEEP static int mg_suspend(struct device *dev) { struct mg_drv_data *prv_data = dev->platform_data; @@ -824,6 +825,7 @@ static int mg_resume(struct device *dev) return 0; } +#endif static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 32c678028e5..847107ef0cc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -728,7 +728,10 @@ static void mtip_async_complete(struct mtip_port *port, atomic_set(&port->commands[tag].active, 0); release_slot(port, tag); - up(&port->cmd_slot); + if (unlikely(command->unaligned)) + up(&port->cmd_slot_unal); + else + up(&port->cmd_slot); } /* @@ -1560,10 +1563,12 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) } #endif +#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */ /* Demux ID.DRAT & ID.RZAT to determine trim support */ if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) port->dd->trim_supp = true; else +#endif port->dd->trim_supp = false; /* Set the identify buffer as valid. */ @@ -2557,7 +2562,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, */ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, int nsect, int nents, int tag, void *callback, - void *data, int dir) + void *data, int dir, int unaligned) { struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; @@ -2570,6 +2575,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, command->scatter_ents = nents; + command->unaligned = unaligned; /* * The number of retries for this command before it is * reported as a failure to the upper layers. @@ -2598,6 +2604,9 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, fis->res3 = 0; fill_command_sg(dd, command, nents); + if (unaligned) + fis->device |= 1 << 7; + /* Populate the command header */ command->command_header->opts = __force_bit2int cpu_to_le32( @@ -2644,9 +2653,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, * return value * None */ -static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) +static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, + int unaligned) { + struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : + &dd->port->cmd_slot; release_slot(dd->port, tag); + up(sem); } /* @@ -2661,22 +2674,25 @@ static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag) * or NULL if no command slots are available. */ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, - int *tag) + int *tag, int unaligned) { + struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal : + &dd->port->cmd_slot; + /* * It is possible that, even with this semaphore, a thread * may think that no command slots are available. Therefore, we * need to make an attempt to get_slot(). */ - down(&dd->port->cmd_slot); + down(sem); *tag = get_slot(dd->port); if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { - up(&dd->port->cmd_slot); + up(sem); return NULL; } if (unlikely(*tag < 0)) { - up(&dd->port->cmd_slot); + up(sem); return NULL; } @@ -3010,6 +3026,11 @@ static inline void hba_setup(struct driver_data *dd) dd->mmio + HOST_HSORG); } +static int mtip_device_unaligned_constrained(struct driver_data *dd) +{ + return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0); +} + /* * Detect the details of the product, and store anything needed * into the driver data structure. This includes product type and @@ -3232,8 +3253,15 @@ static int mtip_hw_init(struct driver_data *dd) for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) dd->work[i].port = dd->port; + /* Enable unaligned IO constraints for some devices */ + if (mtip_device_unaligned_constrained(dd)) + dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS; + else + dd->unal_qdepth = 0; + /* Counting semaphore to track command slot usage */ - sema_init(&dd->port->cmd_slot, num_command_slots - 1); + sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth); + sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); /* Spinlock to prevent concurrent issue */ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) @@ -3836,7 +3864,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) struct scatterlist *sg; struct bio_vec *bvec; int nents = 0; - int tag = 0; + int tag = 0, unaligned = 0; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, @@ -3872,7 +3900,15 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) return; } - sg = mtip_hw_get_scatterlist(dd, &tag); + if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && + dd->unal_qdepth) { + if (bio->bi_sector % 8 != 0) /* Unaligned on 4k boundaries */ + unaligned = 1; + else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ + unaligned = 1; + } + + sg = mtip_hw_get_scatterlist(dd, &tag, unaligned); if (likely(sg != NULL)) { blk_queue_bounce(queue, &bio); @@ -3880,7 +3916,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) dev_warn(&dd->pdev->dev, "Maximum number of SGL entries exceeded\n"); bio_io_error(bio); - mtip_hw_release_scatterlist(dd, tag); + mtip_hw_release_scatterlist(dd, tag, unaligned); return; } @@ -3900,7 +3936,8 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) tag, bio_endio, bio, - bio_data_dir(bio)); + bio_data_dir(bio), + unaligned); } else bio_io_error(bio); } @@ -4156,26 +4193,24 @@ static int mtip_block_remove(struct driver_data *dd) */ static int mtip_block_shutdown(struct driver_data *dd) { - dev_info(&dd->pdev->dev, - "Shutting down %s ...\n", dd->disk->disk_name); - /* Delete our gendisk structure, and cleanup the blk queue. */ if (dd->disk) { - if (dd->disk->queue) + dev_info(&dd->pdev->dev, + "Shutting down %s ...\n", dd->disk->disk_name); + + if (dd->disk->queue) { del_gendisk(dd->disk); - else + blk_cleanup_queue(dd->queue); + } else put_disk(dd->disk); + dd->disk = NULL; + dd->queue = NULL; } - spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, dd->index); spin_unlock(&rssd_index_lock); - blk_cleanup_queue(dd->queue); - dd->disk = NULL; - dd->queue = NULL; - mtip_hw_shutdown(dd); return 0; } diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 8e8334c9dd0..3bb8a295fbe 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -52,6 +52,9 @@ #define MTIP_FTL_REBUILD_MAGIC 0xED51 #define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 +/* unaligned IO handling */ +#define MTIP_MAX_UNALIGNED_SLOTS 8 + /* Macro to extract the tag bit number from a tag value. */ #define MTIP_TAG_BIT(tag) (tag & 0x1F) @@ -333,6 +336,8 @@ struct mtip_cmd { int scatter_ents; /* Number of scatter list entries used */ + int unaligned; /* command is unaligned on 4k boundary */ + struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ int retries; /* The number of retries left for this command. */ @@ -452,6 +457,10 @@ struct mtip_port { * command slots available. */ struct semaphore cmd_slot; + + /* Semaphore to control queue depth of unaligned IOs */ + struct semaphore cmd_slot_unal; + /* Spinlock for working around command-issue bug. */ spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; }; @@ -502,6 +511,8 @@ struct driver_data { int isr_binding; + int unal_qdepth; /* qdepth of unaligned IO queue */ + struct list_head online_list; /* linkage for online list */ struct list_head remove_list; /* linkage for removing list */ diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4d8d90b4fe7..3bfc8f1da9f 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -174,6 +174,8 @@ config MD_FAULTY In unsure, say N. +source "drivers/md/bcache/Kconfig" + config BLK_DEV_DM tristate "Device mapper support" ---help--- diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 7ceeaefc0e9..1439fd4ad9b 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o +obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig new file mode 100644 index 00000000000..05c220d05e2 --- /dev/null +++ b/drivers/md/bcache/Kconfig @@ -0,0 +1,42 @@ + +config BCACHE + tristate "Block device as cache" + select CLOSURES + ---help--- + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. + + See Documentation/bcache.txt for details. + +config BCACHE_DEBUG + bool "Bcache debugging" + depends on BCACHE + ---help--- + Don't select this option unless you're a developer + + Enables extra debugging tools (primarily a fuzz tester) + +config BCACHE_EDEBUG + bool "Extended runtime checks" + depends on BCACHE + ---help--- + Don't select this option unless you're a developer + + Enables extra runtime checks which significantly affect performance + +config BCACHE_CLOSURES_DEBUG + bool "Debug closures" + depends on BCACHE + select DEBUG_FS + ---help--- + Keeps all active closures in a linked list and provides a debugfs + interface to list them, which makes it possible to see asynchronous + operations that get stuck. + +# cgroup code needs to be updated: +# +#config CGROUP_BCACHE +# bool "Cgroup controls for bcache" +# depends on BCACHE && BLK_CGROUP +# ---help--- +# TODO diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile new file mode 100644 index 00000000000..0e9c82523be --- /dev/null +++ b/drivers/md/bcache/Makefile @@ -0,0 +1,7 @@ + +obj-$(CONFIG_BCACHE) += bcache.o + +bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ + movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o + +CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c new file mode 100644 index 00000000000..048f2947e08 --- /dev/null +++ b/drivers/md/bcache/alloc.c @@ -0,0 +1,599 @@ +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * There is another freelist, because sometimes we have buckets that we know + * have nothing pointing into them - these we can reuse without waiting for + * priorities to be rewritten. These come from freed btree nodes and buckets + * that garbage collection discovered no longer had valid keys pointing into + * them (because they were overwritten). That's the unused list - buckets on the + * unused list move to the free list, optionally being discarded in the process. + * + * It's also important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch_bucket_alloc() allocates a single bucket from a specific cache. + * + * bch_bucket_alloc_set() allocates one or more buckets from different caches + * out of a cache set. + * + * free_some_buckets() drives all the processes described above. It's called + * from bch_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. + */ + +#include "bcache.h" +#include "btree.h" + +#include <linux/random.h> + +#define MAX_IN_FLIGHT_DISCARDS 8U + +/* Bucket heap / gen */ + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) +{ + uint8_t ret = ++b->gen; + + ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); + WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); + + if (CACHE_SYNC(&ca->set->sb)) { + ca->need_save_prio = max(ca->need_save_prio, + bucket_disk_gen(b)); + WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); + } + + return ret; +} + +void bch_rescale_priorities(struct cache_set *c, int sectors) +{ + struct cache *ca; + struct bucket *b; + unsigned next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned i; + int r; + + atomic_sub(sectors, &c->rescale); + + do { + r = atomic_read(&c->rescale); + + if (r >= 0) + return; + } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); + + mutex_lock(&c->bucket_lock); + + c->min_prio = USHRT_MAX; + + for_each_cache(ca, c, i) + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } + + mutex_unlock(&c->bucket_lock); +} + +/* Discard/TRIM */ + +struct discard { + struct list_head list; + struct work_struct work; + struct cache *ca; + long bucket; + + struct bio bio; + struct bio_vec bv; +}; + +static void discard_finish(struct work_struct *w) +{ + struct discard *d = container_of(w, struct discard, work); + struct cache *ca = d->ca; + char buf[BDEVNAME_SIZE]; + + if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { + pr_notice("discard error on %s, disabling", + bdevname(ca->bdev, buf)); + d->ca->discard = 0; + } + + mutex_lock(&ca->set->bucket_lock); + + fifo_push(&ca->free, d->bucket); + list_add(&d->list, &ca->discards); + atomic_dec(&ca->discards_in_flight); + + mutex_unlock(&ca->set->bucket_lock); + + closure_wake_up(&ca->set->bucket_wait); + wake_up(&ca->set->alloc_wait); + + closure_put(&ca->set->cl); +} + +static void discard_endio(struct bio *bio, int error) +{ + struct discard *d = container_of(bio, struct discard, bio); + schedule_work(&d->work); +} + +static void do_discard(struct cache *ca, long bucket) +{ + struct discard *d = list_first_entry(&ca->discards, + struct discard, list); + + list_del(&d->list); + d->bucket = bucket; + + atomic_inc(&ca->discards_in_flight); + closure_get(&ca->set->cl); + + bio_init(&d->bio); + + d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); + d->bio.bi_bdev = ca->bdev; + d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; + d->bio.bi_max_vecs = 1; + d->bio.bi_io_vec = d->bio.bi_inline_vecs; + d->bio.bi_size = bucket_bytes(ca); + d->bio.bi_end_io = discard_endio; + bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + submit_bio(0, &d->bio); +} + +/* Allocation */ + +static inline bool can_inc_bucket_gen(struct bucket *b) +{ + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && + bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; +} + +bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) +{ + BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); + + if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && + CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) + return false; + + b->prio = 0; + + if (can_inc_bucket_gen(b) && + fifo_push(&ca->unused, b - ca->buckets)) { + atomic_inc(&b->pin); + return true; + } + + return false; +} + +static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) +{ + return GC_MARK(b) == GC_MARK_RECLAIMABLE && + !atomic_read(&b->pin) && + can_inc_bucket_gen(b); +} + +static void invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + bch_inc_gen(ca, b); + b->prio = INITIAL_PRIO; + atomic_inc(&b->pin); + fifo_push(&ca->free_inc, b - ca->buckets); +} + +#define bucket_prio(b) \ + (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) + +#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) +#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) + +static void invalidate_buckets_lru(struct cache *ca) +{ + struct bucket *b; + ssize_t i; + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + /* + * If we fill up the unused list, if we then return before + * adding anything to the free_inc list we'll skip writing + * prios/gens and just go back to allocating from the unused + * list: + */ + if (fifo_full(&ca->unused)) + return; + + if (!can_invalidate_bucket(ca, b)) + continue; + + if (!GC_SECTORS_USED(b) && + bch_bucket_add_unused(ca, b)) + continue; + + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_max_cmp); + } + } + + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); + + while (!fifo_full(&ca->free_inc)) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { + /* + * We don't want to be calling invalidate_buckets() + * multiple times when it can't do anything + */ + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + + invalidate_one_bucket(ca, b); + } +} + +static void invalidate_buckets_fifo(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + if (ca->fifo_last_bucket < ca->sb.first_bucket || + ca->fifo_last_bucket >= ca->sb.nbuckets) + ca->fifo_last_bucket = ca->sb.first_bucket; + + b = ca->buckets + ca->fifo_last_bucket++; + + if (can_invalidate_bucket(ca, b)) + invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets) { + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets_random(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + size_t n; + get_random_bytes(&n, sizeof(n)); + + n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); + n += ca->sb.first_bucket; + + b = ca->buckets + n; + + if (can_invalidate_bucket(ca, b)) + invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets / 2) { + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets(struct cache *ca) +{ + if (ca->invalidate_needs_gc) + return; + + switch (CACHE_REPLACEMENT(&ca->sb)) { + case CACHE_REPLACEMENT_LRU: + invalidate_buckets_lru(ca); + break; + case CACHE_REPLACEMENT_FIFO: + invalidate_buckets_fifo(ca); + break; + case CACHE_REPLACEMENT_RANDOM: + invalidate_buckets_random(ca); + break; + } + + pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", + fifo_used(&ca->free), ca->free.size, + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->unused), ca->unused.size); +} + +#define allocator_wait(ca, cond) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + while (1) { \ + prepare_to_wait(&ca->set->alloc_wait, \ + &__wait, TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ + \ + mutex_unlock(&(ca)->set->bucket_lock); \ + if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ + finish_wait(&ca->set->alloc_wait, &__wait); \ + closure_return(cl); \ + } \ + \ + schedule(); \ + mutex_lock(&(ca)->set->bucket_lock); \ + } \ + \ + finish_wait(&ca->set->alloc_wait, &__wait); \ +} while (0) + +void bch_allocator_thread(struct closure *cl) +{ + struct cache *ca = container_of(cl, struct cache, alloc); + + mutex_lock(&ca->set->bucket_lock); + + while (1) { + /* + * First, we pull buckets off of the unused and free_inc lists, + * possibly issue discards to them, then we add the bucket to + * the free list: + */ + while (1) { + long bucket; + + if ((!atomic_read(&ca->set->prio_blocked) || + !CACHE_SYNC(&ca->set->sb)) && + !fifo_empty(&ca->unused)) + fifo_pop(&ca->unused, bucket); + else if (!fifo_empty(&ca->free_inc)) + fifo_pop(&ca->free_inc, bucket); + else + break; + + allocator_wait(ca, (int) fifo_free(&ca->free) > + atomic_read(&ca->discards_in_flight)); + + if (ca->discard) { + allocator_wait(ca, !list_empty(&ca->discards)); + do_discard(ca, bucket); + } else { + fifo_push(&ca->free, bucket); + closure_wake_up(&ca->set->bucket_wait); + } + } + + /* + * We've run out of free buckets, we need to find some buckets + * we can invalidate. First, invalidate them in memory and add + * them to the free_inc list: + */ + + allocator_wait(ca, ca->set->gc_mark_valid && + (ca->need_save_prio > 64 || + !ca->invalidate_needs_gc)); + invalidate_buckets(ca); + + /* + * Now, we write their new gens to disk so we can start writing + * new stuff to them: + */ + allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); + if (CACHE_SYNC(&ca->set->sb) && + (!fifo_empty(&ca->free_inc) || + ca->need_save_prio > 64)) + bch_prio_write(ca); + } +} + +long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +{ + long r = -1; +again: + wake_up(&ca->set->alloc_wait); + + if (fifo_used(&ca->free) > ca->watermark[watermark] && + fifo_pop(&ca->free, r)) { + struct bucket *b = ca->buckets + r; +#ifdef CONFIG_BCACHE_EDEBUG + size_t iter; + long i; + + for (iter = 0; iter < prio_buckets(ca) * 2; iter++) + BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); + + fifo_for_each(i, &ca->free, iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->free_inc, iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->unused, iter) + BUG_ON(i == r); +#endif + BUG_ON(atomic_read(&b->pin) != 1); + + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); + + if (watermark <= WATERMARK_METADATA) { + SET_GC_MARK(b, GC_MARK_METADATA); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + b->prio = INITIAL_PRIO; + } + + return r; + } + + pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", + atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), + fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + + if (cl) { + closure_wait(&ca->set->bucket_wait, cl); + + if (closure_blocking(cl)) { + mutex_unlock(&ca->set->bucket_lock); + closure_sync(cl); + mutex_lock(&ca->set->bucket_lock); + goto again; + } + } + + return -1; +} + +void bch_bucket_free(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) { + struct bucket *b = PTR_BUCKET(c, k, i); + + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_SECTORS_USED(b, 0); + bch_bucket_add_unused(PTR_CACHE(c, k, i), b); + } +} + +int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, + struct bkey *k, int n, struct closure *cl) +{ + int i; + + lockdep_assert_held(&c->bucket_lock); + BUG_ON(!n || n > c->caches_loaded || n > 8); + + bkey_init(k); + + /* sort by free space/prio of oldest data in caches */ + + for (i = 0; i < n; i++) { + struct cache *ca = c->cache_by_alloc[i]; + long b = bch_bucket_alloc(ca, watermark, cl); + + if (b == -1) + goto err; + + k->ptr[i] = PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + ca->sb.nr_this_dev); + + SET_KEY_PTRS(k, i + 1); + } + + return 0; +err: + bch_bucket_free(c, k); + __bkey_put(c, k); + return -1; +} + +int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, + struct bkey *k, int n, struct closure *cl) +{ + int ret; + mutex_lock(&c->bucket_lock); + ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); + mutex_unlock(&c->bucket_lock); + return ret; +} + +/* Init */ + +void bch_cache_allocator_exit(struct cache *ca) +{ + struct discard *d; + + while (!list_empty(&ca->discards)) { + d = list_first_entry(&ca->discards, struct discard, list); + cancel_work_sync(&d->work); + list_del(&d->list); + kfree(d); + } +} + +int bch_cache_allocator_init(struct cache *ca) +{ + unsigned i; + + /* + * Reserve: + * Prio/gen writes first + * Then 8 for btree allocations + * Then half for the moving garbage collector + */ + + ca->watermark[WATERMARK_PRIO] = 0; + + ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); + + ca->watermark[WATERMARK_MOVINGGC] = 8 + + ca->watermark[WATERMARK_METADATA]; + + ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + + ca->watermark[WATERMARK_MOVINGGC]; + + for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { + struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->ca = ca; + INIT_WORK(&d->work, discard_finish); + list_add(&d->list, &ca->discards); + } + + return 0; +} diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h new file mode 100644 index 00000000000..340146d7c17 --- /dev/null +++ b/drivers/md/bcache/bcache.h @@ -0,0 +1,1259 @@ +#ifndef _BCACHE_H +#define _BCACHE_H + +/* + * SOME HIGH LEVEL CODE DOCUMENTATION: + * + * Bcache mostly works with cache sets, cache devices, and backing devices. + * + * Support for multiple cache devices hasn't quite been finished off yet, but + * it's about 95% plumbed through. A cache set and its cache devices is sort of + * like a md raid array and its component devices. Most of the code doesn't care + * about individual cache devices, the main abstraction is the cache set. + * + * Multiple cache devices is intended to give us the ability to mirror dirty + * cached data and metadata, without mirroring clean cached data. + * + * Backing devices are different, in that they have a lifetime independent of a + * cache set. When you register a newly formatted backing device it'll come up + * in passthrough mode, and then you can attach and detach a backing device from + * a cache set at runtime - while it's mounted and in use. Detaching implicitly + * invalidates any cached data for that backing device. + * + * A cache set can have multiple (many) backing devices attached to it. + * + * There's also flash only volumes - this is the reason for the distinction + * between struct cached_dev and struct bcache_device. A flash only volume + * works much like a bcache device that has a backing device, except the + * "cached" data is always dirty. The end result is that we get thin + * provisioning with very little additional code. + * + * Flash only volumes work but they're not production ready because the moving + * garbage collector needs more work. More on that later. + * + * BUCKETS/ALLOCATION: + * + * Bcache is primarily designed for caching, which means that in normal + * operation all of our available space will be allocated. Thus, we need an + * efficient way of deleting things from the cache so we can write new things to + * it. + * + * To do this, we first divide the cache device up into buckets. A bucket is the + * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ + * works efficiently. + * + * Each bucket has a 16 bit priority, and an 8 bit generation associated with + * it. The gens and priorities for all the buckets are stored contiguously and + * packed on disk (in a linked list of buckets - aside from the superblock, all + * of bcache's metadata is stored in buckets). + * + * The priority is used to implement an LRU. We reset a bucket's priority when + * we allocate it or on cache it, and every so often we decrement the priority + * of each bucket. It could be used to implement something more sophisticated, + * if anyone ever gets around to it. + * + * The generation is used for invalidating buckets. Each pointer also has an 8 + * bit generation embedded in it; for a pointer to be considered valid, its gen + * must match the gen of the bucket it points into. Thus, to reuse a bucket all + * we have to do is increment its gen (and write its new gen to disk; we batch + * this up). + * + * Bcache is entirely COW - we never write twice to a bucket, even buckets that + * contain metadata (including btree nodes). + * + * THE BTREE: + * + * Bcache is in large part design around the btree. + * + * At a high level, the btree is just an index of key -> ptr tuples. + * + * Keys represent extents, and thus have a size field. Keys also have a variable + * number of pointers attached to them (potentially zero, which is handy for + * invalidating the cache). + * + * The key itself is an inode:offset pair. The inode number corresponds to a + * backing device or a flash only volume. The offset is the ending offset of the + * extent within the inode - not the starting offset; this makes lookups + * slightly more convenient. + * + * Pointers contain the cache device id, the offset on that device, and an 8 bit + * generation number. More on the gen later. + * + * Index lookups are not fully abstracted - cache lookups in particular are + * still somewhat mixed in with the btree code, but things are headed in that + * direction. + * + * Updates are fairly well abstracted, though. There are two different ways of + * updating the btree; insert and replace. + * + * BTREE_INSERT will just take a list of keys and insert them into the btree - + * overwriting (possibly only partially) any extents they overlap with. This is + * used to update the index after a write. + * + * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is + * overwriting a key that matches another given key. This is used for inserting + * data into the cache after a cache miss, and for background writeback, and for + * the moving garbage collector. + * + * There is no "delete" operation; deleting things from the index is + * accomplished by either by invalidating pointers (by incrementing a bucket's + * gen) or by inserting a key with 0 pointers - which will overwrite anything + * previously present at that location in the index. + * + * This means that there are always stale/invalid keys in the btree. They're + * filtered out by the code that iterates through a btree node, and removed when + * a btree node is rewritten. + * + * BTREE NODES: + * + * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. + * + * (If buckets are really big we'll only use part of the bucket for a btree node + * - no less than 1/4th - but a bucket still contains no more than a single + * btree node. I'd actually like to change this, but for now we rely on the + * bucket's gen for deleting btree nodes when we rewrite/split a node.) + * + * Anyways, btree nodes are big - big enough to be inefficient with a textbook + * btree implementation. + * + * The way this is solved is that btree nodes are internally log structured; we + * can append new keys to an existing btree node without rewriting it. This + * means each set of keys we write is sorted, but the node is not. + * + * We maintain this log structure in memory - keeping 1Mb of keys sorted would + * be expensive, and we have to distinguish between the keys we have written and + * the keys we haven't. So to do a lookup in a btree node, we have to search + * each sorted set. But we do merge written sets together lazily, so the cost of + * these extra searches is quite low (normally most of the keys in a btree node + * will be in one big set, and then there'll be one or two sets that are much + * smaller). + * + * This log structure makes bcache's btree more of a hybrid between a + * conventional btree and a compacting data structure, with some of the + * advantages of both. + * + * GARBAGE COLLECTION: + * + * We can't just invalidate any bucket - it might contain dirty data or + * metadata. If it once contained dirty data, other writes might overwrite it + * later, leaving no valid pointers into that bucket in the index. + * + * Thus, the primary purpose of garbage collection is to find buckets to reuse. + * It also counts how much valid data it each bucket currently contains, so that + * allocation can reuse buckets sooner when they've been mostly overwritten. + * + * It also does some things that are really internal to the btree + * implementation. If a btree node contains pointers that are stale by more than + * some threshold, it rewrites the btree node to avoid the bucket's generation + * wrapping around. It also merges adjacent btree nodes if they're empty enough. + * + * THE JOURNAL: + * + * Bcache's journal is not necessary for consistency; we always strictly + * order metadata writes so that the btree and everything else is consistent on + * disk in the event of an unclean shutdown, and in fact bcache had writeback + * caching (with recovery from unclean shutdown) before journalling was + * implemented. + * + * Rather, the journal is purely a performance optimization; we can't complete a + * write until we've updated the index on disk, otherwise the cache would be + * inconsistent in the event of an unclean shutdown. This means that without the + * journal, on random write workloads we constantly have to update all the leaf + * nodes in the btree, and those writes will be mostly empty (appending at most + * a few keys each) - highly inefficient in terms of amount of metadata writes, + * and it puts more strain on the various btree resorting/compacting code. + * + * The journal is just a log of keys we've inserted; on startup we just reinsert + * all the keys in the open journal entries. That means that when we're updating + * a node in the btree, we can wait until a 4k block of keys fills up before + * writing them out. + * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ + +#include <linux/bio.h> +#include <linux/blktrace_api.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "util.h" +#include "closure.h" + +struct bucket { + atomic_t pin; + uint16_t prio; + uint8_t gen; + uint8_t disk_gen; + uint8_t last_gc; /* Most out of date gen in the btree */ + uint8_t gc_gen; + uint16_t gc_mark; +}; + +/* + * I'd use bitfields for these, but I don't trust the compiler not to screw me + * as multiple threads touch struct bucket without locking + */ + +BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); +#define GC_MARK_RECLAIMABLE 0 +#define GC_MARK_DIRTY 1 +#define GC_MARK_METADATA 2 +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); + +struct bkey { + uint64_t high; + uint64_t low; + uint64_t ptr[]; +}; + +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb { + uint64_t csum; + uint64_t offset; /* sector where this sb was written */ + uint64_t version; + + uint8_t magic[16]; + + uint8_t uuid[16]; + union { + uint8_t set_uuid[16]; + uint64_t set_magic; + }; + uint8_t label[SB_LABEL_SIZE]; + + uint64_t flags; + uint64_t seq; + uint64_t pad[8]; + + union { + struct { + /* Cache devices */ + uint64_t nbuckets; /* device size */ + + uint16_t block_size; /* sectors */ + uint16_t bucket_size; /* sectors */ + + uint16_t nr_in_set; + uint16_t nr_this_dev; + }; + struct { + /* Backing devices */ + uint64_t data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + uint32_t last_mount; /* time_t */ + + uint16_t first_bucket; + union { + uint16_t njournal_buckets; + uint16_t keys; + }; + uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_VERSION 1 + +/* + * This is the on disk format for btree nodes - a btree node on disk is a list + * of these; within each set the keys are sorted + */ +struct bset { + uint64_t csum; + uint64_t magic; + uint64_t seq; + uint32_t version; + uint32_t keys; + + union { + struct bkey start[0]; + uint64_t d[0]; + }; +}; + +/* + * On disk format for priorities and gens - see super.c near prio_write() for + * more. + */ +struct prio_set { + uint64_t csum; + uint64_t magic; + uint64_t seq; + uint32_t version; + uint32_t pad; + + uint64_t next_bucket; + + struct bucket_disk { + uint16_t prio; + uint8_t gen; + } __attribute((packed)) data[]; +}; + +struct uuid_entry { + union { + struct { + uint8_t uuid[16]; + uint8_t label[32]; + uint32_t first_reg; + uint32_t last_reg; + uint32_t invalidated; + + uint32_t flags; + /* Size of flash only volumes */ + uint64_t sectors; + }; + + uint8_t pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + +#include "journal.h" +#include "stats.h" +struct search; +struct btree; +struct keybuf; + +struct keybuf_key { + struct rb_node node; + BKEY_PADDED(key); + void *private; +}; + +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); + +struct keybuf { + keybuf_pred_fn *key_predicate; + + struct bkey last_scanned; + spinlock_t lock; + + /* + * Beginning and end of range in rb tree - so that we can skip taking + * lock and checking the rb tree when we need to check for overlapping + * keys. + */ + struct bkey start; + struct bkey end; + + struct rb_root keys; + +#define KEYBUF_NR 100 + DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); +}; + +struct bio_split_pool { + struct bio_set *bio_split; + mempool_t *bio_split_hook; +}; + +struct bio_split_hook { + struct closure cl; + struct bio_split_pool *p; + struct bio *bio; + bio_end_io_t *bi_end_io; + void *bi_private; +}; + +struct bcache_device { + struct closure cl; + + struct kobject kobj; + + struct cache_set *c; + unsigned id; +#define BCACHEDEVNAME_SIZE 12 + char name[BCACHEDEVNAME_SIZE]; + + struct gendisk *disk; + + /* If nonzero, we're closing */ + atomic_t closing; + + /* If nonzero, we're detaching/unregistering from cache set */ + atomic_t detaching; + + atomic_long_t sectors_dirty; + unsigned long sectors_dirty_gc; + unsigned long sectors_dirty_last; + long sectors_dirty_derivative; + + mempool_t *unaligned_bvec; + struct bio_set *bio_split; + + unsigned data_csum:1; + + int (*cache_miss)(struct btree *, struct search *, + struct bio *, unsigned); + int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); + + struct bio_split_pool bio_split_hook; +}; + +struct io { + /* Used to track sequential IO so it can be skipped */ + struct hlist_node hash; + struct list_head lru; + + unsigned long jiffies; + unsigned sequential; + sector_t last; +}; + +struct cached_dev { + struct list_head list; + struct bcache_device disk; + struct block_device *bdev; + + struct cache_sb sb; + struct bio sb_bio; + struct bio_vec sb_bv[1]; + struct closure_with_waitlist sb_write; + + /* Refcount on the cache set. Always nonzero when we're caching. */ + atomic_t count; + struct work_struct detach; + + /* + * Device might not be running if it's dirty and the cache set hasn't + * showed up yet. + */ + atomic_t running; + + /* + * Writes take a shared lock from start to finish; scanning for dirty + * data to refill the rb tree requires an exclusive lock. + */ + struct rw_semaphore writeback_lock; + + /* + * Nonzero, and writeback has a refcount (d->count), iff there is dirty + * data in the cache. Protected by writeback_lock; must have an + * shared lock to set and exclusive lock to clear. + */ + atomic_t has_dirty; + + struct ratelimit writeback_rate; + struct delayed_work writeback_rate_update; + + /* + * Internal to the writeback code, so read_dirty() can keep track of + * where it's at. + */ + sector_t last_read; + + /* Number of writeback bios in flight */ + atomic_t in_flight; + struct closure_with_timer writeback; + struct closure_waitlist writeback_wait; + + struct keybuf writeback_keys; + + /* For tracking sequential IO */ +#define RECENT_IO_BITS 7 +#define RECENT_IO (1 << RECENT_IO_BITS) + struct io io[RECENT_IO]; + struct hlist_head io_hash[RECENT_IO + 1]; + struct list_head io_lru; + spinlock_t io_lock; + + struct cache_accounting accounting; + + /* The rest of this all shows up in sysfs */ + unsigned sequential_cutoff; + unsigned readahead; + + unsigned sequential_merge:1; + unsigned verify:1; + + unsigned writeback_metadata:1; + unsigned writeback_running:1; + unsigned char writeback_percent; + unsigned writeback_delay; + + int writeback_rate_change; + int64_t writeback_rate_derivative; + uint64_t writeback_rate_target; + + unsigned writeback_rate_update_seconds; + unsigned writeback_rate_d_term; + unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_d_smooth; +}; + +enum alloc_watermarks { + WATERMARK_PRIO, + WATERMARK_METADATA, + WATERMARK_MOVINGGC, + WATERMARK_NONE, + WATERMARK_MAX +}; + +struct cache { + struct cache_set *set; + struct cache_sb sb; + struct bio sb_bio; + struct bio_vec sb_bv[1]; + + struct kobject kobj; + struct block_device *bdev; + + unsigned watermark[WATERMARK_MAX]; + + struct closure alloc; + struct workqueue_struct *alloc_workqueue; + + struct closure prio; + struct prio_set *disk_buckets; + + /* + * When allocating new buckets, prio_write() gets first dibs - since we + * may not be allocate at all without writing priorities and gens. + * prio_buckets[] contains the last buckets we wrote priorities to (so + * gc can mark them as metadata), prio_next[] contains the buckets + * allocated for the next prio write. + */ + uint64_t *prio_buckets; + uint64_t *prio_last_buckets; + + /* + * free: Buckets that are ready to be used + * + * free_inc: Incoming buckets - these are buckets that currently have + * cached data in them, and we can't reuse them until after we write + * their new gen to disk. After prio_write() finishes writing the new + * gens/prios, they'll be moved to the free list (and possibly discarded + * in the process) + * + * unused: GC found nothing pointing into these buckets (possibly + * because all the data they contained was overwritten), so we only + * need to discard them before they can be moved to the free list. + */ + DECLARE_FIFO(long, free); + DECLARE_FIFO(long, free_inc); + DECLARE_FIFO(long, unused); + + size_t fifo_last_bucket; + + /* Allocation stuff: */ + struct bucket *buckets; + + DECLARE_HEAP(struct bucket *, heap); + + /* + * max(gen - disk_gen) for all buckets. When it gets too big we have to + * call prio_write() to keep gens from wrapping. + */ + uint8_t need_save_prio; + unsigned gc_move_threshold; + + /* + * If nonzero, we know we aren't going to find any buckets to invalidate + * until a gc finishes - otherwise we could pointlessly burn a ton of + * cpu + */ + unsigned invalidate_needs_gc:1; + + bool discard; /* Get rid of? */ + + /* + * We preallocate structs for issuing discards to buckets, and keep them + * on this list when they're not in use; do_discard() issues discards + * whenever there's work to do and is called by free_some_buckets() and + * when a discard finishes. + */ + atomic_t discards_in_flight; + struct list_head discards; + + struct journal_device journal; + + /* The rest of this all shows up in sysfs */ +#define IO_ERROR_SHIFT 20 + atomic_t io_errors; + atomic_t io_count; + + atomic_long_t meta_sectors_written; + atomic_long_t btree_sectors_written; + atomic_long_t sectors_written; + + struct bio_split_pool bio_split_hook; +}; + +struct gc_stat { + size_t nodes; + size_t key_bytes; + + size_t nkeys; + uint64_t data; /* sectors */ + uint64_t dirty; /* sectors */ + unsigned in_use; /* percent */ +}; + +/* + * Flag bits, for how the cache set is shutting down, and what phase it's at: + * + * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching + * all the backing devices first (their cached data gets invalidated, and they + * won't automatically reattach). + * + * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; + * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. + * flushing dirty data). + * + * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down + * the allocation thread. + */ +#define CACHE_SET_UNREGISTERING 0 +#define CACHE_SET_STOPPING 1 +#define CACHE_SET_STOPPING_2 2 + +struct cache_set { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject internal; + struct dentry *debug; + struct cache_accounting accounting; + + unsigned long flags; + + struct cache_sb sb; + + struct cache *cache[MAX_CACHES_PER_SET]; + struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; + int caches_loaded; + + struct bcache_device **devices; + struct list_head cached_devs; + uint64_t cached_dev_sectors; + struct closure caching; + + struct closure_with_waitlist sb_write; + + mempool_t *search; + mempool_t *bio_meta; + struct bio_set *bio_split; + + /* For the btree cache */ + struct shrinker shrink; + + /* For the allocator itself */ + wait_queue_head_t alloc_wait; + + /* For the btree cache and anything allocation related */ + struct mutex bucket_lock; + + /* log2(bucket_size), in sectors */ + unsigned short bucket_bits; + + /* log2(block_size), in sectors */ + unsigned short block_bits; + + /* + * Default number of pages for a new btree node - may be less than a + * full bucket + */ + unsigned btree_pages; + + /* + * Lists of struct btrees; lru is the list for structs that have memory + * allocated for actual btree node, freed is for structs that do not. + * + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. + * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. + */ + struct list_head btree_cache; + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; + + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned bucket_cache_used; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation. However, only one thread can be doing this + * at a time, for obvious reasons - try_harder and try_wait are + * basically a lock for this that we can wait on asynchronously. The + * btree_root() macro releases the lock when it returns. + */ + struct closure *try_harder; + struct closure_waitlist try_wait; + uint64_t try_harder_start; + + /* + * When we free a btree node, we increment the gen of the bucket the + * node is in - but we can't rewrite the prios and gens until we + * finished whatever it is we were doing, otherwise after a crash the + * btree node would be freed but for say a split, we might not have the + * pointers to the new nodes inserted into the btree yet. + * + * This is a refcount that blocks prio_write() until the new keys are + * written. + */ + atomic_t prio_blocked; + struct closure_waitlist bucket_wait; + + /* + * For any bio we don't skip we subtract the number of sectors from + * rescale; when it hits 0 we rescale all the bucket priorities. + */ + atomic_t rescale; + /* + * When we invalidate buckets, we use both the priority and the amount + * of good data to determine which buckets to reuse first - to weight + * those together consistently we keep track of the smallest nonzero + * priority of any bucket. + */ + uint16_t min_prio; + + /* + * max(gen - gc_gen) for all buckets. When it gets too big we have to gc + * to keep gens from wrapping around. + */ + uint8_t need_gc; + struct gc_stat gc_stats; + size_t nbuckets; + + struct closure_with_waitlist gc; + /* Where in the btree gc currently is */ + struct bkey gc_done; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. Protected by bucket_lock. + */ + int gc_mark_valid; + + /* Counts how many sectors bio_insert has added to the cache */ + atomic_t sectors_to_gc; + + struct closure moving_gc; + struct closure_waitlist moving_gc_wait; + struct keybuf moving_gc_keys; + /* Number of moving GC bios in flight */ + atomic_t in_flight; + + struct btree *root; + +#ifdef CONFIG_BCACHE_DEBUG + struct btree *verify_data; + struct mutex verify_lock; +#endif + + unsigned nr_uuids; + struct uuid_entry *uuids; + BKEY_PADDED(uuid_bucket); + struct closure_with_waitlist uuid_write; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - this is a single element mempool for btree_read_work() + */ + struct mutex fill_lock; + struct btree_iter *fill_iter; + + /* + * btree_sort() is a merge sort and requires temporary space - single + * element mempool + */ + struct mutex sort_lock; + struct bset *sort; + + /* List of buckets we're currently writing data to */ + struct list_head data_buckets; + spinlock_t data_bucket_lock; + + struct journal journal; + +#define CONGESTED_MAX 1024 + unsigned congested_last_us; + atomic_t congested; + + /* The rest of this all shows up in sysfs */ + unsigned congested_read_threshold_us; + unsigned congested_write_threshold_us; + + spinlock_t sort_time_lock; + struct time_stats sort_time; + struct time_stats btree_gc_time; + struct time_stats btree_split_time; + spinlock_t btree_read_time_lock; + struct time_stats btree_read_time; + struct time_stats try_harder_time; + + atomic_long_t cache_read_races; + atomic_long_t writeback_keys_done; + atomic_long_t writeback_keys_failed; + unsigned error_limit; + unsigned error_decay; + unsigned short journal_delay_ms; + unsigned verify:1; + unsigned key_merging_disabled:1; + unsigned gc_always_rewrite:1; + unsigned shrinker_disabled:1; + unsigned copy_gc_enabled:1; + +#define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +}; + +static inline bool key_merging_disabled(struct cache_set *c) +{ +#ifdef CONFIG_BCACHE_DEBUG + return c->key_merging_disabled; +#else + return 0; +#endif +} + +static inline bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +struct bbio { + unsigned submit_time_us; + union { + struct bkey key; + uint64_t _pad[3]; + /* + * We only need pad = 3 here because we only ever carry around a + * single pointer - i.e. the pointer we're doing io to/from. + */ + }; + struct bio bio; +}; + +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + +#define MAX_BSETS 4U + +#define BTREE_PRIO USHRT_MAX +#define INITIAL_PRIO 32768 + +#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) +#define btree_blocks(b) \ + ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) + +#define btree_default_blocks(c) \ + ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) + +#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) +#define bucket_bytes(c) ((c)->sb.bucket_size << 9) +#define block_bytes(c) ((c)->sb.block_size << 9) + +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) +#define set_blocks(i, c) __set_blocks(i, (i)->keys, c) + +#define node(i, j) ((struct bkey *) ((i)->d + (j))) +#define end(i) node(i, (i)->keys) + +#define index(i, b) \ + ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ + block_bytes(b->c))) + +#define btree_data_space(b) (PAGE_SIZE << (b)->page_order) + +#define prios_per_bucket(c) \ + ((bucket_bytes(c) - sizeof(struct prio_set)) / \ + sizeof(struct bucket_disk)) +#define prio_buckets(c) \ + DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) +#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) +#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) + +/* Bkey fields: all units are in sectors */ + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ + static inline uint64_t name(const struct bkey *k, unsigned i) \ + { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ + \ + static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ + { \ + k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ + k->ptr[i] |= v << offset; \ + } + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, 16) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline uint64_t KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) +{ + k->low = v; +} + +PTR_FIELD(PTR_DEV, 51, 12) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << 12) - 1) + +#define PTR(gen, offset, dev) \ + ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) + +static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) +{ + return s >> c->bucket_bits; +} + +static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) +{ + return ((sector_t) b) << c->bucket_bits; +} + +static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) +{ + return s & (c->sb.bucket_size - 1); +} + +static inline struct cache *PTR_CACHE(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return c->cache[PTR_DEV(k, ptr)]; +} + +static inline size_t PTR_BUCKET_NR(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return sector_to_bucket(c, PTR_OFFSET(k, ptr)); +} + +static inline struct bucket *PTR_BUCKET(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); +} + +/* Btree key macros */ + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. + */ +#define KEY(dev, sector, len) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ + .low = (sector) \ +}) + +static inline void bkey_init(struct bkey *k) +{ + *k = KEY(0, 0, 0); +} + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) +#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) +#define ZERO_KEY KEY(0, 0, 0) + +/* + * This is used for various on disk data structures - cache_sb, prio_set, bset, + * jset: The checksum is _always_ the first 8 bytes of these structs + */ +#define csum_set(i) \ + bch_crc64(((void *) (i)) + sizeof(uint64_t), \ + ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) + +/* Error handling macros */ + +#define btree_bug(b, ...) \ +do { \ + if (bch_cache_set_error((b)->c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define cache_bug(c, ...) \ +do { \ + if (bch_cache_set_error(c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define btree_bug_on(cond, b, ...) \ +do { \ + if (cond) \ + btree_bug(b, __VA_ARGS__); \ +} while (0) + +#define cache_bug_on(cond, c, ...) \ +do { \ + if (cond) \ + cache_bug(c, __VA_ARGS__); \ +} while (0) + +#define cache_set_err_on(cond, c, ...) \ +do { \ + if (cond) \ + bch_cache_set_error(c, __VA_ARGS__); \ +} while (0) + +/* Looping macros */ + +#define for_each_cache(ca, cs, iter) \ + for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) + +#define for_each_bucket(b, ca) \ + for (b = (ca)->buckets + (ca)->sb.first_bucket; \ + b < (ca)->buckets + (ca)->sb.nbuckets; b++) + +static inline void __bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + +/* Blktrace macros */ + +#define blktrace_msg(c, fmt, ...) \ +do { \ + struct request_queue *q = bdev_get_queue(c->bdev); \ + if (q) \ + blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \ +} while (0) + +#define blktrace_msg_all(s, fmt, ...) \ +do { \ + struct cache *_c; \ + unsigned i; \ + for_each_cache(_c, (s), i) \ + blktrace_msg(_c, fmt, ##__VA_ARGS__); \ +} while (0) + +static inline void cached_dev_put(struct cached_dev *dc) +{ + if (atomic_dec_and_test(&dc->count)) + schedule_work(&dc->detach); +} + +static inline bool cached_dev_get(struct cached_dev *dc) +{ + if (!atomic_inc_not_zero(&dc->count)) + return false; + + /* Paired with the mb in cached_dev_attach */ + smp_mb__after_atomic_inc(); + return true; +} + +/* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree (last_gc). + * + * bucket_disk_gen() returns the difference between the current gen and the gen + * on disk; they're both used to make sure gens don't wrap around. + */ + +static inline uint8_t bucket_gc_gen(struct bucket *b) +{ + return b->gen - b->last_gc; +} + +static inline uint8_t bucket_disk_gen(struct bucket *b) +{ + return b->gen - b->disk_gen; +} + +#define BUCKET_GC_GEN_MAX 96U +#define BUCKET_DISK_GEN_MAX 64U + +#define kobj_attribute_write(n, fn) \ + static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) + +#define kobj_attribute_rw(n, show, store) \ + static struct kobj_attribute ksysfs_##n = \ + __ATTR(n, S_IWUSR|S_IRUSR, show, store) + +/* Forward declarations */ + +void bch_writeback_queue(struct cached_dev *); +void bch_writeback_add(struct cached_dev *, unsigned); + +void bch_count_io_errors(struct cache *, int, const char *); +void bch_bbio_count_io_errors(struct cache_set *, struct bio *, + int, const char *); +void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); +void bch_bbio_free(struct bio *, struct cache_set *); +struct bio *bch_bbio_alloc(struct cache_set *); + +struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); +void bch_generic_make_request(struct bio *, struct bio_split_pool *); +void __bch_submit_bbio(struct bio *, struct cache_set *); +void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); + +uint8_t bch_inc_gen(struct cache *, struct bucket *); +void bch_rescale_priorities(struct cache_set *, int); +bool bch_bucket_add_unused(struct cache *, struct bucket *); +void bch_allocator_thread(struct closure *); + +long bch_bucket_alloc(struct cache *, unsigned, struct closure *); +void bch_bucket_free(struct cache_set *, struct bkey *); + +int __bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, struct closure *); +int bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, struct closure *); + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *, const char *, ...); + +void bch_prio_write(struct cache *); +void bch_write_bdev_super(struct cached_dev *, struct closure *); + +extern struct workqueue_struct *bcache_wq, *bch_gc_wq; +extern const char * const bch_cache_modes[]; +extern struct mutex bch_register_lock; +extern struct list_head bch_cache_sets; + +extern struct kobj_type bch_cached_dev_ktype; +extern struct kobj_type bch_flash_dev_ktype; +extern struct kobj_type bch_cache_set_ktype; +extern struct kobj_type bch_cache_set_internal_ktype; +extern struct kobj_type bch_cache_ktype; + +void bch_cached_dev_release(struct kobject *); +void bch_flash_dev_release(struct kobject *); +void bch_cache_set_release(struct kobject *); +void bch_cache_release(struct kobject *); + +int bch_uuid_write(struct cache_set *); +void bcache_write_super(struct cache_set *); + +int bch_flash_dev_create(struct cache_set *c, uint64_t size); + +int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); +void bch_cached_dev_detach(struct cached_dev *); +void bch_cached_dev_run(struct cached_dev *); +void bcache_device_stop(struct bcache_device *); + +void bch_cache_set_unregister(struct cache_set *); +void bch_cache_set_stop(struct cache_set *); + +struct cache_set *bch_cache_set_alloc(struct cache_sb *); +void bch_btree_cache_free(struct cache_set *); +int bch_btree_cache_alloc(struct cache_set *); +void bch_writeback_init_cached_dev(struct cached_dev *); +void bch_moving_init_cache_set(struct cache_set *); + +void bch_cache_allocator_exit(struct cache *ca); +int bch_cache_allocator_init(struct cache *ca); + +void bch_debug_exit(void); +int bch_debug_init(struct kobject *); +void bch_writeback_exit(void); +int bch_writeback_init(void); +void bch_request_exit(void); +int bch_request_init(void); +void bch_btree_exit(void); +int bch_btree_init(void); + +#endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c new file mode 100644 index 00000000000..cb4578a327b --- /dev/null +++ b/drivers/md/bcache/bset.c @@ -0,0 +1,1192 @@ +/* + * Code for working with individual keys, and sorted sets of keys with in a + * btree node + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" + +#include <linux/random.h> +#include <linux/prefetch.h> + +/* Keylists */ + +void bch_keylist_copy(struct keylist *dest, struct keylist *src) +{ + *dest = *src; + + if (src->list == src->d) { + size_t n = (uint64_t *) src->top - src->d; + dest->top = (struct bkey *) &dest->d[n]; + dest->list = dest->d; + } +} + +int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) +{ + unsigned oldsize = (uint64_t *) l->top - l->list; + unsigned newsize = oldsize + 2 + nptrs; + uint64_t *new; + + /* The journalling code doesn't handle the case where the keys to insert + * is bigger than an empty write: If we just return -ENOMEM here, + * bio_insert() and bio_invalidate() will insert the keys created so far + * and finish the rest when the keylist is empty. + */ + if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + return -ENOMEM; + + newsize = roundup_pow_of_two(newsize); + + if (newsize <= KEYLIST_INLINE || + roundup_pow_of_two(oldsize) == newsize) + return 0; + + new = krealloc(l->list == l->d ? NULL : l->list, + sizeof(uint64_t) * newsize, GFP_NOIO); + + if (!new) + return -ENOMEM; + + if (l->list == l->d) + memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); + + l->list = new; + l->top = (struct bkey *) (&l->list[oldsize]); + + return 0; +} + +struct bkey *bch_keylist_pop(struct keylist *l) +{ + struct bkey *k = l->bottom; + + if (k == l->top) + return NULL; + + while (bkey_next(k) != l->top) + k = bkey_next(k); + + return l->top = k; +} + +/* Pointer validation */ + +bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) +{ + unsigned i; + + if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) + goto bad; + + if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (!KEY_SIZE(k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size || + bucket < ca->sb.first_bucket || + bucket >= ca->sb.nbuckets) + goto bad; + } + + return false; +bad: + cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); + return true; +} + +bool bch_ptr_bad(struct btree *b, const struct bkey *k) +{ + struct bucket *g; + unsigned i, stale; + + if (!bkey_cmp(k, &ZERO_KEY) || + !KEY_PTRS(k) || + bch_ptr_invalid(b, k)) + return true; + + if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); + + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); + + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); + + if (stale) + return true; + +#ifdef CONFIG_BCACHE_EDEBUG + if (!mutex_trylock(&b->c->bucket_lock)) + continue; + + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto bug; + + } else { + if (g->prio == BTREE_PRIO) + goto bug; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto bug; + } + mutex_unlock(&b->c->bucket_lock); +#endif + } + + return false; +#ifdef CONFIG_BCACHE_EDEBUG +bug: + mutex_unlock(&b->c->bucket_lock); + btree_bug(b, +"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +#endif +} + +/* Key/pointer manipulation */ + +void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, + unsigned i) +{ + BUG_ON(i > KEY_PTRS(src)); + + /* Only copy the header, key, and one pointer. */ + memcpy(dest, src, 2 * sizeof(uint64_t)); + dest->ptr[0] = src->ptr[i]; + SET_KEY_PTRS(dest, 1); + /* We didn't copy the checksum so clear that bit. */ + SET_KEY_CSUM(dest, 0); +} + +bool __bch_cut_front(const struct bkey *where, struct bkey *k) +{ + unsigned i, len = 0; + + if (bkey_cmp(where, &START_KEY(k)) <= 0) + return false; + + if (bkey_cmp(where, k) < 0) + len = KEY_OFFSET(k) - KEY_OFFSET(where); + else + bkey_copy_key(k, where); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +bool __bch_cut_back(const struct bkey *where, struct bkey *k) +{ + unsigned len = 0; + + if (bkey_cmp(where, k) >= 0) + return false; + + BUG_ON(KEY_INODE(where) != KEY_INODE(k)); + + if (bkey_cmp(where, &START_KEY(k)) > 0) + len = KEY_OFFSET(where) - KEY_START(k); + + bkey_copy_key(k, where); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +/* Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. + */ +bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) +{ + unsigned i; + + if (key_merging_disabled(b->c)) + return false; + + if (KEY_PTRS(l) != KEY_PTRS(r) || + KEY_DIRTY(l) != KEY_DIRTY(r) || + bkey_cmp(l, &START_KEY(r))) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +/* Binary tree stuff for auxiliary search trees */ + +static unsigned inorder_next(unsigned j, unsigned size) +{ + if (j * 2 + 1 < size) { + j = j * 2 + 1; + + while (j * 2 < size) + j *= 2; + } else + j >>= ffz(j) + 1; + + return j; +} + +static unsigned inorder_prev(unsigned j, unsigned size) +{ + if (j * 2 < size) { + j = j * 2; + + while (j * 2 + 1 < size) + j = j * 2 + 1; + } else + j >>= ffs(j); + + return j; +} + +/* I have no idea why this code works... and I'm the one who wrote it + * + * However, I do know what it does: + * Given a binary tree constructed in an array (i.e. how you normally implement + * a heap), it converts a node in the tree - referenced by array index - to the + * index it would have if you did an inorder traversal. + * + * Also tested for every j, size up to size somewhere around 6 million. + * + * The binary tree starts at array index 1, not 0 + * extra is a function of size: + * extra = (size - rounddown_pow_of_two(size - 1)) << 1; + */ +static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) +{ + unsigned b = fls(j); + unsigned shift = fls(size - 1) - b; + + j ^= 1U << (b - 1); + j <<= 1; + j |= 1; + j <<= shift; + + if (j > extra) + j -= (j - extra) >> 1; + + return j; +} + +static unsigned to_inorder(unsigned j, struct bset_tree *t) +{ + return __to_inorder(j, t->size, t->extra); +} + +static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) +{ + unsigned shift; + + if (j > extra) + j += j - extra; + + shift = ffs(j); + + j >>= shift; + j |= roundup_pow_of_two(size) >> shift; + + return j; +} + +static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) +{ + return __inorder_to_tree(j, t->size, t->extra); +} + +#if 0 +void inorder_test(void) +{ + unsigned long done = 0; + ktime_t start = ktime_get(); + + for (unsigned size = 2; + size < 65536000; + size++) { + unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1; + unsigned i = 1, j = rounddown_pow_of_two(size - 1); + + if (!(size % 4096)) + printk(KERN_NOTICE "loop %u, %llu per us\n", size, + done / ktime_us_delta(ktime_get(), start)); + + while (1) { + if (__inorder_to_tree(i, size, extra) != j) + panic("size %10u j %10u i %10u", size, j, i); + + if (__to_inorder(j, size, extra) != i) + panic("size %10u j %10u i %10u", size, j, i); + + if (j == rounddown_pow_of_two(size) - 1) + break; + + BUG_ON(inorder_prev(inorder_next(j, size), size) != j); + + j = inorder_next(j, size); + i++; + } + + done += size - 1; + } +} +#endif + +/* + * Cacheline/offset <-> bkey pointer arithmatic: + * + * t->tree is a binary search tree in an array; each node corresponds to a key + * in one cacheline in t->set (BSET_CACHELINE bytes). + * + * This means we don't have to store the full index of the key that a node in + * the binary tree points to; to_inorder() gives us the cacheline, and then + * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes. + * + * cacheline_to_bkey() and friends abstract out all the pointer arithmatic to + * make this work. + * + * To construct the bfloat for an arbitrary key we need to know what the key + * immediately preceding it is: we have to check if the two keys differ in the + * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size + * of the previous key so we can walk backwards to it from t->tree[j]'s key. + */ + +static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline, + unsigned offset) +{ + return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8; +} + +static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) +{ + return ((void *) k - (void *) t->data) / BSET_CACHELINE; +} + +static unsigned bkey_to_cacheline_offset(struct bkey *k) +{ + return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); +} + +static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) +{ + return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m); +} + +static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) +{ + return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]); +} + +/* + * For the write set - the one we're currently inserting keys into - we don't + * maintain a full search tree, we just keep a simple lookup table in t->prev. + */ +static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) +{ + return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); +} + +static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) +{ +#ifdef CONFIG_X86_64 + asm("shrd %[shift],%[high],%[low]" + : [low] "+Rm" (low) + : [high] "R" (high), + [shift] "ci" (shift) + : "cc"); +#else + low >>= shift; + low |= (high << 1) << (63U - shift); +#endif + return low; +} + +static inline unsigned bfloat_mantissa(const struct bkey *k, + struct bkey_float *f) +{ + const uint64_t *p = &k->low - (f->exponent >> 6); + return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK; +} + +static void make_bfloat(struct bset_tree *t, unsigned j) +{ + struct bkey_float *f = &t->tree[j]; + struct bkey *m = tree_to_bkey(t, j); + struct bkey *p = tree_to_prev_bkey(t, j); + + struct bkey *l = is_power_of_2(j) + ? t->data->start + : tree_to_prev_bkey(t, j >> ffs(j)); + + struct bkey *r = is_power_of_2(j + 1) + ? node(t->data, t->data->keys - bkey_u64s(&t->end)) + : tree_to_bkey(t, j >> (ffz(j) + 1)); + + BUG_ON(m < l || m > r); + BUG_ON(bkey_next(p) != m); + + if (KEY_INODE(l) != KEY_INODE(r)) + f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; + else + f->exponent = fls64(r->low ^ l->low); + + f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0); + + /* + * Setting f->exponent = 127 flags this node as failed, and causes the + * lookup code to fall back to comparing against the original key. + */ + + if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f)) + f->mantissa = bfloat_mantissa(m, f) - 1; + else + f->exponent = 127; +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +{ + if (t != b->sets) { + unsigned j = roundup(t[-1].size, + 64 / sizeof(struct bkey_float)); + + t->tree = t[-1].tree + j; + t->prev = t[-1].prev + j; + } + + while (t < b->sets + MAX_BSETS) + t++->size = 0; +} + +static void bset_build_unwritten_tree(struct btree *b) +{ + struct bset_tree *t = b->sets + b->nsets; + + bset_alloc_tree(b, t); + + if (t->tree != b->sets->tree + bset_tree_space(b)) { + t->prev[0] = bkey_to_cacheline_offset(t->data->start); + t->size = 1; + } +} + +static void bset_build_written_tree(struct btree *b) +{ + struct bset_tree *t = b->sets + b->nsets; + struct bkey *k = t->data->start; + unsigned j, cacheline = 1; + + bset_alloc_tree(b, t); + + t->size = min_t(unsigned, + bkey_to_cacheline(t, end(t->data)), + b->sets->tree + bset_tree_space(b) - t->tree); + + if (t->size < 2) { + t->size = 0; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) { + while (bkey_to_cacheline(t, k) != cacheline) + k = bkey_next(k); + + t->prev[j] = bkey_u64s(k); + k = bkey_next(k); + cacheline++; + t->tree[j].m = bkey_to_cacheline_offset(k); + } + + while (bkey_next(k) != end(t->data)) + k = bkey_next(k); + + t->end = *k; + + /* Then we build the tree */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) + make_bfloat(t, j); +} + +void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) +{ + struct bset_tree *t; + unsigned inorder, j = 1; + + for (t = b->sets; t <= &b->sets[b->nsets]; t++) + if (k < end(t->data)) + goto found_set; + + BUG(); +found_set: + if (!t->size || !bset_written(b, t)) + return; + + inorder = bkey_to_cacheline(t, k); + + if (k == t->data->start) + goto fix_left; + + if (bkey_next(k) == end(t->data)) { + t->end = *k; + goto fix_right; + } + + j = inorder_to_tree(inorder, t); + + if (j && + j < t->size && + k == tree_to_bkey(t, j)) +fix_left: do { + make_bfloat(t, j); + j = j * 2; + } while (j < t->size); + + j = inorder_to_tree(inorder + 1, t); + + if (j && + j < t->size && + k == tree_to_prev_bkey(t, j)) +fix_right: do { + make_bfloat(t, j); + j = j * 2 + 1; + } while (j < t->size); +} + +void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) +{ + struct bset_tree *t = &b->sets[b->nsets]; + unsigned shift = bkey_u64s(k); + unsigned j = bkey_to_cacheline(t, k); + + /* We're getting called from btree_split() or btree_gc, just bail out */ + if (!t->size) + return; + + /* k is the key we just inserted; we need to find the entry in the + * lookup table for the first key that is strictly greater than k: + * it's either k's cacheline or the next one + */ + if (j < t->size && + table_to_bkey(t, j) <= k) + j++; + + /* Adjust all the lookup table entries, and find a new key for any that + * have gotten too big + */ + for (; j < t->size; j++) { + t->prev[j] += shift; + + if (t->prev[j] > 7) { + k = table_to_bkey(t, j - 1); + + while (k < cacheline_to_bkey(t, j, 0)) + k = bkey_next(k); + + t->prev[j] = bkey_to_cacheline_offset(k); + } + } + + if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) + return; + + /* Possibly add a new entry to the end of the lookup table */ + + for (k = table_to_bkey(t, t->size - 1); + k != end(t->data); + k = bkey_next(k)) + if (t->size == bkey_to_cacheline(t, k)) { + t->prev[t->size] = bkey_to_cacheline_offset(k); + t->size++; + } +} + +void bch_bset_init_next(struct btree *b) +{ + struct bset *i = write_block(b); + + if (i != b->sets[0].data) { + b->sets[++b->nsets].data = i; + i->seq = b->sets[0].data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = bset_magic(b->c); + i->version = 0; + i->keys = 0; + + bset_build_unwritten_tree(b); +} + +struct bset_search_iter { + struct bkey *l, *r; +}; + +static struct bset_search_iter bset_search_write_set(struct btree *b, + struct bset_tree *t, + const struct bkey *search) +{ + unsigned li = 0, ri = t->size; + + BUG_ON(!b->nsets && + t->size < bkey_to_cacheline(t, end(t->data))); + + while (li + 1 != ri) { + unsigned m = (li + ri) >> 1; + + if (bkey_cmp(table_to_bkey(t, m), search) > 0) + ri = m; + else + li = m; + } + + return (struct bset_search_iter) { + table_to_bkey(t, li), + ri < t->size ? table_to_bkey(t, ri) : end(t->data) + }; +} + +static struct bset_search_iter bset_search_tree(struct btree *b, + struct bset_tree *t, + const struct bkey *search) +{ + struct bkey *l, *r; + struct bkey_float *f; + unsigned inorder, j, n = 1; + + do { + unsigned p = n << 4; + p &= ((int) (p - t->size)) >> 31; + + prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; + + /* + * n = (f->mantissa > bfloat_mantissa()) + * ? j * 2 + * : j * 2 + 1; + * + * We need to subtract 1 from f->mantissa for the sign bit trick + * to work - that's done in make_bfloat() + */ + if (likely(f->exponent != 127)) + n = j * 2 + (((unsigned) + (f->mantissa - + bfloat_mantissa(search, f))) >> 31); + else + n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) + ? j * 2 + : j * 2 + 1; + } while (n < t->size); + + inorder = to_inorder(j, t); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. + */ + if (n & 1) { + l = cacheline_to_bkey(t, inorder, f->m); + + if (++inorder != t->size) { + f = &t->tree[inorder_next(j, t->size)]; + r = cacheline_to_bkey(t, inorder, f->m); + } else + r = end(t->data); + } else { + r = cacheline_to_bkey(t, inorder, f->m); + + if (--inorder) { + f = &t->tree[inorder_prev(j, t->size)]; + l = cacheline_to_bkey(t, inorder, f->m); + } else + l = t->data->start; + } + + return (struct bset_search_iter) {l, r}; +} + +struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, + const struct bkey *search) +{ + struct bset_search_iter i; + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). + * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + if (unlikely(!t->size)) { + i.l = t->data->start; + i.r = end(t->data); + } else if (bset_written(b, t)) { + /* + * Each node in the auxiliary search tree covers a certain range + * of bits, and keys above and below the set it covers might + * differ outside those bits - so we have to special case the + * start and end - handle that here: + */ + + if (unlikely(bkey_cmp(search, &t->end) >= 0)) + return end(t->data); + + if (unlikely(bkey_cmp(search, t->data->start) < 0)) + return t->data->start; + + i = bset_search_tree(b, t, search); + } else + i = bset_search_write_set(b, t, search); + +#ifdef CONFIG_BCACHE_EDEBUG + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); + + BUG_ON(i.r != end(t->data) && + bkey_cmp(i.r, search) <= 0); +#endif + + while (likely(i.l != i.r) && + bkey_cmp(i.l, search) <= 0) + i.l = bkey_next(i.l); + + return i.l; +} + +/* Btree iterator */ + +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static inline bool btree_iter_end(struct btree_iter *iter) +{ + return !iter->used; +} + +void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, + struct bkey *end) +{ + if (k != end) + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); +} + +struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, + struct bkey *search, struct bset_tree *start) +{ + struct bkey *ret = NULL; + iter->size = ARRAY_SIZE(iter->data); + iter->used = 0; + + for (; start <= &b->sets[b->nsets]; start++) { + ret = bch_bset_search(b, start, search); + bch_btree_iter_push(iter, ret, end(start->data)); + } + + return ret; +} + +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + struct btree_iter_set unused; + struct bkey *ret = NULL; + + if (!btree_iter_end(iter)) { + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); + + if (iter->data->k > iter->data->end) { + WARN_ONCE(1, "bset was corrupt!\n"); + iter->data->k = iter->data->end; + } + + if (iter->data->k == iter->data->end) + heap_pop(iter, unused, btree_iter_cmp); + else + heap_sift(iter, 0, btree_iter_cmp); + } + + return ret; +} + +struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, + struct btree *b, ptr_filter_fn fn) +{ + struct bkey *ret; + + do { + ret = bch_btree_iter_next(iter); + } while (ret && fn(b, ret)); + + return ret; +} + +struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) +{ + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, search); + return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); +} + +/* Mergesort */ + +static void btree_sort_fixup(struct btree_iter *iter) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + struct bkey *k; + + if (iter->used > 2 && + btree_iter_cmp(i[0], i[1])) + i++; + + for (k = i->k; + k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0; + k = bkey_next(k)) + if (top->k > i->k) + __bch_cut_front(top->k, k); + else if (KEY_SIZE(k)) + bch_cut_back(&START_KEY(k), top->k); + + if (top->k < i->k || k == i->k) + break; + + heap_sift(iter, i - top, btree_iter_cmp); + } +} + +static void btree_mergesort(struct btree *b, struct bset *out, + struct btree_iter *iter, + bool fixup, bool remove_stale) +{ + struct bkey *k, *last = NULL; + bool (*bad)(struct btree *, const struct bkey *) = remove_stale + ? bch_ptr_bad + : bch_ptr_invalid; + + while (!btree_iter_end(iter)) { + if (fixup && !b->level) + btree_sort_fixup(iter); + + k = bch_btree_iter_next(iter); + if (bad(b, k)) + continue; + + if (!last) { + last = out->start; + bkey_copy(last, k); + } else if (b->level || + !bch_bkey_try_merge(b, last, k)) { + last = bkey_next(last); + bkey_copy(last, k); + } + } + + out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; + + pr_debug("sorted %i keys", out->keys); + bch_check_key_order(b, out); +} + +static void __btree_sort(struct btree *b, struct btree_iter *iter, + unsigned start, unsigned order, bool fixup) +{ + uint64_t start_time; + bool remove_stale = !b->written; + struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, + order); + if (!out) { + mutex_lock(&b->c->sort_lock); + out = b->c->sort; + order = ilog2(bucket_pages(b->c)); + } + + start_time = local_clock(); + + btree_mergesort(b, out, iter, fixup, remove_stale); + b->nsets = start; + + if (!fixup && !start && b->written) + bch_btree_verify(b, out); + + if (!start && order == b->page_order) { + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + */ + + out->magic = bset_magic(b->c); + out->seq = b->sets[0].data->seq; + out->version = b->sets[0].data->version; + swap(out, b->sets[0].data); + + if (b->c->sort == b->sets[0].data) + b->c->sort = out; + } else { + b->sets[start].data->keys = out->keys; + memcpy(b->sets[start].data->start, out->start, + (void *) end(out) - (void *) out->start); + } + + if (out == b->c->sort) + mutex_unlock(&b->c->sort_lock); + else + free_pages((unsigned long) out, order); + + if (b->written) + bset_build_written_tree(b); + + if (!start) { + spin_lock(&b->c->sort_time_lock); + bch_time_stats_update(&b->c->sort_time, start_time); + spin_unlock(&b->c->sort_time_lock); + } +} + +void bch_btree_sort_partial(struct btree *b, unsigned start) +{ + size_t oldsize = 0, order = b->page_order, keys = 0; + struct btree_iter iter; + __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); + + BUG_ON(b->sets[b->nsets].data == write_block(b) && + (b->sets[b->nsets].size || b->nsets)); + + if (b->written) + oldsize = bch_count_data(b); + + if (start) { + unsigned i; + + for (i = start; i <= b->nsets; i++) + keys += b->sets[i].data->keys; + + order = roundup_pow_of_two(__set_bytes(b->sets->data, + keys)) / PAGE_SIZE; + if (order) + order = ilog2(order); + } + + __btree_sort(b, &iter, start, order, false); + + EBUG_ON(b->written && bch_count_data(b) != oldsize); +} + +void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) +{ + BUG_ON(!b->written); + __btree_sort(b, iter, 0, b->page_order, true); +} + +void bch_btree_sort_into(struct btree *b, struct btree *new) +{ + uint64_t start_time = local_clock(); + + struct btree_iter iter; + bch_btree_iter_init(b, &iter, NULL); + + btree_mergesort(b, new->sets->data, &iter, false, true); + + spin_lock(&b->c->sort_time_lock); + bch_time_stats_update(&b->c->sort_time, start_time); + spin_unlock(&b->c->sort_time_lock); + + bkey_copy_key(&new->key, &b->key); + new->sets->size = 0; +} + +void bch_btree_sort_lazy(struct btree *b) +{ + if (b->nsets) { + unsigned i, j, keys = 0, total; + + for (i = 0; i <= b->nsets; i++) + keys += b->sets[i].data->keys; + + total = keys; + + for (j = 0; j < b->nsets; j++) { + if (keys * 2 < total || + keys < 1000) { + bch_btree_sort_partial(b, j); + return; + } + + keys -= b->sets[j].data->keys; + } + + /* Must sort if b->nsets == 3 or we'll overflow */ + if (b->nsets >= (MAX_BSETS - 1) - b->level) { + bch_btree_sort(b); + return; + } + } + + bset_build_written_tree(b); +} + +/* Sysfs stuff */ + +struct bset_stats { + size_t nodes; + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, + struct bset_stats *stats) +{ + struct bkey *k; + unsigned i; + + stats->nodes++; + + for (i = 0; i <= b->nsets; i++) { + struct bset_tree *t = &b->sets[i]; + size_t bytes = t->data->keys * sizeof(uint64_t); + size_t j; + + if (bset_written(b, t)) { + stats->sets_written++; + stats->bytes_written += bytes; + + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) + if (t->tree[j].exponent == 127) + stats->failed++; + } else { + stats->sets_unwritten++; + stats->bytes_unwritten += bytes; + } + } + + if (b->level) { + struct btree_iter iter; + + for_each_key_filter(b, k, &iter, bch_ptr_bad) { + int ret = btree(bset_stats, k, b, op, stats); + if (ret) + return ret; + } + } + + return 0; +} + +int bch_bset_print_stats(struct cache_set *c, char *buf) +{ + struct btree_op op; + struct bset_stats t; + int ret; + + bch_btree_op_init_stack(&op); + memset(&t, 0, sizeof(struct bset_stats)); + + ret = btree_root(bset_stats, c, &op, &t); + if (ret) + return ret; + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "unwritten sets: %zu\n" + "written key bytes: %zu\n" + "unwritten key bytes: %zu\n" + "floats: %zu\n" + "failed: %zu\n", + t.nodes, + t.sets_written, t.sets_unwritten, + t.bytes_written, t.bytes_unwritten, + t.floats, t.failed); +} diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h new file mode 100644 index 00000000000..57a9cff4154 --- /dev/null +++ b/drivers/md/bcache/bset.h @@ -0,0 +1,379 @@ +#ifndef _BCACHE_BSET_H +#define _BCACHE_BSET_H + +/* + * BKEYS: + * + * A bkey contains a key, a size field, a variable number of pointers, and some + * ancillary flag bits. + * + * We use two different functions for validating bkeys, bch_ptr_invalid and + * bch_ptr_bad(). + * + * bch_ptr_invalid() primarily filters out keys and pointers that would be + * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and + * pointer that occur in normal practice but don't point to real data. + * + * The one exception to the rule that ptr_invalid() filters out invalid keys is + * that it also filters out keys of size 0 - these are keys that have been + * completely overwritten. It'd be safe to delete these in memory while leaving + * them on disk, just unnecessary work - so we filter them out when resorting + * instead. + * + * We can't filter out stale keys when we're resorting, because garbage + * collection needs to find them to ensure bucket gens don't wrap around - + * unless we're rewriting the btree node those stale keys still exist on disk. + * + * We also implement functions here for removing some number of sectors from the + * front or the back of a bkey - this is mainly used for fixing overlapping + * extents, by removing the overlapping sectors from the older key. + * + * BSETS: + * + * A bset is an array of bkeys laid out contiguously in memory in sorted order, + * along with a header. A btree node is made up of a number of these, written at + * different times. + * + * There could be many of them on disk, but we never allow there to be more than + * 4 in memory - we lazily resort as needed. + * + * We implement code here for creating and maintaining auxiliary search trees + * (described below) for searching an individial bset, and on top of that we + * implement a btree iterator. + * + * BTREE ITERATOR: + * + * Most of the code in bcache doesn't care about an individual bset - it needs + * to search entire btree nodes and iterate over them in sorted order. + * + * The btree iterator code serves both functions; it iterates through the keys + * in a btree node in sorted order, starting from either keys after a specific + * point (if you pass it a search key) or the start of the btree node. + * + * AUXILIARY SEARCH TREES: + * + * Since keys are variable length, we can't use a binary search on a bset - we + * wouldn't be able to find the start of the next key. But binary searches are + * slow anyways, due to terrible cache behaviour; bcache originally used binary + * searches and that code topped out at under 50k lookups/second. + * + * So we need to construct some sort of lookup table. Since we only insert keys + * into the last (unwritten) set, most of the keys within a given btree node are + * usually in sets that are mostly constant. We use two different types of + * lookup tables to take advantage of this. + * + * Both lookup tables share in common that they don't index every key in the + * set; they index one key every BSET_CACHELINE bytes, and then a linear search + * is used for the rest. + * + * For sets that have been written to disk and are no longer being inserted + * into, we construct a binary search tree in an array - traversing a binary + * search tree in an array gives excellent locality of reference and is very + * fast, since both children of any node are adjacent to each other in memory + * (and their grandchildren, and great grandchildren...) - this means + * prefetching can be used to great effect. + * + * It's quite useful performance wise to keep these nodes small - not just + * because they're more likely to be in L2, but also because we can prefetch + * more nodes on a single cacheline and thus prefetch more iterations in advance + * when traversing this tree. + * + * Nodes in the auxiliary search tree must contain both a key to compare against + * (we don't want to fetch the key from the set, that would defeat the purpose), + * and a pointer to the key. We use a few tricks to compress both of these. + * + * To compress the pointer, we take advantage of the fact that one node in the + * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have + * a function (to_inorder()) that takes the index of a node in a binary tree and + * returns what its index would be in an inorder traversal, so we only have to + * store the low bits of the offset. + * + * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To + * compress that, we take advantage of the fact that when we're traversing the + * search tree at every iteration we know that both our search key and the key + * we're looking for lie within some range - bounded by our previous + * comparisons. (We special case the start of a search so that this is true even + * at the root of the tree). + * + * So we know the key we're looking for is between a and b, and a and b don't + * differ higher than bit 50, we don't need to check anything higher than bit + * 50. + * + * We don't usually need the rest of the bits, either; we only need enough bits + * to partition the key range we're currently checking. Consider key n - the + * key our auxiliary search tree node corresponds to, and key p, the key + * immediately preceding n. The lowest bit we need to store in the auxiliary + * search tree is the highest bit that differs between n and p. + * + * Note that this could be bit 0 - we might sometimes need all 80 bits to do the + * comparison. But we'd really like our nodes in the auxiliary search tree to be + * of fixed size. + * + * The solution is to make them fixed size, and when we're constructing a node + * check if p and n differed in the bits we needed them to. If they don't we + * flag that node, and when doing lookups we fallback to comparing against the + * real key. As long as this doesn't happen to often (and it seems to reliably + * happen a bit less than 1% of the time), we win - even on failures, that key + * is then more likely to be in cache than if we were doing binary searches all + * the way, since we're touching so much less memory. + * + * The keys in the auxiliary search tree are stored in (software) floating + * point, with an exponent and a mantissa. The exponent needs to be big enough + * to address all the bits in the original key, but the number of bits in the + * mantissa is somewhat arbitrary; more bits just gets us fewer failures. + * + * We need 7 bits for the exponent and 3 bits for the key's offset (since keys + * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. + * We need one node per 128 bytes in the btree node, which means the auxiliary + * search trees take up 3% as much memory as the btree itself. + * + * Constructing these auxiliary search trees is moderately expensive, and we + * don't want to be constantly rebuilding the search tree for the last set + * whenever we insert another key into it. For the unwritten set, we use a much + * simpler lookup table - it's just a flat array, so index i in the lookup table + * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing + * within each byte range works the same as with the auxiliary search trees. + * + * These are much easier to keep up to date when we insert a key - we do it + * somewhat lazily; when we shift a key up we usually just increment the pointer + * to it, only when it would overflow do we go to the trouble of finding the + * first key in that range of bytes again. + */ + +/* Btree key comparison/iteration */ + +struct btree_iter { + size_t size, used; + struct btree_iter_set { + struct bkey *k, *end; + } data[MAX_BSETS]; +}; + +struct bset_tree { + /* + * We construct a binary tree in an array as if the array + * started at 1, so that things line up on the same cachelines + * better: see comments in bset.c at cacheline_to_bkey() for + * details + */ + + /* size of the binary tree and prev array */ + unsigned size; + + /* function of size - precalculated for to_inorder() */ + unsigned extra; + + /* copy of the last key in the set */ + struct bkey end; + struct bkey_float *tree; + + /* + * The nodes in the bset tree point to specific keys - this + * array holds the sizes of the previous key. + * + * Conceptually it's a member of struct bkey_float, but we want + * to keep bkey_float to 4 bytes and prev isn't used in the fast + * path. + */ + uint8_t *prev; + + /* The actual btree node, with pointers to each sorted set */ + struct bset *data; +}; + +static __always_inline int64_t bkey_cmp(const struct bkey *l, + const struct bkey *r) +{ + return unlikely(KEY_INODE(l) != KEY_INODE(r)) + ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) + : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); +} + +static inline size_t bkey_u64s(const struct bkey *k) +{ + BUG_ON(KEY_CSUM(k) > 1); + return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); +} + +static inline size_t bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(uint64_t); +} + +static inline void bkey_copy(struct bkey *dest, const struct bkey *src) +{ + memcpy(dest, src, bkey_bytes(src)); +} + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + if (!src) + src = &KEY(0, 0, 0); + + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + uint64_t *d = (void *) k; + return (struct bkey *) (d + bkey_u64s(k)); +} + +/* Keylists */ + +struct keylist { + struct bkey *top; + union { + uint64_t *list; + struct bkey *bottom; + }; + + /* Enough room for btree_split's keys without realloc */ +#define KEYLIST_INLINE 16 + uint64_t d[KEYLIST_INLINE]; +}; + +static inline void bch_keylist_init(struct keylist *l) +{ + l->top = (void *) (l->list = l->d); +} + +static inline void bch_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch_keylist_add(struct keylist *l, struct bkey *k) +{ + bkey_copy(l->top, k); + bch_keylist_push(l); +} + +static inline bool bch_keylist_empty(struct keylist *l) +{ + return l->top == (void *) l->list; +} + +static inline void bch_keylist_free(struct keylist *l) +{ + if (l->list != l->d) + kfree(l->list); +} + +void bch_keylist_copy(struct keylist *, struct keylist *); +struct bkey *bch_keylist_pop(struct keylist *); +int bch_keylist_realloc(struct keylist *, int, struct cache_set *); + +void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, + unsigned); +bool __bch_cut_front(const struct bkey *, struct bkey *); +bool __bch_cut_back(const struct bkey *, struct bkey *); + +static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, k) > 0); + return __bch_cut_front(where, k); +} + +static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); + return __bch_cut_back(where, k); +} + +const char *bch_ptr_status(struct cache_set *, const struct bkey *); +bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); +bool bch_ptr_bad(struct btree *, const struct bkey *); + +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + return r > 128U ? 0 : r; +} + +static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); +} + +static inline bool ptr_available(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); +} + + +typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); + +struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); +struct bkey *bch_btree_iter_next(struct btree_iter *); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *, + struct btree *, ptr_filter_fn); + +void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); +struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *, + struct bkey *, struct bset_tree *); + +/* 32 bits total: */ +#define BKEY_MID_BITS 3 +#define BKEY_EXPONENT_BITS 7 +#define BKEY_MANTISSA_BITS 22 +#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) + +struct bkey_float { + unsigned exponent:BKEY_EXPONENT_BITS; + unsigned m:BKEY_MID_BITS; + unsigned mantissa:BKEY_MANTISSA_BITS; +} __packed; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 128 +#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE) + +#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) +#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) + +void bch_bset_init_next(struct btree *); + +void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); +void bch_bset_fix_lookup_table(struct btree *, struct bkey *); + +struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, + const struct bkey *); + +static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, + const struct bkey *search) +{ + return search ? __bch_bset_search(b, t, search) : t->data->start; +} + +bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); +void bch_btree_sort_lazy(struct btree *); +void bch_btree_sort_into(struct btree *, struct btree *); +void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *); +void bch_btree_sort_partial(struct btree *, unsigned); + +static inline void bch_btree_sort(struct btree *b) +{ + bch_btree_sort_partial(b, 0); +} + +int bch_bset_print_stats(struct cache_set *, char *); + +#endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c new file mode 100644 index 00000000000..7a5658f04e6 --- /dev/null +++ b/drivers/md/bcache/btree.c @@ -0,0 +1,2503 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <linux/random.h> +#include <linux/rcupdate.h> +#include <trace/events/bcache.h> + +/* + * Todo: + * register_bcache: Return errors out to userspace correctly + * + * Writeback: don't undirty key until after a cache flush + * + * Create an iterator for key pointers + * + * On btree write error, mark bucket such that it won't be freed from the cache + * + * Journalling: + * Check for bad keys in replay + * Propagate barriers + * Refcount journal entries in journal_replay + * + * Garbage collection: + * Finish incremental gc + * Gc should free old UUIDs, data for invalid UUIDs + * + * Provide a way to list backing device UUIDs we have data cached for, and + * probably how long it's been since we've seen them, and a way to invalidate + * dirty data for devices that will never be attached again + * + * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so + * that based on that and how much dirty data we have we can keep writeback + * from being starved + * + * Add a tracepoint or somesuch to watch for writeback starvation + * + * When btree depth > 1 and splitting an interior node, we have to make sure + * alloc_bucket() cannot fail. This should be true but is not completely + * obvious. + * + * Make sure all allocations get charged to the root cgroup + * + * Plugging? + * + * If data write is less than hard sector size of ssd, round up offset in open + * bucket to the next whole sector + * + * Also lookup by cgroup in get_open_bucket() + * + * Superblock needs to be fleshed out for multiple cache devices + * + * Add a sysfs tunable for the number of writeback IOs in flight + * + * Add a sysfs tunable for the number of open data buckets + * + * IO tracking: Can we track when one process is doing io on behalf of another? + * IO tracking: Don't use just an average, weigh more recent stuff higher + * + * Test module load/unload + */ + +static const char * const op_types[] = { + "insert", "replace" +}; + +static const char *op_type(struct btree_op *op) +{ + return op_types[op->type]; +} + +#define MAX_NEED_GC 64 +#define MAX_SAVE_PRIO 72 + +#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) + +#define PTR_HASH(c, k) \ + (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) + +struct workqueue_struct *bch_gc_wq; +static struct workqueue_struct *btree_io_wq; + +void bch_btree_op_init_stack(struct btree_op *op) +{ + memset(op, 0, sizeof(struct btree_op)); + closure_init_stack(&op->cl); + op->lock = -1; + bch_keylist_init(&op->keys); +} + +/* Btree key manipulation */ + +static void bkey_put(struct cache_set *c, struct bkey *k, int level) +{ + if ((level && KEY_OFFSET(k)) || !level) + __bkey_put(c, k); +} + +/* Btree IO */ + +static uint64_t btree_csum_set(struct btree *b, struct bset *i) +{ + uint64_t crc = b->key.ptr[0]; + void *data = (void *) i + 8, *end = end(i); + + crc = bch_crc64_update(crc, data, end - data); + return crc ^ 0xffffffffffffffffULL; +} + +static void btree_bio_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) + ? "writing btree" : "reading btree"); + closure_put(cl); +} + +static void btree_bio_init(struct btree *b) +{ + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_bio_endio; + b->bio->bi_private = &b->io.cl; +} + +void bch_btree_read_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct bset *i = b->sets[0].data; + struct btree_iter *iter = b->c->fill_iter; + const char *err = "bad btree header"; + BUG_ON(b->nsets || b->written); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + + mutex_lock(&b->c->fill_lock); + iter->used = 0; + + if (btree_node_io_error(b) || + !i->seq) + goto err; + + for (; + b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; + i = write_block(b)) { + err = "unsupported bset version"; + if (i->version > BCACHE_BSET_VERSION) + goto err; + + err = "bad btree header"; + if (b->written + set_blocks(i, b->c) > btree_blocks(b)) + goto err; + + err = "bad magic"; + if (i->magic != bset_magic(b->c)) + goto err; + + err = "bad checksum"; + switch (i->version) { + case 0: + if (i->csum != csum_set(i)) + goto err; + break; + case BCACHE_BSET_VERSION: + if (i->csum != btree_csum_set(b, i)) + goto err; + break; + } + + err = "empty set"; + if (i != b->sets[0].data && !i->keys) + goto err; + + bch_btree_iter_push(iter, i->start, end(i)); + + b->written += set_blocks(i, b->c); + } + + err = "corrupted btree"; + for (i = write_block(b); + index(i, b) < btree_blocks(b); + i = ((void *) i) + block_bytes(b->c)) + if (i->seq == b->sets[0].data->seq) + goto err; + + bch_btree_sort_and_fix_extents(b, iter); + + i = b->sets[0].data; + err = "short btree key"; + if (b->sets[0].size && + bkey_cmp(&b->key, &b->sets[0].end) < 0) + goto err; + + if (b->written < btree_blocks(b)) + bch_bset_init_next(b); +out: + + mutex_unlock(&b->c->fill_lock); + + spin_lock(&b->c->btree_read_time_lock); + bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); + spin_unlock(&b->c->btree_read_time_lock); + + smp_wmb(); /* read_done is our write lock */ + set_btree_node_read_done(b); + + closure_return(cl); +err: + set_btree_node_io_error(b); + bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", + err, PTR_BUCKET_NR(b->c, &b->key, 0), + index(i, b), i->keys); + goto out; +} + +void bch_btree_read(struct btree *b) +{ + BUG_ON(b->nsets || b->written); + + if (!closure_trylock(&b->io.cl, &b->c->cl)) + BUG(); + + b->io_start_time = local_clock(); + + btree_bio_init(b); + b->bio->bi_rw = REQ_META|READ_SYNC; + b->bio->bi_size = KEY_SIZE(&b->key) << 9; + + bch_bio_map(b->bio, b->sets[0].data); + + pr_debug("%s", pbtree(b)); + trace_bcache_btree_read(b->bio); + bch_submit_bbio(b->bio, b->c, &b->key, 0); + + continue_at(&b->io.cl, bch_btree_read_done, system_wq); +} + +static void btree_complete_write(struct btree *b, struct btree_write *w) +{ + if (w->prio_blocked && + !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) + wake_up(&b->c->alloc_wait); + + if (w->journal) { + atomic_dec_bug(w->journal); + __closure_wake_up(&b->c->journal.wait); + } + + if (w->owner) + closure_put(w->owner); + + w->prio_blocked = 0; + w->journal = NULL; + w->owner = NULL; +} + +static void __btree_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct btree_write *w = btree_prev_write(b); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + btree_complete_write(b, w); + + if (btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, + msecs_to_jiffies(30000)); + + closure_return(cl); +} + +static void btree_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct bio_vec *bv; + int n; + + __bio_for_each_segment(bv, b->bio, n, 0) + __free_page(bv->bv_page); + + __btree_write_done(cl); +} + +static void do_btree_write(struct btree *b) +{ + struct closure *cl = &b->io.cl; + struct bset *i = b->sets[b->nsets].data; + BKEY_PADDED(key) k; + + i->version = BCACHE_BSET_VERSION; + i->csum = btree_csum_set(b, i); + + btree_bio_init(b); + b->bio->bi_rw = REQ_META|WRITE_SYNC; + b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + bch_bio_map(b->bio, i); + + bkey_copy(&k.key, &b->key); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); + + if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { + int j; + struct bio_vec *bv; + void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + + bio_for_each_segment(bv, b->bio, j) + memcpy(page_address(bv->bv_page), + base + j * PAGE_SIZE, PAGE_SIZE); + + trace_bcache_btree_write(b->bio); + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + continue_at(cl, btree_write_done, NULL); + } else { + b->bio->bi_vcnt = 0; + bch_bio_map(b->bio, i); + + trace_bcache_btree_write(b->bio); + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + closure_sync(cl); + __btree_write_done(cl); + } +} + +static void __btree_write(struct btree *b) +{ + struct bset *i = b->sets[b->nsets].data; + + BUG_ON(current->bio_list); + + closure_lock(&b->io, &b->c->cl); + cancel_delayed_work(&b->work); + + clear_bit(BTREE_NODE_dirty, &b->flags); + change_bit(BTREE_NODE_write_idx, &b->flags); + + bch_check_key_order(b, i); + BUG_ON(b->written && !i->keys); + + do_btree_write(b); + + pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); + + b->written += set_blocks(i, b->c); + atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, + &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + + bch_btree_sort_lazy(b); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(b); +} + +static void btree_write_work(struct work_struct *w) +{ + struct btree *b = container_of(to_delayed_work(w), struct btree, work); + + down_write(&b->lock); + + if (btree_node_dirty(b)) + __btree_write(b); + up_write(&b->lock); +} + +void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +{ + struct bset *i = b->sets[b->nsets].data; + struct btree_write *w = btree_current_write(b); + + BUG_ON(b->written && + (b->written >= btree_blocks(b) || + i->seq != b->sets[0].data->seq || + !i->keys)); + + if (!btree_node_dirty(b)) { + set_btree_node_dirty(b); + queue_delayed_work(btree_io_wq, &b->work, + msecs_to_jiffies(30000)); + } + + w->prio_blocked += b->prio_blocked; + b->prio_blocked = 0; + + if (op && op->journal && !b->level) { + if (w->journal && + journal_pin_cmp(b->c, w, op)) { + atomic_dec_bug(w->journal); + w->journal = NULL; + } + + if (!w->journal) { + w->journal = op->journal; + atomic_inc(w->journal); + } + } + + if (current->bio_list) + return; + + /* Force write if set is too big */ + if (now || + b->level || + set_bytes(i) > PAGE_SIZE - 48) { + if (op && now) { + /* Must wait on multiple writes */ + BUG_ON(w->owner); + w->owner = &op->cl; + closure_get(&op->cl); + } + + __btree_write(b); + } + BUG_ON(!b->written); +} + +/* + * Btree in memory cache - allocation/freeing + * mca -> memory cache + */ + +static void mca_reinit(struct btree *b) +{ + unsigned i; + + b->flags = 0; + b->written = 0; + b->nsets = 0; + + for (i = 0; i < MAX_BSETS; i++) + b->sets[i].size = 0; + /* + * Second loop starts at 1 because b->sets[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->sets[i].data = NULL; +} + +#define mca_reserve(c) (((c->root && c->root->level) \ + ? c->root->level : 1) * 8 + 16) +#define mca_can_free(c) \ + max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) + +static void mca_data_free(struct btree *b) +{ + struct bset_tree *t = b->sets; + BUG_ON(!closure_is_unlocked(&b->io.cl)); + + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); + + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); + + free_pages((unsigned long) t->data, b->page_order); + + t->prev = NULL; + t->tree = NULL; + t->data = NULL; + list_move(&b->list, &b->c->btree_cache_freed); + b->c->bucket_cache_used--; +} + +static void mca_bucket_free(struct btree *b) +{ + BUG_ON(btree_node_dirty(b)); + + b->key.ptr[0] = 0; + hlist_del_init_rcu(&b->hash); + list_move(&b->list, &b->c->btree_cache_freeable); +} + +static unsigned btree_order(struct bkey *k) +{ + return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); +} + +static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) +{ + struct bset_tree *t = b->sets; + BUG_ON(t->data); + + b->page_order = max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)); + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + list_move(&b->list, &b->c->btree_cache); + b->c->bucket_cache_used++; + return; +err: + mca_data_free(b); +} + +static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) +{ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + init_rwsem(&b->lock); + lockdep_set_novalidate_class(&b->lock); + INIT_LIST_HEAD(&b->list); + INIT_DELAYED_WORK(&b->work, btree_write_work); + b->c = c; + closure_init_unlocked(&b->io); + + mca_data_alloc(b, k, gfp); + return b; +} + +static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +{ + lockdep_assert_held(&b->c->bucket_lock); + + if (!down_write_trylock(&b->lock)) + return -ENOMEM; + + if (b->page_order < min_order) { + rw_unlock(true, b); + return -ENOMEM; + } + + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (cl && btree_node_dirty(b)) + bch_btree_write(b, true, NULL); + + if (cl) + closure_wait_event_async(&b->io.wait, cl, + atomic_read(&b->io.cl.remaining) == -1); + + if (btree_node_dirty(b) || + !closure_is_unlocked(&b->io.cl) || + work_pending(&b->work.work)) { + rw_unlock(true, b); + return -EAGAIN; + } + + return 0; +} + +static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct btree *b, *t; + unsigned long i, nr = sc->nr_to_scan; + + if (c->shrinker_disabled) + return 0; + + if (c->try_harder) + return 0; + + /* + * If nr == 0, we're supposed to return the number of items we have + * cached. Not allowed to return -1. + */ + if (!nr) + return mca_can_free(c) * c->btree_pages; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_WAIT) + mutex_lock(&c->bucket_lock); + else if (!mutex_trylock(&c->bucket_lock)) + return -1; + + nr /= c->btree_pages; + nr = min_t(unsigned long, nr, mca_can_free(c)); + + i = 0; + list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + if (!nr) + break; + + if (++i > 3 && + !mca_reap(b, NULL, 0)) { + mca_data_free(b); + rw_unlock(true, b); + --nr; + } + } + + /* + * Can happen right when we first start up, before we've read in any + * btree nodes + */ + if (list_empty(&c->btree_cache)) + goto out; + + for (i = 0; nr && i < c->bucket_cache_used; i++) { + b = list_first_entry(&c->btree_cache, struct btree, list); + list_rotate_left(&c->btree_cache); + + if (!b->accessed && + !mca_reap(b, NULL, 0)) { + mca_bucket_free(b); + mca_data_free(b); + rw_unlock(true, b); + --nr; + } else + b->accessed = 0; + } +out: + nr = mca_can_free(c) * c->btree_pages; + mutex_unlock(&c->bucket_lock); + return nr; +} + +void bch_btree_cache_free(struct cache_set *c) +{ + struct btree *b; + struct closure cl; + closure_init_stack(&cl); + + if (c->shrink.list.next) + unregister_shrinker(&c->shrink); + + mutex_lock(&c->bucket_lock); + +#ifdef CONFIG_BCACHE_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &c->btree_cache); +#endif + + list_splice(&c->btree_cache_freeable, + &c->btree_cache); + + while (!list_empty(&c->btree_cache)) { + b = list_first_entry(&c->btree_cache, struct btree, list); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + mca_data_free(b); + } + + while (!list_empty(&c->btree_cache_freed)) { + b = list_first_entry(&c->btree_cache_freed, + struct btree, list); + list_del(&b->list); + cancel_delayed_work_sync(&b->work); + kfree(b); + } + + mutex_unlock(&c->bucket_lock); +} + +int bch_btree_cache_alloc(struct cache_set *c) +{ + unsigned i; + + /* XXX: doesn't check for errors */ + + closure_init_unlocked(&c->gc); + + for (i = 0; i < mca_reserve(c); i++) + mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + list_splice_init(&c->btree_cache, + &c->btree_cache_freeable); + +#ifdef CONFIG_BCACHE_DEBUG + mutex_init(&c->verify_lock); + + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + if (c->verify_data && + c->verify_data->sets[0].data) + list_del_init(&c->verify_data->list); + else + c->verify_data = NULL; +#endif + + c->shrink.shrink = bch_mca_shrink; + c->shrink.seeks = 4; + c->shrink.batch = c->btree_pages * 2; + register_shrinker(&c->shrink); + + return 0; +} + +/* Btree in memory cache - hash table */ + +static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) +{ + return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; +} + +static struct btree *mca_find(struct cache_set *c, struct bkey *k) +{ + struct btree *b; + + rcu_read_lock(); + hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) + if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) + goto out; + b = NULL; +out: + rcu_read_unlock(); + return b; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, + int level, struct closure *cl) +{ + int ret = -ENOMEM; + struct btree *i; + + if (!cl) + return ERR_PTR(-ENOMEM); + + /* + * Trying to free up some memory - i.e. reuse some btree nodes - may + * require initiating IO to flush the dirty part of the node. If we're + * running under generic_make_request(), that IO will never finish and + * we would deadlock. Returning -EAGAIN causes the cache lookup code to + * punt to workqueue and retry. + */ + if (current->bio_list) + return ERR_PTR(-EAGAIN); + + if (c->try_harder && c->try_harder != cl) { + closure_wait_event_async(&c->try_wait, cl, !c->try_harder); + return ERR_PTR(-EAGAIN); + } + + /* XXX: tracepoint */ + c->try_harder = cl; + c->try_harder_start = local_clock(); +retry: + list_for_each_entry_reverse(i, &c->btree_cache, list) { + int r = mca_reap(i, cl, btree_order(k)); + if (!r) + return i; + if (r != -ENOMEM) + ret = r; + } + + if (ret == -EAGAIN && + closure_blocking(cl)) { + mutex_unlock(&c->bucket_lock); + closure_sync(cl); + mutex_lock(&c->bucket_lock); + goto retry; + } + + return ERR_PTR(ret); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +{ + if (c->try_harder == cl) { + bch_time_stats_update(&c->try_harder_time, c->try_harder_start); + c->try_harder = NULL; + __closure_wake_up(&c->try_wait); + } +} + +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, + int level, struct closure *cl) +{ + struct btree *b; + + lockdep_assert_held(&c->bucket_lock); + + if (mca_find(c, k)) + return NULL; + + /* btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &c->btree_cache_freeable, list) + if (!mca_reap(b, NULL, btree_order(k))) + goto out; + + /* We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &c->btree_cache_freed, list) + if (!mca_reap(b, NULL, 0)) { + mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); + if (!b->sets[0].data) + goto err; + else + goto out; + } + + b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!down_write_trylock(&b->lock)); + if (!b->sets->data) + goto err; +out: + BUG_ON(!closure_is_unlocked(&b->io.cl)); + + bkey_copy(&b->key, k); + list_move(&b->list, &c->btree_cache); + hlist_del_init_rcu(&b->hash); + hlist_add_head_rcu(&b->hash, mca_hash(c, k)); + + lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); + b->level = level; + + mca_reinit(b); + + return b; +err: + if (b) + rw_unlock(true, b); + + b = mca_cannibalize(c, k, level, cl); + if (!IS_ERR(b)) + goto out; + + return b; +} + +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary, it uses the closure embedded in struct btree_op to wait; + * if that closure is in non blocking mode, will return -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * level and op->lock. + */ +struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, + int level, struct btree_op *op) +{ + int i = 0; + bool write = level <= op->lock; + struct btree *b; + + BUG_ON(level < 0); +retry: + b = mca_find(c, k); + + if (!b) { + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, k, level, &op->cl); + mutex_unlock(&c->bucket_lock); + + if (!b) + goto retry; + if (IS_ERR(b)) + return b; + + bch_btree_read(b); + + if (!write) + downgrade_write(&b->lock); + } else { + rw_lock(write, b, level); + if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { + rw_unlock(write, b); + goto retry; + } + BUG_ON(b->level != level); + } + + b->accessed = 1; + + for (; i <= b->nsets && b->sets[i].size; i++) { + prefetch(b->sets[i].tree); + prefetch(b->sets[i].data); + } + + for (; i <= b->nsets; i++) + prefetch(b->sets[i].data); + + if (!closure_wait_event(&b->io.wait, &op->cl, + btree_node_read_done(b))) { + rw_unlock(write, b); + b = ERR_PTR(-EAGAIN); + } else if (btree_node_io_error(b)) { + rw_unlock(write, b); + b = ERR_PTR(-EIO); + } else + BUG_ON(!b->written); + + return b; +} + +static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) +{ + struct btree *b; + + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, k, level, NULL); + mutex_unlock(&c->bucket_lock); + + if (!IS_ERR_OR_NULL(b)) { + bch_btree_read(b); + rw_unlock(true, b); + } +} + +/* Btree alloc */ + +static void btree_node_free(struct btree *b, struct btree_op *op) +{ + unsigned i; + + /* + * The BUG_ON() in btree_node_get() implies that we must have a write + * lock on parent to free or even invalidate a node + */ + BUG_ON(op->lock <= b->level); + BUG_ON(b == b->c->root); + pr_debug("bucket %s", pbtree(b)); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + if (b->prio_blocked && + !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) + wake_up(&b->c->alloc_wait); + + b->prio_blocked = 0; + + cancel_delayed_work(&b->work); + + mutex_lock(&b->c->bucket_lock); + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); + + bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + PTR_BUCKET(b->c, &b->key, i)); + } + + bch_bucket_free(b->c, &b->key); + mca_bucket_free(b); + mutex_unlock(&b->c->bucket_lock); +} + +struct btree *bch_btree_node_alloc(struct cache_set *c, int level, + struct closure *cl) +{ + BKEY_PADDED(key) k; + struct btree *b = ERR_PTR(-EAGAIN); + + mutex_lock(&c->bucket_lock); +retry: + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) + goto err; + + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, &k.key, level, cl); + if (IS_ERR(b)) + goto err_free; + + if (!b) { + cache_bug(c, + "Tried to allocate bucket that was in btree cache"); + __bkey_put(c, &k.key); + goto retry; + } + + set_btree_node_read_done(b); + b->accessed = 1; + bch_bset_init_next(b); + + mutex_unlock(&c->bucket_lock); + return b; +err_free: + bch_bucket_free(c, &k.key); + __bkey_put(c, &k.key); +err: + mutex_unlock(&c->bucket_lock); + return b; +} + +static struct btree *btree_node_alloc_replacement(struct btree *b, + struct closure *cl) +{ + struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); + if (!IS_ERR_OR_NULL(n)) + bch_btree_sort_into(b, n); + + return n; +} + +/* Garbage collection */ + +uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + uint8_t stale = 0; + unsigned i; + struct bucket *g; + + /* + * ptr_invalid() can't return true for the keys that mark btree nodes as + * freed, but since ptr_bad() returns true we'll never actually use them + * for anything and thus we don't want mark their pointers here + */ + if (!bkey_cmp(k, &ZERO_KEY)) + return stale; + + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(c, k, i)) + continue; + + g = PTR_BUCKET(c, k, i); + + if (gen_after(g->gc_gen, PTR_GEN(k, i))) + g->gc_gen = PTR_GEN(k, i); + + if (ptr_stale(c, k, i)) { + stale = max(stale, ptr_stale(c, k, i)); + continue; + } + + cache_bug_on(GC_MARK(g) && + (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), + c, "inconsistent ptrs: mark = %llu, level = %i", + GC_MARK(g), level); + + if (level) + SET_GC_MARK(g, GC_MARK_METADATA); + else if (KEY_DIRTY(k)) + SET_GC_MARK(g, GC_MARK_DIRTY); + + /* guard against overflow */ + SET_GC_SECTORS_USED(g, min_t(unsigned, + GC_SECTORS_USED(g) + KEY_SIZE(k), + (1 << 14) - 1)); + + BUG_ON(!GC_SECTORS_USED(g)); + } + + return stale; +} + +#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) + +static int btree_gc_mark_node(struct btree *b, unsigned *keys, + struct gc_stat *gc) +{ + uint8_t stale = 0; + unsigned last_dev = -1; + struct bcache_device *d = NULL; + struct bkey *k; + struct btree_iter iter; + struct bset_tree *t; + + gc->nodes++; + + for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + if (last_dev != KEY_INODE(k)) { + last_dev = KEY_INODE(k); + + d = KEY_INODE(k) < b->c->nr_uuids + ? b->c->devices[last_dev] + : NULL; + } + + stale = max(stale, btree_mark_key(b, k)); + + if (bch_ptr_bad(b, k)) + continue; + + *keys += bkey_u64s(k); + + gc->key_bytes += bkey_u64s(k); + gc->nkeys++; + + gc->data += KEY_SIZE(k); + if (KEY_DIRTY(k)) { + gc->dirty += KEY_SIZE(k); + if (d) + d->sectors_dirty_gc += KEY_SIZE(k); + } + } + + for (t = b->sets; t <= &b->sets[b->nsets]; t++) + btree_bug_on(t->size && + bset_written(b, t) && + bkey_cmp(&b->key, &t->end) < 0, + b, "found short btree key in gc"); + + return stale; +} + +static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, + struct btree_op *op) +{ + /* + * We block priorities from being written for the duration of garbage + * collection, so we can't sleep in btree_alloc() -> + * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it + * our closure. + */ + struct btree *n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR_OR_NULL(n)) { + swap(b, n); + + memcpy(k->ptr, b->key.ptr, + sizeof(uint64_t) * KEY_PTRS(&b->key)); + + __bkey_put(b->c, &b->key); + atomic_inc(&b->c->prio_blocked); + b->prio_blocked++; + + btree_node_free(n, op); + up_write(&n->lock); + } + + return b; +} + +/* + * Leaving this at 2 until we've got incremental garbage collection done; it + * could be higher (and has been tested with 4) except that garbage collection + * could take much longer, adversely affecting latency. + */ +#define GC_MERGE_NODES 2U + +struct gc_merge_info { + struct btree *b; + struct bkey *k; + unsigned keys; +}; + +static void btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct gc_stat *gc, struct gc_merge_info *r) +{ + unsigned nodes = 0, keys = 0, blocks; + int i; + + while (nodes < GC_MERGE_NODES && r[nodes].b) + keys += r[nodes++].keys; + + blocks = btree_default_blocks(b->c) * 2 / 3; + + if (nodes < 2 || + __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) + return; + + for (i = nodes - 1; i >= 0; --i) { + if (r[i].b->written) + r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + + if (r[i].b->written) + return; + } + + for (i = nodes - 1; i > 0; --i) { + struct bset *n1 = r[i].b->sets->data; + struct bset *n2 = r[i - 1].b->sets->data; + struct bkey *k, *last = NULL; + + keys = 0; + + if (i == 1) { + /* + * Last node we're not getting rid of - we're getting + * rid of the node at r[0]. Have to try and fit all of + * the remaining keys into this node; we can't ensure + * they will always fit due to rounding and variable + * length keys (shouldn't be possible in practice, + * though) + */ + if (__set_blocks(n1, n1->keys + r->keys, + b->c) > btree_blocks(r[i].b)) + return; + + keys = n2->keys; + last = &r->b->key; + } else + for (k = n2->start; + k < end(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), b->c) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + + BUG_ON(__set_blocks(n1, n1->keys + keys, + b->c) > btree_blocks(r[i].b)); + + if (last) { + bkey_copy_key(&r[i].b->key, last); + bkey_copy_key(r[i].k, last); + } + + memcpy(end(n1), + n2->start, + (void *) node(n2, keys) - (void *) n2->start); + + n1->keys += keys; + + memmove(n2->start, + node(n2, keys), + (void *) end(n2) - (void *) node(n2, keys)); + + n2->keys -= keys; + + r[i].keys = n1->keys; + r[i - 1].keys = n2->keys; + } + + btree_node_free(r->b, op); + up_write(&r->b->lock); + + pr_debug("coalesced %u nodes", nodes); + + gc->nodes--; + nodes--; + + memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); + memset(&r[nodes], 0, sizeof(struct gc_merge_info)); +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + void write(struct btree *r) + { + if (!r->written) + bch_btree_write(r, true, op); + else if (btree_node_dirty(r)) { + BUG_ON(btree_current_write(r)->owner); + btree_current_write(r)->owner = writes; + closure_get(writes); + + bch_btree_write(r, true, NULL); + } + + up_write(&r->lock); + } + + int ret = 0, stale; + unsigned i; + struct gc_merge_info r[GC_MERGE_NODES]; + + memset(r, 0, sizeof(r)); + + while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { + r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = 0; + stale = btree_gc_mark_node(r->b, &r->keys, gc); + + if (!b->written && + (r->b->level || stale > 10 || + b->c->gc_always_rewrite)) + r->b = btree_gc_alloc(r->b, r->k, op); + + if (r->b->level) + ret = btree_gc_recurse(r->b, op, writes, gc); + + if (ret) { + write(r->b); + break; + } + + bkey_copy_key(&b->c->gc_done, r->k); + + if (!b->written) + btree_gc_coalesce(b, op, gc, r); + + if (r[GC_MERGE_NODES - 1].b) + write(r[GC_MERGE_NODES - 1].b); + + memmove(&r[1], &r[0], + sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); + + /* When we've got incremental GC working, we'll want to do + * if (should_resched()) + * return -EAGAIN; + */ + cond_resched(); +#if 0 + if (need_resched()) { + ret = -EAGAIN; + break; + } +#endif + } + + for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) + write(r[i].b); + + /* Might have freed some children, must remove their keys */ + if (!b->written) + bch_btree_sort(b); + + return ret; +} + +static int bch_btree_gc_root(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + struct btree *n = NULL; + unsigned keys = 0; + int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + + if (b->level || stale > 10) + n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR_OR_NULL(n)) + swap(b, n); + + if (b->level) + ret = btree_gc_recurse(b, op, writes, gc); + + if (!b->written || btree_node_dirty(b)) { + atomic_inc(&b->c->prio_blocked); + b->prio_blocked++; + bch_btree_write(b, true, n ? op : NULL); + } + + if (!IS_ERR_OR_NULL(n)) { + closure_sync(&op->cl); + bch_btree_set_root(b); + btree_node_free(n, op); + rw_unlock(true, b); + } + + return ret; +} + +static void btree_gc_start(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + struct bcache_device **d; + unsigned i; + + if (!c->gc_mark_valid) + return; + + mutex_lock(&c->bucket_lock); + + c->gc_mark_valid = 0; + c->gc_done = ZERO_KEY; + + for_each_cache(ca, c, i) + for_each_bucket(b, ca) { + b->gc_gen = b->gen; + if (!atomic_read(&b->pin)) + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + } + + for (d = c->devices; + d < c->devices + c->nr_uuids; + d++) + if (*d) + (*d)->sectors_dirty_gc = 0; + + mutex_unlock(&c->bucket_lock); +} + +size_t bch_btree_gc_finish(struct cache_set *c) +{ + size_t available = 0; + struct bucket *b; + struct cache *ca; + struct bcache_device **d; + unsigned i; + + mutex_lock(&c->bucket_lock); + + set_gc_sectors(c); + c->gc_mark_valid = 1; + c->need_gc = 0; + + if (c->root) + for (i = 0; i < KEY_PTRS(&c->root->key); i++) + SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), + GC_MARK_METADATA); + + for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) + SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), + GC_MARK_METADATA); + + for_each_cache(ca, c, i) { + uint64_t *i; + + ca->invalidate_needs_gc = 0; + + for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) + SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + + for (i = ca->prio_buckets; + i < ca->prio_buckets + prio_buckets(ca) * 2; i++) + SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + + for_each_bucket(b, ca) { + b->last_gc = b->gc_gen; + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + + if (!atomic_read(&b->pin) && + GC_MARK(b) == GC_MARK_RECLAIMABLE) { + available++; + if (!GC_SECTORS_USED(b)) + bch_bucket_add_unused(ca, b); + } + } + } + + for (d = c->devices; + d < c->devices + c->nr_uuids; + d++) + if (*d) { + unsigned long last = + atomic_long_read(&((*d)->sectors_dirty)); + long difference = (*d)->sectors_dirty_gc - last; + + pr_debug("sectors dirty off by %li", difference); + + (*d)->sectors_dirty_last += difference; + + atomic_long_set(&((*d)->sectors_dirty), + (*d)->sectors_dirty_gc); + } + + mutex_unlock(&c->bucket_lock); + return available; +} + +static void bch_btree_gc(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, gc.cl); + int ret; + unsigned long available; + struct gc_stat stats; + struct closure writes; + struct btree_op op; + + uint64_t start_time = local_clock(); + trace_bcache_gc_start(c->sb.set_uuid); + blktrace_msg_all(c, "Starting gc"); + + memset(&stats, 0, sizeof(struct gc_stat)); + closure_init_stack(&writes); + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; + + btree_gc_start(c); + + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&op.cl); + closure_sync(&writes); + + if (ret) { + blktrace_msg_all(c, "Stopped gc"); + pr_warn("gc failed!"); + + continue_at(cl, bch_btree_gc, bch_gc_wq); + } + + /* Possibly wait for new UUIDs or whatever to hit disk */ + bch_journal_meta(c, &op.cl); + closure_sync(&op.cl); + + available = bch_btree_gc_finish(c); + + bch_time_stats_update(&c->btree_gc_time, start_time); + + stats.key_bytes *= sizeof(uint64_t); + stats.dirty <<= 9; + stats.data <<= 9; + stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); + blktrace_msg_all(c, "Finished gc"); + + trace_bcache_gc_end(c->sb.set_uuid); + wake_up(&c->alloc_wait); + + continue_at(cl, bch_moving_gc, bch_gc_wq); +} + +void bch_queue_gc(struct cache_set *c) +{ + closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); +} + +/* Initial partial gc */ + +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, + unsigned long **seen) +{ + int ret; + unsigned i; + struct bkey *k; + struct bucket *g; + struct btree_iter iter; + + for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(b->c, k, i)) + continue; + + g = PTR_BUCKET(b->c, k, i); + + if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), + seen[PTR_DEV(k, i)]) || + !ptr_stale(b->c, k, i)) { + g->gen = PTR_GEN(k, i); + + if (b->level) + g->prio = BTREE_PRIO; + else if (g->prio == BTREE_PRIO) + g->prio = INITIAL_PRIO; + } + } + + btree_mark_key(b, k); + } + + if (b->level) { + k = bch_next_recurse_key(b, &ZERO_KEY); + + while (k) { + struct bkey *p = bch_next_recurse_key(b, k); + if (p) + btree_node_prefetch(b->c, p, b->level - 1); + + ret = btree(check_recurse, k, b, op, seen); + if (ret) + return ret; + + k = p; + } + } + + return 0; +} + +int bch_btree_check(struct cache_set *c, struct btree_op *op) +{ + int ret = -ENOMEM; + unsigned i; + unsigned long *seen[MAX_CACHES_PER_SET]; + + memset(seen, 0, sizeof(seen)); + + for (i = 0; c->cache[i]; i++) { + size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); + seen[i] = kmalloc(n, GFP_KERNEL); + if (!seen[i]) + goto err; + + /* Disables the seen array until prio_read() uses it too */ + memset(seen[i], 0xFF, n); + } + + ret = btree_root(check_recurse, c, op, seen); +err: + for (i = 0; i < MAX_CACHES_PER_SET; i++) + kfree(seen[i]); + return ret; +} + +/* Btree insertion */ + +static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) +{ + struct bset *i = b->sets[b->nsets].data; + + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) end(i) - (void *) where); + + i->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, where); +} + +static bool fix_overlapping_extents(struct btree *b, + struct bkey *insert, + struct btree_iter *iter, + struct btree_op *op) +{ + void subtract_dirty(struct bkey *k, int sectors) + { + struct bcache_device *d = b->c->devices[KEY_INODE(k)]; + + if (KEY_DIRTY(k) && d) + atomic_long_sub(sectors, &d->sectors_dirty); + } + + unsigned old_size, sectors_found = 0; + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + if (!k || + bkey_cmp(&START_KEY(k), insert) >= 0) + break; + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for BTREE_REPLACE + * operations. + */ + + if (op->type == BTREE_REPLACE && + KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned i; + uint64_t offset = KEY_START(k) - + KEY_START(&op->replace); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(&op->replace) || + KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(&op->replace)); + + for (i = 0; i < KEY_PTRS(&op->replace); i++) + if (k->ptr[i] != op->replace.ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. But we have to do + * slightly different things depending on whether the + * old key has been written out yet. + */ + + struct bkey *top; + + subtract_dirty(k, KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. + */ + top = bch_bset_search(b, &b->sets[b->nsets], + insert); + shift_keys(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + shift_keys(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + return false; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + subtract_dirty(k, old_size - KEY_SIZE(k)); + } + +check_failed: + if (op->type == BTREE_REPLACE) { + if (!sectors_found) { + op->insert_collision = true; + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } + + return false; +} + +static bool btree_insert_key(struct btree *b, struct btree_op *op, + struct bkey *k) +{ + struct bset *i = b->sets[b->nsets].data; + struct bkey *m, *prev; + const char *status = "insert"; + + BUG_ON(bkey_cmp(k, &b->key) > 0); + BUG_ON(b->level && !KEY_PTRS(k)); + BUG_ON(!b->level && !KEY_OFFSET(k)); + + if (!b->level) { + struct btree_iter iter; + struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); + + /* + * bset_search() returns the first key that is strictly greater + * than the search key - but for back merging, we want to find + * the first key that is greater than or equal to KEY_START(k) - + * unless KEY_START(k) is 0. + */ + if (KEY_OFFSET(&search)) + SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); + + prev = NULL; + m = bch_btree_iter_init(b, &iter, &search); + + if (fix_overlapping_extents(b, k, &iter, op)) + return false; + + while (m != end(i) && + bkey_cmp(k, &START_KEY(m)) > 0) + prev = m, m = bkey_next(m); + + if (key_merging_disabled(b->c)) + goto insert; + + /* prev is in the tree, if we merge we're done */ + status = "back merging"; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; + + status = "overwrote front"; + if (m != end(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; + + status = "front merge"; + if (m != end(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + } else + m = bch_bset_search(b, &b->sets[b->nsets], k); + +insert: shift_keys(b, m, k); +copy: bkey_copy(m, k); +merged: + bch_check_keys(b, "%s for %s at %s: %s", status, + op_type(op), pbtree(b), pkey(k)); + bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, + op_type(op), pbtree(b), pkey(k)); + + if (b->level && !KEY_OFFSET(k)) + b->prio_blocked++; + + pr_debug("%s for %s at %s: %s", status, + op_type(op), pbtree(b), pkey(k)); + + return true; +} + +bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +{ + bool ret = false; + struct bkey *k; + unsigned oldsize = bch_count_data(b); + + while ((k = bch_keylist_pop(&op->keys))) { + bkey_put(b->c, k, b->level); + ret |= btree_insert_key(b, op, k); + } + + BUG_ON(bch_count_data(b) < oldsize); + return ret; +} + +bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bio *bio) +{ + bool ret = false; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + BKEY_PADDED(k) tmp; + + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1 || + should_split(b)) + goto out; + + op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); + + SET_KEY_PTRS(&op->replace, 1); + get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); + + bkey_copy(&tmp.k, &op->replace); + + BUG_ON(op->type != BTREE_INSERT); + BUG_ON(!btree_insert_key(b, op, &tmp.k)); + bch_btree_write(b, false, NULL); + ret = true; +out: + downgrade_write(&b->lock); + return ret; +} + +static int btree_split(struct btree *b, struct btree_op *op) +{ + bool split, root = b == b->c->root; + struct btree *n1, *n2 = NULL, *n3 = NULL; + uint64_t start_time = local_clock(); + + if (b->level) + set_closure_blocking(&op->cl); + + n1 = btree_node_alloc_replacement(b, &op->cl); + if (IS_ERR(n1)) + goto err; + + split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; + + pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", + pbtree(b), n1->sets[0].data->keys); + + if (split) { + unsigned keys = 0; + + n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); + if (IS_ERR(n2)) + goto err_free1; + + if (root) { + n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); + if (IS_ERR(n3)) + goto err_free2; + } + + bch_btree_insert_keys(n1, op); + + /* Has to be a linear search because we don't have an auxiliary + * search tree yet + */ + + while (keys < (n1->sets[0].data->keys * 3) / 5) + keys += bkey_u64s(node(n1->sets[0].data, keys)); + + bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); + keys += bkey_u64s(node(n1->sets[0].data, keys)); + + n2->sets[0].data->keys = n1->sets[0].data->keys - keys; + n1->sets[0].data->keys = keys; + + memcpy(n2->sets[0].data->start, + end(n1->sets[0].data), + n2->sets[0].data->keys * sizeof(uint64_t)); + + bkey_copy_key(&n2->key, &b->key); + + bch_keylist_add(&op->keys, &n2->key); + bch_btree_write(n2, true, op); + rw_unlock(true, n2); + } else + bch_btree_insert_keys(n1, op); + + bch_keylist_add(&op->keys, &n1->key); + bch_btree_write(n1, true, op); + + if (n3) { + bkey_copy_key(&n3->key, &MAX_KEY); + bch_btree_insert_keys(n3, op); + bch_btree_write(n3, true, op); + + closure_sync(&op->cl); + bch_btree_set_root(n3); + rw_unlock(true, n3); + } else if (root) { + op->keys.top = op->keys.bottom; + closure_sync(&op->cl); + bch_btree_set_root(n1); + } else { + unsigned i; + + bkey_copy(op->keys.top, &b->key); + bkey_copy_key(op->keys.top, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; + + SET_PTR_GEN(op->keys.top, i, g); + } + + bch_keylist_push(&op->keys); + closure_sync(&op->cl); + atomic_inc(&b->c->prio_blocked); + } + + rw_unlock(true, n1); + btree_node_free(b, op); + + bch_time_stats_update(&b->c->btree_split_time, start_time); + + return 0; +err_free2: + __bkey_put(n2->c, &n2->key); + btree_node_free(n2, op); + rw_unlock(true, n2); +err_free1: + __bkey_put(n1->c, &n1->key); + btree_node_free(n1, op); + rw_unlock(true, n1); +err: + if (n3 == ERR_PTR(-EAGAIN) || + n2 == ERR_PTR(-EAGAIN) || + n1 == ERR_PTR(-EAGAIN)) + return -EAGAIN; + + pr_warn("couldn't split"); + return -ENOMEM; +} + +static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, + struct keylist *stack_keys) +{ + if (b->level) { + int ret; + struct bkey *insert = op->keys.bottom; + struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); + + if (!k) { + btree_bug(b, "no key to recurse on at level %i/%i", + b->level, b->c->root->level); + + op->keys.top = op->keys.bottom; + return -EIO; + } + + if (bkey_cmp(insert, k) > 0) { + unsigned i; + + if (op->type == BTREE_REPLACE) { + __bkey_put(b->c, insert); + op->keys.top = op->keys.bottom; + op->insert_collision = true; + return 0; + } + + for (i = 0; i < KEY_PTRS(insert); i++) + atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); + + bkey_copy(stack_keys->top, insert); + + bch_cut_back(k, insert); + bch_cut_front(k, stack_keys->top); + + bch_keylist_push(stack_keys); + } + + ret = btree(insert_recurse, k, b, op, stack_keys); + if (ret) + return ret; + } + + if (!bch_keylist_empty(&op->keys)) { + if (should_split(b)) { + if (op->lock <= b->c->root->level) { + BUG_ON(b->level); + op->lock = b->c->root->level + 1; + return -EINTR; + } + return btree_split(b, op); + } + + BUG_ON(write_block(b) != b->sets[b->nsets].data); + + if (bch_btree_insert_keys(b, op)) + bch_btree_write(b, false, op); + } + + return 0; +} + +int bch_btree_insert(struct btree_op *op, struct cache_set *c) +{ + int ret = 0; + struct keylist stack_keys; + + /* + * Don't want to block with the btree locked unless we have to, + * otherwise we get deadlocks with try_harder and between split/gc + */ + clear_closure_blocking(&op->cl); + + BUG_ON(bch_keylist_empty(&op->keys)); + bch_keylist_copy(&stack_keys, &op->keys); + bch_keylist_init(&op->keys); + + while (!bch_keylist_empty(&stack_keys) || + !bch_keylist_empty(&op->keys)) { + if (bch_keylist_empty(&op->keys)) { + bch_keylist_add(&op->keys, + bch_keylist_pop(&stack_keys)); + op->lock = 0; + } + + ret = btree_root(insert_recurse, c, op, &stack_keys); + + if (ret == -EAGAIN) { + ret = 0; + closure_sync(&op->cl); + } else if (ret) { + struct bkey *k; + + pr_err("error %i trying to insert key for %s", + ret, op_type(op)); + + while ((k = bch_keylist_pop(&stack_keys) ?: + bch_keylist_pop(&op->keys))) + bkey_put(c, k, 0); + } + } + + bch_keylist_free(&stack_keys); + + if (op->journal) + atomic_dec_bug(op->journal); + op->journal = NULL; + return ret; +} + +void bch_btree_set_root(struct btree *b) +{ + unsigned i; + + BUG_ON(!b->written); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); + + mutex_lock(&b->c->bucket_lock); + list_del_init(&b->list); + mutex_unlock(&b->c->bucket_lock); + + b->c->root = b; + __bkey_put(b->c, &b->key); + + bch_journal_meta(b->c, NULL); + pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); +} + +/* Cache lookup */ + +static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, + struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + int ret = 0; + + while (!ret && + !op->lookup_done) { + unsigned sectors = INT_MAX; + + if (KEY_INODE(k) == op->inode) { + if (KEY_START(k) <= bio->bi_sector) + break; + + sectors = min_t(uint64_t, sectors, + KEY_START(k) - bio->bi_sector); + } + + ret = s->d->cache_miss(b, s, bio, sectors); + } + + return ret; +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, + struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + unsigned ptr; + struct bio *n; + + int ret = submit_partial_cache_miss(b, op, k); + if (ret || op->lookup_done) + return ret; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + while (!op->lookup_done && + KEY_INODE(k) == op->inode && + bio->bi_sector < KEY_OFFSET(k)) { + struct bkey *bio_key; + sector_t sector = PTR_OFFSET(k, ptr) + + (bio->bi_sector - KEY_START(k)); + unsigned sectors = min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_sector); + + n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (!n) + return -EAGAIN; + + if (n == bio) + op->lookup_done = true; + + bio_key = &container_of(n, struct bbio, bio)->key; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + bch_bkey_copy_single_ptr(bio_key, k, ptr); + SET_PTR_OFFSET(bio_key, 0, sector); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + trace_bcache_cache_hit(n); + __bch_submit_bbio(n, b->c); + } + + return 0; +} + +int bch_btree_search_recurse(struct btree *b, struct btree_op *op) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + + int ret = 0; + struct bkey *k; + struct btree_iter iter; + bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); + + pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, + (uint64_t) bio->bi_sector); + + do { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (!k) { + /* + * b->key would be exactly what we want, except that + * pointers to btree nodes have nonzero size - we + * wouldn't go far enough + */ + + ret = submit_partial_cache_miss(b, op, + &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + break; + } + + ret = b->level + ? btree(search_recurse, k, b, op) + : submit_partial_cache_hit(b, op, k); + } while (!ret && + !op->lookup_done); + + return ret; +} + +/* Keybuf code */ + +static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) +{ + /* Overlapping keys compare equal */ + if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) + return -1; + if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) + return 1; + return 0; +} + +static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, + struct keybuf_key *r) +{ + return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); +} + +static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, + struct keybuf *buf, struct bkey *end) +{ + struct btree_iter iter; + bch_btree_iter_init(b, &iter, &buf->last_scanned); + + while (!array_freelist_empty(&buf->freelist)) { + struct bkey *k = bch_btree_iter_next_filter(&iter, b, + bch_ptr_bad); + + if (!b->level) { + if (!k) { + buf->last_scanned = b->key; + break; + } + + buf->last_scanned = *k; + if (bkey_cmp(&buf->last_scanned, end) >= 0) + break; + + if (buf->key_predicate(buf, k)) { + struct keybuf_key *w; + + pr_debug("%s", pkey(k)); + + spin_lock(&buf->lock); + + w = array_alloc(&buf->freelist); + + w->private = NULL; + bkey_copy(&w->key, k); + + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); + + spin_unlock(&buf->lock); + } + } else { + if (!k) + break; + + btree(refill_keybuf, k, b, op, buf, end); + /* + * Might get an error here, but can't really do anything + * and it'll get logged elsewhere. Just read what we + * can. + */ + + if (bkey_cmp(&buf->last_scanned, end) >= 0) + break; + + cond_resched(); + } + } + + return 0; +} + +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end) +{ + struct bkey start = buf->last_scanned; + struct btree_op op; + bch_btree_op_init_stack(&op); + + cond_resched(); + + btree_root(refill_keybuf, c, &op, buf, end); + closure_sync(&op.cl); + + pr_debug("found %s keys from %llu:%llu to %llu:%llu", + RB_EMPTY_ROOT(&buf->keys) ? "no" : + array_freelist_empty(&buf->freelist) ? "some" : "a few", + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); + + spin_lock(&buf->lock); + + if (!RB_EMPTY_ROOT(&buf->keys)) { + struct keybuf_key *w; + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + buf->start = START_KEY(&w->key); + + w = RB_LAST(&buf->keys, struct keybuf_key, node); + buf->end = w->key; + } else { + buf->start = MAX_KEY; + buf->end = MAX_KEY; + } + + spin_unlock(&buf->lock); +} + +static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + rb_erase(&w->node, &buf->keys); + array_free(&buf->freelist, w); +} + +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + spin_lock(&buf->lock); + __bch_keybuf_del(buf, w); + spin_unlock(&buf->lock); +} + +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end) +{ + bool ret = false; + struct keybuf_key *p, *w, s; + s.key = *start; + + if (bkey_cmp(end, &buf->start) <= 0 || + bkey_cmp(start, &buf->end) >= 0) + return false; + + spin_lock(&buf->lock); + w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); + + while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { + p = w; + w = RB_NEXT(w, node); + + if (p->private) + ret = true; + else + __bch_keybuf_del(buf, p); + } + + spin_unlock(&buf->lock); + return ret; +} + +struct keybuf_key *bch_keybuf_next(struct keybuf *buf) +{ + struct keybuf_key *w; + spin_lock(&buf->lock); + + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + + while (w && w->private) + w = RB_NEXT(w, node); + + if (w) + w->private = ERR_PTR(-EINTR); + + spin_unlock(&buf->lock); + return w; +} + +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end) +{ + struct keybuf_key *ret; + + while (1) { + ret = bch_keybuf_next(buf); + if (ret) + break; + + if (bkey_cmp(&buf->last_scanned, end) >= 0) { + pr_debug("scan finished"); + break; + } + + bch_refill_keybuf(c, buf, end); + } + + return ret; +} + +void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) +{ + buf->key_predicate = fn; + buf->last_scanned = MAX_KEY; + buf->keys = RB_ROOT; + + spin_lock_init(&buf->lock); + array_allocator_init(&buf->freelist); +} + +void bch_btree_exit(void) +{ + if (btree_io_wq) + destroy_workqueue(btree_io_wq); + if (bch_gc_wq) + destroy_workqueue(bch_gc_wq); +} + +int __init bch_btree_init(void) +{ + if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || + !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h new file mode 100644 index 00000000000..af4a7092a28 --- /dev/null +++ b/drivers/md/bcache/btree.h @@ -0,0 +1,405 @@ +#ifndef _BCACHE_BTREE_H +#define _BCACHE_BTREE_H + +/* + * THE BTREE: + * + * At a high level, bcache's btree is relatively standard b+ tree. All keys and + * pointers are in the leaves; interior nodes only have pointers to the child + * nodes. + * + * In the interior nodes, a struct bkey always points to a child btree node, and + * the key is the highest key in the child node - except that the highest key in + * an interior node is always MAX_KEY. The size field refers to the size on disk + * of the child node - this would allow us to have variable sized btree nodes + * (handy for keeping the depth of the btree 1 by expanding just the root). + * + * Btree nodes are themselves log structured, but this is hidden fairly + * thoroughly. Btree nodes on disk will in practice have extents that overlap + * (because they were written at different times), but in memory we never have + * overlapping extents - when we read in a btree node from disk, the first thing + * we do is resort all the sets of keys with a mergesort, and in the same pass + * we check for overlapping extents and adjust them appropriately. + * + * struct btree_op is a central interface to the btree code. It's used for + * specifying read vs. write locking, and the embedded closure is used for + * waiting on IO or reserve memory. + * + * BTREE CACHE: + * + * Btree nodes are cached in memory; traversing the btree might require reading + * in btree nodes which is handled mostly transparently. + * + * bch_btree_node_get() looks up a btree node in the cache and reads it in from + * disk if necessary. This function is almost never called directly though - the + * btree() macro is used to get a btree node, call some function on it, and + * unlock the node after the function returns. + * + * The root is special cased - it's taken out of the cache's lru (thus pinning + * it in memory), so we can find the root of the btree by just dereferencing a + * pointer instead of looking it up in the cache. This makes locking a bit + * tricky, since the root pointer is protected by the lock in the btree node it + * points to - the btree_root() macro handles this. + * + * In various places we must be able to allocate memory for multiple btree nodes + * in order to make forward progress. To do this we use the btree cache itself + * as a reserve; if __get_free_pages() fails, we'll find a node in the btree + * cache we can reuse. We can't allow more than one thread to be doing this at a + * time, so there's a lock, implemented by a pointer to the btree_op closure - + * this allows the btree_root() macro to implicitly release this lock. + * + * BTREE IO: + * + * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles + * this. + * + * For writing, we have two btree_write structs embeddded in struct btree - one + * write in flight, and one being set up, and we toggle between them. + * + * Writing is done with a single function - bch_btree_write() really serves two + * different purposes and should be broken up into two different functions. When + * passing now = false, it merely indicates that the node is now dirty - calling + * it ensures that the dirty keys will be written at some point in the future. + * + * When passing now = true, bch_btree_write() causes a write to happen + * "immediately" (if there was already a write in flight, it'll cause the write + * to happen as soon as the previous write completes). It returns immediately + * though - but it takes a refcount on the closure in struct btree_op you passed + * to it, so a closure_sync() later can be used to wait for the write to + * complete. + * + * This is handy because btree_split() and garbage collection can issue writes + * in parallel, reducing the amount of time they have to hold write locks. + * + * LOCKING: + * + * When traversing the btree, we may need write locks starting at some level - + * inserting a key into the btree will typically only require a write lock on + * the leaf node. + * + * This is specified with the lock field in struct btree_op; lock = 0 means we + * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() + * checks this field and returns the node with the appropriate lock held. + * + * If, after traversing the btree, the insertion code discovers it has to split + * then it must restart from the root and take new locks - to do this it changes + * the lock field and returns -EINTR, which causes the btree_root() macro to + * loop. + * + * Handling cache misses require a different mechanism for upgrading to a write + * lock. We do cache lookups with only a read lock held, but if we get a cache + * miss and we wish to insert this data into the cache, we have to insert a + * placeholder key to detect races - otherwise, we could race with a write and + * overwrite the data that was just written to the cache with stale data from + * the backing device. + * + * For this we use a sequence number that write locks and unlocks increment - to + * insert the check key it unlocks the btree node and then takes a write lock, + * and fails if the sequence number doesn't match. + */ + +#include "bset.h" +#include "debug.h" + +struct btree_write { + struct closure *owner; + atomic_t *journal; + + /* If btree_split() frees a btree node, it writes a new pointer to that + * btree node indicating it was freed; it takes a refcount on + * c->prio_blocked because we can't write the gens until the new + * pointer is on disk. This allows btree_write_endio() to release the + * refcount that btree_split() took. + */ + int prio_blocked; +}; + +struct btree { + /* Hottest entries first */ + struct hlist_node hash; + + /* Key/pointer for this btree node */ + BKEY_PADDED(key); + + /* Single bit - set when accessed, cleared by shrinker */ + unsigned long accessed; + unsigned long seq; + struct rw_semaphore lock; + struct cache_set *c; + + unsigned long flags; + uint16_t written; /* would be nice to kill */ + uint8_t level; + uint8_t nsets; + uint8_t page_order; + + /* + * Set of sorted keys - the real btree node - plus a binary search tree + * + * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree sets[MAX_BSETS]; + + /* Used to refcount bio splits, also protects b->bio */ + struct closure_with_waitlist io; + + /* Gets transferred to w->prio_blocked - see the comment there */ + int prio_blocked; + + struct list_head list; + struct delayed_work work; + + uint64_t io_start_time; + struct btree_write writes[2]; + struct bio *bio; +}; + +#define BTREE_FLAG(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ + +enum btree_flags { + BTREE_NODE_read_done, + BTREE_NODE_io_error, + BTREE_NODE_dirty, + BTREE_NODE_write_idx, +}; + +BTREE_FLAG(read_done); +BTREE_FLAG(io_error); +BTREE_FLAG(dirty); +BTREE_FLAG(write_idx); + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline unsigned bset_offset(struct btree *b, struct bset *i) +{ + return (((size_t) i) - ((size_t) b->sets->data)) >> 9; +} + +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); +} + +static inline bool bset_written(struct btree *b, struct bset_tree *t) +{ + return t->data < write_block(b); +} + +static inline bool bkey_written(struct btree *b, struct bkey *k) +{ + return k < write_block(b)->start; +} + +static inline void set_gc_sectors(struct cache_set *c) +{ + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); +} + +static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) +{ + return __bch_ptr_invalid(b->c, b->level, k); +} + +static inline struct bkey *bch_btree_iter_init(struct btree *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->sets); +} + +/* Looping macros */ + +#define for_each_cached_btree(b, c, iter) \ + for (iter = 0; \ + iter < ARRAY_SIZE((c)->bucket_hash); \ + iter++) \ + hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) + +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), b, filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + +/* Recursing down the btree */ + +struct btree_op { + struct closure cl; + struct cache_set *c; + + /* Journal entry we have a refcount on */ + atomic_t *journal; + + /* Bio to be inserted into the cache */ + struct bio *cache_bio; + + unsigned inode; + + uint16_t write_prio; + + /* Btree level at which we start taking write locks */ + short lock; + + /* Btree insertion type */ + enum { + BTREE_INSERT, + BTREE_REPLACE + } type:8; + + unsigned csum:1; + unsigned skip:1; + unsigned flush_journal:1; + + unsigned insert_data_done:1; + unsigned lookup_done:1; + unsigned insert_collision:1; + + /* Anything after this point won't get zeroed in do_bio_hook() */ + + /* Keys to be inserted */ + struct keylist keys; + BKEY_PADDED(replace); +}; + +void bch_btree_op_init_stack(struct btree_op *); + +static inline void rw_lock(bool w, struct btree *b, int level) +{ + w ? down_write_nested(&b->lock, level + 1) + : down_read_nested(&b->lock, level + 1); + if (w) + b->seq++; +} + +static inline void rw_unlock(bool w, struct btree *b) +{ +#ifdef CONFIG_BCACHE_EDEBUG + unsigned i; + + if (w && + b->key.ptr[0] && + btree_node_read_done(b)) + for (i = 0; i <= b->nsets; i++) + bch_check_key_order(b, b->sets[i].data); +#endif + + if (w) + b->seq++; + (w ? up_write : up_read)(&b->lock); +} + +#define insert_lock(s, b) ((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ + if (!IS_ERR(_b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + rw_unlock(_w, _b); \ + } else \ + _r = PTR_ERR(_b); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c, &(op)->cl); \ + } while (_r == -EINTR); \ + \ + _r; \ +}) + +static inline bool should_split(struct btree *b) +{ + struct bset *i = write_block(b); + return b->written >= btree_blocks(b) || + (i->seq == b->sets[0].data->seq && + b->written + __set_blocks(i, i->keys + 15, b->c) + > btree_blocks(b)); +} + +void bch_btree_read_done(struct closure *); +void bch_btree_read(struct btree *); +void bch_btree_write(struct btree *b, bool now, struct btree_op *op); + +void bch_cannibalize_unlock(struct cache_set *, struct closure *); +void bch_btree_set_root(struct btree *); +struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); +struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, + int, struct btree_op *); + +bool bch_btree_insert_keys(struct btree *, struct btree_op *); +bool bch_btree_insert_check_key(struct btree *, struct btree_op *, + struct bio *); +int bch_btree_insert(struct btree_op *, struct cache_set *); + +int bch_btree_search_recurse(struct btree *, struct btree_op *); + +void bch_queue_gc(struct cache_set *); +size_t bch_btree_gc_finish(struct cache_set *); +void bch_moving_gc(struct closure *); +int bch_btree_check(struct cache_set *, struct btree_op *); +uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); + +void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); +bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, + struct bkey *); +void bch_keybuf_del(struct keybuf *, struct keybuf_key *); +struct keybuf_key *bch_keybuf_next(struct keybuf *); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, + struct keybuf *, struct bkey *); + +#endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c new file mode 100644 index 00000000000..bd05a9a8c7c --- /dev/null +++ b/drivers/md/bcache/closure.c @@ -0,0 +1,345 @@ +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/seq_file.h> + +#include "closure.h" + +void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} +EXPORT_SYMBOL_GPL(closure_queue); + +#define CL_FIELD(type, field) \ + case TYPE_ ## type: \ + return &container_of(cl, struct type, cl)->field + +static struct closure_waitlist *closure_waitlist(struct closure *cl) +{ + switch (cl->type) { + CL_FIELD(closure_with_waitlist, wait); + CL_FIELD(closure_with_waitlist_and_timer, wait); + default: + return NULL; + } +} + +static struct timer_list *closure_timer(struct closure *cl) +{ + switch (cl->type) { + CL_FIELD(closure_with_timer, timer); + CL_FIELD(closure_with_waitlist_and_timer, timer); + default: + return NULL; + } +} + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + int r = flags & CLOSURE_REMAINING_MASK; + + BUG_ON(flags & CLOSURE_GUARD_MASK); + BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); + + /* Must deliver precisely one wakeup */ + if (r == 1 && (flags & CLOSURE_SLEEPING)) + wake_up_process(cl->task); + + if (!r) { + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { + /* CLOSURE_BLOCKING might be set - clear it */ + atomic_set(&cl->remaining, + CLOSURE_REMAINING_INITIALIZER); + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + struct closure_waitlist *wait = closure_waitlist(cl); + + closure_debug_destroy(cl); + + atomic_set(&cl->remaining, -1); + + if (wait) + closure_wake_up(wait); + + if (cl->fn) + cl->fn(cl); + + if (parent) + closure_put(parent); + } + } +} + +/* For clearing flags with the same atomic op as a put */ +void closure_sub(struct closure *cl, int v) +{ + closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +} +EXPORT_SYMBOL_GPL(closure_sub); + +void closure_put(struct closure *cl) +{ + closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +} +EXPORT_SYMBOL_GPL(closure_put); + +static void set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->waiting_on = f; +#endif +} + +void __closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + + while (list) { + struct llist_node *t = list; + list = llist_next(list); + + t->next = reverse; + reverse = t; + } + + /* Then do the wakeups */ + + while (reverse) { + cl = container_of(reverse, struct closure, list); + reverse = llist_next(reverse); + + set_waiting(cl, 0); + closure_sub(cl, CLOSURE_WAITING + 1); + } +} +EXPORT_SYMBOL_GPL(__closure_wake_up); + +bool closure_wait(struct closure_waitlist *list, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + set_waiting(cl, _RET_IP_); + atomic_add(CLOSURE_WAITING + 1, &cl->remaining); + llist_add(&cl->list, &list->list); + + return true; +} +EXPORT_SYMBOL_GPL(closure_wait); + +/** + * closure_sync() - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. + */ +void closure_sync(struct closure *cl) +{ + while (1) { + __closure_start_sleep(cl); + closure_set_ret_ip(cl); + + if ((atomic_read(&cl->remaining) & + CLOSURE_REMAINING_MASK) == 1) + break; + + schedule(); + } + + __closure_end_sleep(cl); +} +EXPORT_SYMBOL_GPL(closure_sync); + +/** + * closure_trylock() - try to acquire the closure, without waiting + * @cl: closure to lock + * + * Returns true if the closure was succesfully locked. + */ +bool closure_trylock(struct closure *cl, struct closure *parent) +{ + if (atomic_cmpxchg(&cl->remaining, -1, + CLOSURE_REMAINING_INITIALIZER) != -1) + return false; + + closure_set_ret_ip(cl); + + smp_mb(); + cl->parent = parent; + if (parent) + closure_get(parent); + + closure_debug_create(cl); + return true; +} +EXPORT_SYMBOL_GPL(closure_trylock); + +void __closure_lock(struct closure *cl, struct closure *parent, + struct closure_waitlist *wait_list) +{ + struct closure wait; + closure_init_stack(&wait); + + while (1) { + if (closure_trylock(cl, parent)) + return; + + closure_wait_event_sync(wait_list, &wait, + atomic_read(&cl->remaining) == -1); + } +} +EXPORT_SYMBOL_GPL(__closure_lock); + +static void closure_delay_timer_fn(unsigned long data) +{ + struct closure *cl = (struct closure *) data; + closure_sub(cl, CLOSURE_TIMER + 1); +} + +void do_closure_timer_init(struct closure *cl) +{ + struct timer_list *timer = closure_timer(cl); + + init_timer(timer); + timer->data = (unsigned long) cl; + timer->function = closure_delay_timer_fn; +} +EXPORT_SYMBOL_GPL(do_closure_timer_init); + +bool __closure_delay(struct closure *cl, unsigned long delay, + struct timer_list *timer) +{ + if (atomic_read(&cl->remaining) & CLOSURE_TIMER) + return false; + + BUG_ON(timer_pending(timer)); + + timer->expires = jiffies + delay; + + atomic_add(CLOSURE_TIMER + 1, &cl->remaining); + add_timer(timer); + return true; +} +EXPORT_SYMBOL_GPL(__closure_delay); + +void __closure_flush(struct closure *cl, struct timer_list *timer) +{ + if (del_timer(timer)) + closure_sub(cl, CLOSURE_TIMER + 1); +} +EXPORT_SYMBOL_GPL(__closure_flush); + +void __closure_flush_sync(struct closure *cl, struct timer_list *timer) +{ + if (del_timer_sync(timer)) + closure_sub(cl, CLOSURE_TIMER + 1); +} +EXPORT_SYMBOL_GPL(__closure_flush_sync); + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + +static LIST_HEAD(closure_list); +static DEFINE_SPINLOCK(closure_list_lock); + +void closure_debug_create(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_ALIVE; + + spin_lock_irqsave(&closure_list_lock, flags); + list_add(&cl->all, &closure_list); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL_GPL(closure_debug_create); + +void closure_debug_destroy(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_DEAD; + + spin_lock_irqsave(&closure_list_lock, flags); + list_del(&cl->all); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL_GPL(closure_debug_destroy); + +static struct dentry *debug; + +#define work_data_bits(work) ((unsigned long *)(&(work)->data)) + +static int debug_seq_show(struct seq_file *f, void *data) +{ + struct closure *cl; + spin_lock_irq(&closure_list_lock); + + list_for_each_entry(cl, &closure_list, all) { + int r = atomic_read(&cl->remaining); + + seq_printf(f, "%p: %pF -> %pf p %p r %i ", + cl, (void *) cl->ip, cl->fn, cl->parent, + r & CLOSURE_REMAINING_MASK); + + seq_printf(f, "%s%s%s%s%s%s\n", + test_bit(WORK_STRUCT_PENDING, + work_data_bits(&cl->work)) ? "Q" : "", + r & CLOSURE_RUNNING ? "R" : "", + r & CLOSURE_BLOCKING ? "B" : "", + r & CLOSURE_STACK ? "S" : "", + r & CLOSURE_SLEEPING ? "Sl" : "", + r & CLOSURE_TIMER ? "T" : ""); + + if (r & CLOSURE_WAITING) + seq_printf(f, " W %pF\n", + (void *) cl->waiting_on); + + seq_printf(f, "\n"); + } + + spin_unlock_irq(&closure_list_lock); + return 0; +} + +static int debug_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, debug_seq_show, NULL); +} + +static const struct file_operations debug_ops = { + .owner = THIS_MODULE, + .open = debug_seq_open, + .read = seq_read, + .release = single_release +}; + +void __init closure_debug_init(void) +{ + debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); +} + +#endif + +MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h new file mode 100644 index 00000000000..00039924ea9 --- /dev/null +++ b/drivers/md/bcache/closure.h @@ -0,0 +1,672 @@ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include <linux/llist.h> +#include <linux/sched.h> +#include <linux/workqueue.h> + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. + * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, is a macro that returns the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio, int error) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * For a closure to wait on an arbitrary event, we need to introduce waitlists: + * + * struct closure_waitlist list; + * closure_wait_event(list, cl, condition); + * closure_wake_up(wait_list); + * + * These work analagously to wait_event() and wake_up() - except that instead of + * operating on the current thread (for wait_event()) and lists of threads, they + * operate on an explicit closure and lists of closures. + * + * Because it's a closure we can now wait either synchronously or + * asynchronously. closure_wait_event() returns the current value of the + * condition, and if it returned false continue_at() or closure_sync() can be + * used to wait for it to become true. + * + * It's useful for waiting on things when you can't sleep in the context in + * which you must check the condition (perhaps a spinlock held, or you might be + * beneath generic_make_request() - in which case you can't sleep on IO). + * + * closure_wait_event() will wait either synchronously or asynchronously, + * depending on whether the closure is in blocking mode or not. You can pick a + * mode explicitly with closure_wait_event_sync() and + * closure_wait_event_async(), which do just what you might expect. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + * + * Locking: + * + * Closures are based on work items but they can be thought of as more like + * threads - in that like threads and unlike work items they have a well + * defined lifetime; they are created (with closure_init()) and eventually + * complete after a continue_at(cl, NULL, NULL). + * + * Suppose you've got some larger structure with a closure embedded in it that's + * used for periodically doing garbage collection. You only want one garbage + * collection happening at a time, so the natural thing to do is protect it with + * a lock. However, it's difficult to use a lock protecting a closure correctly + * because the unlock should come after the last continue_to() (additionally, if + * you're using the closure asynchronously a mutex won't work since a mutex has + * to be unlocked by the same process that locked it). + * + * So to make it less error prone and more efficient, we also have the ability + * to use closures as locks: + * + * closure_init_unlocked(); + * closure_trylock(); + * + * That's all we need for trylock() - the last closure_put() implicitly unlocks + * it for you. But for closure_lock(), we also need a wait list: + * + * struct closure_with_waitlist frobnicator_cl; + * + * closure_init_unlocked(&frobnicator_cl); + * closure_lock(&frobnicator_cl); + * + * A closure_with_waitlist embeds a closure and a wait list - much like struct + * delayed_work embeds a work item and a timer_list. The important thing is, use + * it exactly like you would a regular closure and closure_put() will magically + * handle everything for you. + * + * We've got closures that embed timers, too. They're called, appropriately + * enough: + * struct closure_with_timer; + * + * This gives you access to closure_delay(). It takes a refcount for a specified + * number of jiffies - you could then call closure_sync() (for a slightly + * convoluted version of msleep()) or continue_at() - which gives you the same + * effect as using a delayed work item, except you can reuse the work_struct + * already embedded in struct closure. + * + * Lastly, there's struct closure_with_waitlist_and_timer. It does what you + * probably expect, if you happen to need the features of both. (You don't + * really want to know how all this is implemented, but if I've done my job + * right you shouldn't have to care). + */ + +struct closure; +typedef void (closure_fn) (struct closure *); + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_type { + TYPE_closure = 0, + TYPE_closure_with_waitlist = 1, + TYPE_closure_with_timer = 2, + TYPE_closure_with_waitlist_and_timer = 3, + MAX_CLOSURE_TYPE = 3, +}; + +enum closure_state { + /* + * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of + * waiting asynchronously + * + * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep + * - indicates that cl->task is valid and closure_put() may wake it up. + * Only set or cleared by the thread that owns the closure. + * + * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure + * has an outstanding timer. Must be set by the thread that owns the + * closure, and cleared by the timer function when the timer goes off. + * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs then next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + * + * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a + * closure with this flag set + */ + + CLOSURE_BITS_START = (1 << 19), + CLOSURE_DESTRUCTOR = (1 << 19), + CLOSURE_BLOCKING = (1 << 21), + CLOSURE_WAITING = (1 << 23), + CLOSURE_SLEEPING = (1 << 25), + CLOSURE_TIMER = (1 << 27), + CLOSURE_RUNNING = (1 << 29), + CLOSURE_STACK = (1 << 31), +}; + +#define CLOSURE_GUARD_MASK \ + ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ + CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct task_struct *task; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + + enum closure_type type; + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e + + unsigned magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +struct closure_with_waitlist { + struct closure cl; + struct closure_waitlist wait; +}; + +struct closure_with_timer { + struct closure cl; + struct timer_list timer; +}; + +struct closure_with_waitlist_and_timer { + struct closure cl; + struct closure_waitlist wait; + struct timer_list timer; +}; + +extern unsigned invalid_closure_type(void); + +#define __CLOSURE_TYPE(cl, _t) \ + __builtin_types_compatible_p(typeof(cl), struct _t) \ + ? TYPE_ ## _t : \ + +#define __closure_type(cl) \ +( \ + __CLOSURE_TYPE(cl, closure) \ + __CLOSURE_TYPE(cl, closure_with_waitlist) \ + __CLOSURE_TYPE(cl, closure_with_timer) \ + __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ + invalid_closure_type() \ +) + +void closure_sub(struct closure *cl, int v); +void closure_put(struct closure *cl); +void closure_queue(struct closure *cl); +void __closure_wake_up(struct closure_waitlist *list); +bool closure_wait(struct closure_waitlist *list, struct closure *cl); +void closure_sync(struct closure *cl); + +bool closure_trylock(struct closure *cl, struct closure *parent); +void __closure_lock(struct closure *cl, struct closure *parent, + struct closure_waitlist *wait_list); + +void do_closure_timer_init(struct closure *cl); +bool __closure_delay(struct closure *cl, unsigned long delay, + struct timer_list *timer); +void __closure_flush(struct closure *cl, struct timer_list *timer); +void __closure_flush_sync(struct closure *cl, struct timer_list *timer); + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + +void closure_debug_init(void); +void closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void closure_debug_init(void) {} +static inline void closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline bool closure_is_stopped(struct closure *cl) +{ + return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); +} + +static inline bool closure_is_unlocked(struct closure *cl) +{ + return atomic_read(&cl->remaining) == -1; +} + +static inline void do_closure_init(struct closure *cl, struct closure *parent, + bool running) +{ + switch (cl->type) { + case TYPE_closure_with_timer: + case TYPE_closure_with_waitlist_and_timer: + do_closure_timer_init(cl); + default: + break; + } + + cl->parent = parent; + if (parent) + closure_get(parent); + + if (running) { + closure_debug_create(cl); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + } else + atomic_set(&cl->remaining, -1); + + closure_set_ip(cl); +} + +/* + * Hack to get at the embedded closure if there is one, by doing an unsafe cast: + * the result of __closure_type() is thrown away, it's used merely for type + * checking. + */ +#define __to_internal_closure(cl) \ +({ \ + BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ + (struct closure *) cl; \ +}) + +#define closure_init_type(cl, parent, running) \ +do { \ + struct closure *_cl = __to_internal_closure(cl); \ + _cl->type = __closure_type(*(cl)); \ + do_closure_init(_cl, parent, running); \ +} while (0) + +/** + * __closure_init() - Initialize a closure, skipping the memset() + * + * May be used instead of closure_init() when memory has already been zeroed. + */ +#define __closure_init(cl, parent) \ + closure_init_type(cl, parent, true) + +/** + * closure_init() - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. + */ +#define closure_init(cl, parent) \ +do { \ + memset((cl), 0, sizeof(*(cl))); \ + __closure_init(cl, parent); \ +} while (0) + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| + CLOSURE_BLOCKING|CLOSURE_STACK); +} + +/** + * closure_init_unlocked() - Initialize a closure but leave it unlocked. + * @cl: closure to initialize + * + * For when the closure will be used as a lock. The closure may not be used + * until after a closure_lock() or closure_trylock(). + */ +#define closure_init_unlocked(cl) \ +do { \ + memset((cl), 0, sizeof(*(cl))); \ + closure_init_type(cl, NULL, false); \ +} while (0) + +/** + * closure_lock() - lock and initialize a closure. + * @cl: the closure to lock + * @parent: the new parent for this closure + * + * The closure must be of one of the types that has a waitlist (otherwise we + * wouldn't be able to sleep on contention). + * + * @parent has exactly the same meaning as in closure_init(); if non null, the + * closure will take a reference on @parent which will be released when it is + * unlocked. + */ +#define closure_lock(cl, parent) \ + __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) + +/** + * closure_delay() - delay some number of jiffies + * @cl: the closure that will sleep + * @delay: the delay in jiffies + * + * Takes a refcount on @cl which will be released after @delay jiffies; this may + * be used to have a function run after a delay with continue_at(), or + * closure_sync() may be used for a convoluted version of msleep(). + */ +#define closure_delay(cl, delay) \ + __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) + +#define closure_flush(cl) \ + __closure_flush(__to_internal_closure(cl), &(cl)->timer) + +#define closure_flush_sync(cl) \ + __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) + +static inline void __closure_end_sleep(struct closure *cl) +{ + __set_current_state(TASK_RUNNING); + + if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) + atomic_sub(CLOSURE_SLEEPING, &cl->remaining); +} + +static inline void __closure_start_sleep(struct closure *cl) +{ + closure_set_ip(cl); + cl->task = current; + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + atomic_add(CLOSURE_SLEEPING, &cl->remaining); +} + +/** + * closure_blocking() - returns true if the closure is in blocking mode. + * + * If a closure is in blocking mode, closure_wait_event() will sleep until the + * condition is true instead of waiting asynchronously. + */ +static inline bool closure_blocking(struct closure *cl) +{ + return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; +} + +/** + * set_closure_blocking() - put a closure in blocking mode. + * + * If a closure is in blocking mode, closure_wait_event() will sleep until the + * condition is true instead of waiting asynchronously. + * + * Not thread safe - can only be called by the thread running the closure. + */ +static inline void set_closure_blocking(struct closure *cl) +{ + if (!closure_blocking(cl)) + atomic_add(CLOSURE_BLOCKING, &cl->remaining); +} + +/* + * Not thread safe - can only be called by the thread running the closure. + */ +static inline void clear_closure_blocking(struct closure *cl) +{ + if (closure_blocking(cl)) + atomic_sub(CLOSURE_BLOCKING, &cl->remaining); +} + +/** + * closure_wake_up() - wake up all closures on a wait list. + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + smp_mb(); + __closure_wake_up(list); +} + +/* + * Wait on an event, synchronously or asynchronously - analogous to wait_event() + * but for closures. + * + * The loop is oddly structured so as to avoid a race; we must check the + * condition again after we've added ourself to the waitlist. We know if we were + * already on the waitlist because closure_wait() returns false; thus, we only + * schedule or break if closure_wait() returns false. If it returns true, we + * just loop again - rechecking the condition. + * + * The __closure_wake_up() is necessary because we may race with the event + * becoming true; i.e. we see event false -> wait -> recheck condition, but the + * thread that made the event true may have called closure_wake_up() before we + * added ourself to the wait list. + * + * We have to call closure_sync() at the end instead of just + * __closure_end_sleep() because a different thread might've called + * closure_wake_up() before us and gotten preempted before they dropped the + * refcount on our closure. If this was a stack allocated closure, that would be + * bad. + */ +#define __closure_wait_event(list, cl, condition, _block) \ +({ \ + bool block = _block; \ + typeof(condition) ret; \ + \ + while (1) { \ + ret = (condition); \ + if (ret) { \ + __closure_wake_up(list); \ + if (block) \ + closure_sync(cl); \ + \ + break; \ + } \ + \ + if (block) \ + __closure_start_sleep(cl); \ + \ + if (!closure_wait(list, cl)) { \ + if (!block) \ + break; \ + \ + schedule(); \ + } \ + } \ + \ + ret; \ +}) + +/** + * closure_wait_event() - wait on a condition, synchronously or asynchronously. + * @list: the wait list to wait on + * @cl: the closure that is doing the waiting + * @condition: a C expression for the event to wait for + * + * If the closure is in blocking mode, sleeps until the @condition evaluates to + * true - exactly like wait_event(). + * + * If the closure is not in blocking mode, waits asynchronously; if the + * condition is currently false the @cl is put onto @list and returns. @list + * owns a refcount on @cl; closure_sync() or continue_at() may be used later to + * wait for another thread to wake up @list, which drops the refcount on @cl. + * + * Returns the value of @condition; @cl will be on @list iff @condition was + * false. + * + * closure_wake_up(@list) must be called after changing any variable that could + * cause @condition to become true. + */ +#define closure_wait_event(list, cl, condition) \ + __closure_wait_event(list, cl, condition, closure_blocking(cl)) + +#define closure_wait_event_async(list, cl, condition) \ + __closure_wait_event(list, cl, condition, false) + +#define closure_wait_event_sync(list, cl, condition) \ + __closure_wait_event(list, cl, condition, true) + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + BUG_ON(object_is_on_stack(cl)); + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic_dec(); +} + +#define continue_at(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_sub(_cl, CLOSURE_RUNNING + 1); \ + return; \ +} while (0) + +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_queue(cl); \ + return; \ +} while (0) + +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ + return; \ +} while (0) + +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +static inline void closure_trylock_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + if (closure_trylock(cl, parent)) + continue_at_nobarrier(cl, fn, wq); +} + +#endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c new file mode 100644 index 00000000000..89fd5204924 --- /dev/null +++ b/drivers/md/bcache/debug.c @@ -0,0 +1,565 @@ +/* + * Assorted bcache debug code + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include <linux/console.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/seq_file.h> + +static struct dentry *debug; + +const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +struct keyprint_hack bch_pkey(const struct bkey *k) +{ + unsigned i = 0; + struct keyprint_hack r; + char *out = r.s, *end = r.s + KEYHACK_SIZE; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); + + if (KEY_PTRS(k)) + while (1) { + p("%llu:%llu gen %llu", + PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); + + if (++i == KEY_PTRS(k)) + break; + + p(", "); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p + return r; +} + +struct keyprint_hack bch_pbtree(const struct btree *b) +{ + struct keyprint_hack r; + + snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), + b->level, b->c->root ? b->c->root->level : -1); + return r; +} + +#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) + +static bool skipped_backwards(struct btree *b, struct bkey *k) +{ + return bkey_cmp(k, (!b->level) + ? &START_KEY(bkey_next(k)) + : bkey_next(k)) > 0; +} + +static void dump_bset(struct btree *b, struct bset *i) +{ + struct bkey *k; + unsigned j; + + for (k = i->start; k < end(i); k = bkey_next(k)) { + printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), + (uint64_t *) k - i->d, i->keys, pkey(k)); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + printk(" bucket %zu", n); + + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + printk(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + printk(" %s\n", bch_ptr_status(b->c, k)); + + if (bkey_next(k) < end(i) && + skipped_backwards(b, k)) + printk(KERN_ERR "Key skipped backwards\n"); + } +} + +#endif + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_btree_verify(struct btree *b, struct bset *new) +{ + struct btree *v = b->c->verify_data; + struct closure cl; + closure_init_stack(&cl); + + if (!b->c->verify) + return; + + closure_wait_event(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + + mutex_lock(&b->c->verify_lock); + + bkey_copy(&v->key, &b->key); + v->written = 0; + v->level = b->level; + + bch_btree_read(v); + closure_wait_event(&v->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + + if (new->keys != v->sets[0].data->keys || + memcmp(new->start, + v->sets[0].data->start, + (void *) end(new) - (void *) new->start)) { + unsigned i, j; + + console_lock(); + + printk(KERN_ERR "*** original memory node:\n"); + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + + printk(KERN_ERR "*** sorted memory node:\n"); + dump_bset(b, new); + + printk(KERN_ERR "*** on disk node:\n"); + dump_bset(v, v->sets[0].data); + + for (j = 0; j < new->keys; j++) + if (new->d[j] != v->sets[0].data->d[j]) + break; + + console_unlock(); + panic("verify failed at %u\n", j); + } + + mutex_unlock(&b->c->verify_lock); +} + +static void data_verify_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +void bch_data_verify(struct search *s) +{ + char name[BDEVNAME_SIZE]; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct closure *cl = &s->cl; + struct bio *check; + struct bio_vec *bv; + int i; + + if (!s->unaligned_bvec) + bio_for_each_segment(bv, s->orig_bio, i) + bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; + + check = bio_clone(s->orig_bio, GFP_NOIO); + if (!check) + return; + + if (bch_bio_alloc_pages(check, GFP_NOIO)) + goto out_put; + + check->bi_rw = READ_SYNC; + check->bi_private = cl; + check->bi_end_io = data_verify_endio; + + closure_bio_submit(check, cl, &dc->disk); + closure_sync(cl); + + bio_for_each_segment(bv, s->orig_bio, i) { + void *p1 = kmap(bv->bv_page); + void *p2 = kmap(check->bi_io_vec[i].bv_page); + + if (memcmp(p1 + bv->bv_offset, + p2 + bv->bv_offset, + bv->bv_len)) + printk(KERN_ERR + "bcache (%s): verify failed at sector %llu\n", + bdevname(dc->bdev, name), + (uint64_t) s->orig_bio->bi_sector); + + kunmap(bv->bv_page); + kunmap(check->bi_io_vec[i].bv_page); + } + + __bio_for_each_segment(bv, check, i, 0) + __free_page(bv->bv_page); +out_put: + bio_put(check); +} + +#endif + +#ifdef CONFIG_BCACHE_EDEBUG + +unsigned bch_count_data(struct btree *b) +{ + unsigned ret = 0; + struct btree_iter iter; + struct bkey *k; + + if (!b->level) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} + +static void vdump_bucket_and_panic(struct btree *b, const char *fmt, + va_list args) +{ + unsigned i; + + console_lock(); + + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + + vprintk(fmt, args); + + console_unlock(); + + panic("at %s\n", pbtree(b)); +} + +void bch_check_key_order_msg(struct btree *b, struct bset *i, + const char *fmt, ...) +{ + struct bkey *k; + + if (!i->keys) + return; + + for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) + if (skipped_backwards(b, k)) { + va_list args; + va_start(args, fmt); + + vdump_bucket_and_panic(b, fmt, args); + va_end(args); + } +} + +void bch_check_keys(struct btree *b, const char *fmt, ...) +{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + + if (b->level) + return; + + for_each_key(b, k, &iter) { + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { + printk(KERN_ERR "Keys out of order:\n"); + goto bug; + } + + if (bch_ptr_invalid(b, k)) + continue; + + if (p && bkey_cmp(p, &START_KEY(k)) > 0) { + printk(KERN_ERR "Overlapping keys:\n"); + goto bug; + } + p = k; + } + return; +bug: + va_start(args, fmt); + vdump_bucket_and_panic(b, fmt, args); + va_end(args); +} + +#endif + +#ifdef CONFIG_DEBUG_FS + +/* XXX: cache set refcounting */ + +struct dump_iterator { + char buf[PAGE_SIZE]; + size_t bytes; + struct cache_set *c; + struct keybuf keys; +}; + +static bool dump_pred(struct keybuf *buf, struct bkey *k) +{ + return true; +} + +static ssize_t bch_dump_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iterator *i = file->private_data; + ssize_t ret = 0; + + while (size) { + struct keybuf_key *w; + unsigned bytes = min(i->bytes, size); + + int err = copy_to_user(buf, i->buf, bytes); + if (err) + return err; + + ret += bytes; + buf += bytes; + size -= bytes; + i->bytes -= bytes; + memmove(i->buf, i->buf + bytes, i->bytes); + + if (i->bytes) + break; + + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); + if (!w) + break; + + i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); + bch_keybuf_del(&i->keys, w); + } + + return ret; +} + +static int bch_dump_open(struct inode *inode, struct file *file) +{ + struct cache_set *c = inode->i_private; + struct dump_iterator *i; + + i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->c = c; + bch_keybuf_init(&i->keys, dump_pred); + i->keys.last_scanned = KEY(0, 0, 0); + + return 0; +} + +static int bch_dump_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct file_operations cache_set_debug_ops = { + .owner = THIS_MODULE, + .open = bch_dump_open, + .read = bch_dump_read, + .release = bch_dump_release +}; + +void bch_debug_init_cache_set(struct cache_set *c) +{ + if (!IS_ERR_OR_NULL(debug)) { + char name[50]; + snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + + c->debug = debugfs_create_file(name, 0400, debug, c, + &cache_set_debug_ops); + } +} + +#endif + +/* Fuzz tester has rotted: */ +#if 0 + +static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, + const char *buffer, size_t size) +{ + void dump(struct btree *b) + { + struct bset *i; + + for (i = b->sets[0].data; + index(i, b) < btree_blocks(b) && + i->seq == b->sets[0].data->seq; + i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) + dump_bset(b, i); + } + + struct cache_sb *sb; + struct cache_set *c; + struct btree *all[3], *b, *fill, *orig; + int j; + + struct btree_op op; + bch_btree_op_init_stack(&op); + + sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); + if (!sb) + return -ENOMEM; + + sb->bucket_size = 128; + sb->block_size = 4; + + c = bch_cache_set_alloc(sb); + if (!c) + return -ENOMEM; + + for (j = 0; j < 3; j++) { + BUG_ON(list_empty(&c->btree_cache)); + all[j] = list_first_entry(&c->btree_cache, struct btree, list); + list_del_init(&all[j]->list); + + all[j]->key = KEY(0, 0, c->sb.bucket_size); + bkey_copy_key(&all[j]->key, &MAX_KEY); + } + + b = all[0]; + fill = all[1]; + orig = all[2]; + + while (1) { + for (j = 0; j < 3; j++) + all[j]->written = all[j]->nsets = 0; + + bch_bset_init_next(b); + + while (1) { + struct bset *i = write_block(b); + struct bkey *k = op.keys.top; + unsigned rand; + + bkey_init(k); + rand = get_random_int(); + + op.type = rand & 1 + ? BTREE_INSERT + : BTREE_REPLACE; + rand >>= 1; + + SET_KEY_SIZE(k, bucket_remainder(c, rand)); + rand >>= c->bucket_bits; + rand &= 1024 * 512 - 1; + rand += c->sb.bucket_size; + SET_KEY_OFFSET(k, rand); +#if 0 + SET_KEY_PTRS(k, 1); +#endif + bch_keylist_push(&op.keys); + bch_btree_insert_keys(b, &op); + + if (should_split(b) || + set_blocks(i, b->c) != + __set_blocks(i, i->keys + 15, b->c)) { + i->csum = csum_set(i); + + memcpy(write_block(fill), + i, set_bytes(i)); + + b->written += set_blocks(i, b->c); + fill->written = b->written; + if (b->written == btree_blocks(b)) + break; + + bch_btree_sort_lazy(b); + bch_bset_init_next(b); + } + } + + memcpy(orig->sets[0].data, + fill->sets[0].data, + btree_bytes(c)); + + bch_btree_sort(b); + fill->written = 0; + bch_btree_read_done(&fill->io.cl); + + if (b->sets[0].data->keys != fill->sets[0].data->keys || + memcmp(b->sets[0].data->start, + fill->sets[0].data->start, + b->sets[0].data->keys * sizeof(uint64_t))) { + struct bset *i = b->sets[0].data; + struct bkey *k, *l; + + for (k = i->start, + l = fill->sets[0].data->start; + k < end(i); + k = bkey_next(k), l = bkey_next(l)) + if (bkey_cmp(k, l) || + KEY_SIZE(k) != KEY_SIZE(l)) + pr_err("key %zi differs: %s != %s", + (uint64_t *) k - i->d, + pkey(k), pkey(l)); + + for (j = 0; j < 3; j++) { + pr_err("**** Set %i ****", j); + dump(all[j]); + } + panic("\n"); + } + + pr_info("fuzz complete: %i keys", b->sets[0].data->keys); + } +} + +kobj_attribute_write(fuzz, btree_fuzz); +#endif + +void bch_debug_exit(void) +{ + if (!IS_ERR_OR_NULL(debug)) + debugfs_remove_recursive(debug); +} + +int __init bch_debug_init(struct kobject *kobj) +{ + int ret = 0; +#if 0 + ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); + if (ret) + return ret; +#endif + + debug = debugfs_create_dir("bcache", NULL); + return ret; +} diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h new file mode 100644 index 00000000000..f9378a21814 --- /dev/null +++ b/drivers/md/bcache/debug.h @@ -0,0 +1,54 @@ +#ifndef _BCACHE_DEBUG_H +#define _BCACHE_DEBUG_H + +/* Btree/bkey debug printing */ + +#define KEYHACK_SIZE 80 +struct keyprint_hack { + char s[KEYHACK_SIZE]; +}; + +struct keyprint_hack bch_pkey(const struct bkey *k); +struct keyprint_hack bch_pbtree(const struct btree *b); +#define pkey(k) (&bch_pkey(k).s[0]) +#define pbtree(b) (&bch_pbtree(b).s[0]) + +#ifdef CONFIG_BCACHE_EDEBUG + +unsigned bch_count_data(struct btree *); +void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); +void bch_check_keys(struct btree *, const char *, ...); + +#define bch_check_key_order(b, i) \ + bch_check_key_order_msg(b, i, "keys out of order") +#define EBUG_ON(cond) BUG_ON(cond) + +#else /* EDEBUG */ + +#define bch_count_data(b) 0 +#define bch_check_key_order(b, i) do {} while (0) +#define bch_check_key_order_msg(b, i, ...) do {} while (0) +#define bch_check_keys(b, ...) do {} while (0) +#define EBUG_ON(cond) do {} while (0) + +#endif + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_btree_verify(struct btree *, struct bset *); +void bch_data_verify(struct search *); + +#else /* DEBUG */ + +static inline void bch_btree_verify(struct btree *b, struct bset *i) {} +static inline void bch_data_verify(struct search *s) {}; + +#endif + +#ifdef CONFIG_DEBUG_FS +void bch_debug_init_cache_set(struct cache_set *); +#else +static inline void bch_debug_init_cache_set(struct cache_set *c) {} +#endif + +#endif diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c new file mode 100644 index 00000000000..48efd4dea64 --- /dev/null +++ b/drivers/md/bcache/io.c @@ -0,0 +1,397 @@ +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "bset.h" +#include "debug.h" + +static void bch_bi_idx_hack_endio(struct bio *bio, int error) +{ + struct bio *p = bio->bi_private; + + bio_endio(p, error); + bio_put(bio); +} + +static void bch_generic_make_request_hack(struct bio *bio) +{ + if (bio->bi_idx) { + struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); + + memcpy(clone->bi_io_vec, + bio_iovec(bio), + bio_segments(bio) * sizeof(struct bio_vec)); + + clone->bi_sector = bio->bi_sector; + clone->bi_bdev = bio->bi_bdev; + clone->bi_rw = bio->bi_rw; + clone->bi_vcnt = bio_segments(bio); + clone->bi_size = bio->bi_size; + + clone->bi_private = bio; + clone->bi_end_io = bch_bi_idx_hack_endio; + + bio = clone; + } + + /* + * Hack, since drivers that clone bios clone up to bi_max_vecs, but our + * bios might have had more than that (before we split them per device + * limitations). + * + * To be taken out once immutable bvec stuff is in. + */ + bio->bi_max_vecs = bio->bi_vcnt; + + generic_make_request(bio); +} + +/** + * bch_bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio + * unchanged. + * + * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a + * bvec boundry; it is the caller's responsibility to ensure that @bio is not + * freed before the split. + * + * If bch_bio_split() is running under generic_make_request(), it's not safe to + * allocate more than one bio from the same bio set. Therefore, if it is running + * under generic_make_request() it masks out __GFP_WAIT when doing the + * allocation. The caller must check for failure if there's any possibility of + * it being called from under generic_make_request(); it is then the caller's + * responsibility to retry from a safe context (by e.g. punting to workqueue). + */ +struct bio *bch_bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; + struct bio_vec *bv; + struct bio *ret = NULL; + + BUG_ON(sectors <= 0); + + /* + * If we're being called from underneath generic_make_request() and we + * already allocated any bios from this bio set, we risk deadlock if we + * use the mempool. So instead, we possibly fail and let the caller punt + * to workqueue or somesuch and retry in a safe context. + */ + if (current->bio_list) + gfp &= ~__GFP_WAIT; + + if (sectors >= bio_sectors(bio)) + return bio; + + if (bio->bi_rw & REQ_DISCARD) { + ret = bio_alloc_bioset(gfp, 1, bs); + idx = 0; + goto out; + } + + bio_for_each_segment(bv, bio, idx) { + vcnt = idx - bio->bi_idx; + + if (!nbytes) { + ret = bio_alloc_bioset(gfp, vcnt, bs); + if (!ret) + return NULL; + + memcpy(ret->bi_io_vec, bio_iovec(bio), + sizeof(struct bio_vec) * vcnt); + + break; + } else if (nbytes < bv->bv_len) { + ret = bio_alloc_bioset(gfp, ++vcnt, bs); + if (!ret) + return NULL; + + memcpy(ret->bi_io_vec, bio_iovec(bio), + sizeof(struct bio_vec) * vcnt); + + ret->bi_io_vec[vcnt - 1].bv_len = nbytes; + bv->bv_offset += nbytes; + bv->bv_len -= nbytes; + break; + } + + nbytes -= bv->bv_len; + } +out: + ret->bi_bdev = bio->bi_bdev; + ret->bi_sector = bio->bi_sector; + ret->bi_size = sectors << 9; + ret->bi_rw = bio->bi_rw; + ret->bi_vcnt = vcnt; + ret->bi_max_vecs = vcnt; + + bio->bi_sector += sectors; + bio->bi_size -= sectors << 9; + bio->bi_idx = idx; + + if (bio_integrity(bio)) { + if (bio_integrity_clone(ret, bio, gfp)) { + bio_put(ret); + return NULL; + } + + bio_integrity_trim(ret, 0, bio_sectors(ret)); + bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); + } + + return ret; +} + +static unsigned bch_bio_max_sectors(struct bio *bio) +{ + unsigned ret = bio_sectors(bio); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, + queue_max_segments(q)); + struct bio_vec *bv, *end = bio_iovec(bio) + + min_t(int, bio_segments(bio), max_segments); + + if (bio->bi_rw & REQ_DISCARD) + return min(ret, q->limits.max_discard_sectors); + + if (bio_segments(bio) > max_segments || + q->merge_bvec_fn) { + ret = 0; + + for (bv = bio_iovec(bio); bv < end; bv++) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = ret << 9, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn && + q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) + break; + + ret += bv->bv_len >> 9; + } + } + + ret = min(ret, queue_max_sectors(q)); + + WARN_ON(!ret); + ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); + + return ret; +} + +static void bch_bio_submit_split_done(struct closure *cl) +{ + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + + s->bio->bi_end_io = s->bi_end_io; + s->bio->bi_private = s->bi_private; + bio_endio(s->bio, 0); + + closure_debug_destroy(&s->cl); + mempool_free(s, s->p->bio_split_hook); +} + +static void bch_bio_submit_split_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + + if (error) + clear_bit(BIO_UPTODATE, &s->bio->bi_flags); + + bio_put(bio); + closure_put(cl); +} + +static void __bch_bio_submit_split(struct closure *cl) +{ + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + struct bio *bio = s->bio, *n; + + do { + n = bch_bio_split(bio, bch_bio_max_sectors(bio), + GFP_NOIO, s->p->bio_split); + if (!n) + continue_at(cl, __bch_bio_submit_split, system_wq); + + n->bi_end_io = bch_bio_submit_split_endio; + n->bi_private = cl; + + closure_get(cl); + bch_generic_make_request_hack(n); + } while (n != bio); + + continue_at(cl, bch_bio_submit_split_done, NULL); +} + +void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) +{ + struct bio_split_hook *s; + + if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) + goto submit; + + if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) + goto submit; + + s = mempool_alloc(p->bio_split_hook, GFP_NOIO); + + s->bio = bio; + s->p = p; + s->bi_end_io = bio->bi_end_io; + s->bi_private = bio->bi_private; + bio_get(bio); + + closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); + return; +submit: + bch_generic_make_request_hack(bio); +} + +/* Bios with headers */ + +void bch_bbio_free(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + mempool_free(b, c->bio_meta); +} + +struct bio *bch_bbio_alloc(struct cache_set *c) +{ + struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); + struct bio *bio = &b->bio; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = bucket_pages(c); + bio->bi_io_vec = bio->bi_inline_vecs; + + return bio; +} + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + + bio->bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + + b->submit_time_us = local_clock_us(); + closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); +} + +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned ptr) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + bch_bkey_copy_single_ptr(&b->key, k, ptr); + __bch_submit_bbio(bio, c); +} + +/* IO errors */ + +void bch_count_io_errors(struct cache *ca, int error, const char *m) +{ + /* + * The halflife of an error is: + * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh + */ + + if (ca->set->error_decay) { + unsigned count = atomic_inc_return(&ca->io_count); + + while (count > ca->set->error_decay) { + unsigned errors; + unsigned old = count; + unsigned new = count - ca->set->error_decay; + + /* + * First we subtract refresh from count; each time we + * succesfully do so, we rescale the errors once: + */ + + count = atomic_cmpxchg(&ca->io_count, old, new); + + if (count == old) { + count = new; + + errors = atomic_read(&ca->io_errors); + do { + old = errors; + new = ((uint64_t) errors * 127) / 128; + errors = atomic_cmpxchg(&ca->io_errors, + old, new); + } while (old != errors); + } + } + } + + if (error) { + char buf[BDEVNAME_SIZE]; + unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, + &ca->io_errors); + errors >>= IO_ERROR_SHIFT; + + if (errors < ca->set->error_limit) + pr_err("%s: IO error on %s, recovering", + bdevname(ca->bdev, buf), m); + else + bch_cache_set_error(ca->set, + "%s: too many IO errors %s", + bdevname(ca->bdev, buf), m); + } +} + +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + int error, const char *m) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct cache *ca = PTR_CACHE(c, &b->key, 0); + + unsigned threshold = bio->bi_rw & REQ_WRITE + ? c->congested_write_threshold_us + : c->congested_read_threshold_us; + + if (threshold) { + unsigned t = local_clock_us(); + + int us = t - b->submit_time_us; + int congested = atomic_read(&c->congested); + + if (us > (int) threshold) { + int ms = us / 1024; + c->congested_last_us = t; + + ms = min(ms, CONGESTED_MAX + congested); + atomic_sub(ms, &c->congested); + } else if (congested < 0) + atomic_inc(&c->congested); + } + + bch_count_io_errors(ca, error, m); +} + +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + int error, const char *m) +{ + struct closure *cl = bio->bi_private; + + bch_bbio_count_io_errors(c, bio, error, m); + bio_put(bio); + closure_put(cl); +} diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 00000000000..8c8dfdcd9d4 --- /dev/null +++ b/drivers/md/bcache/journal.c @@ -0,0 +1,787 @@ +/* + * bcache journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +/* + * Journal replay/recovery: + * + * This code is all driven from run_cache_set(); we first read the journal + * entries, do some other stuff, then we mark all the keys in the journal + * entries (same as garbage collection would), then we replay them - reinserting + * them into the cache in precisely the same order as they appear in the + * journal. + * + * We only journal keys that go in leaf nodes, which simplifies things quite a + * bit. + */ + +static void journal_read_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +static int journal_read_bucket(struct cache *ca, struct list_head *list, + struct btree_op *op, unsigned bucket_index) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio; + + struct journal_replay *i; + struct jset *j, *data = ca->set->journal.w[0].data; + unsigned len, left, offset = 0; + int ret = 0; + sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + pr_debug("reading %llu", (uint64_t) bucket); + + while (offset < ca->sb.bucket_size) { +reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned, left, PAGE_SECTORS * 8); + + bio_reset(bio); + bio->bi_sector = bucket + offset; + bio->bi_bdev = ca->bdev; + bio->bi_rw = READ; + bio->bi_size = len << 9; + + bio->bi_end_io = journal_read_endio; + bio->bi_private = &op->cl; + bch_bio_map(bio, data); + + closure_bio_submit(bio, &op->cl, ca); + closure_sync(&op->cl); + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ + + j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); + + if (j->magic != jset_magic(ca->set)) + return ret; + + if (bytes > left << 9) + return ret; + + if (bytes > len << 9) + goto reread; + + if (j->csum != csum_set(j)) + return ret; + + blocks = set_blocks(j, ca->set); + + while (!list_empty(list)) { + i = list_first_entry(list, + struct journal_replay, list); + if (i->j.seq >= j->last_seq) + break; + list_del(&i->list); + kfree(i); + } + + list_for_each_entry_reverse(i, list, list) { + if (j->seq == i->j.seq) + goto next_set; + + if (j->seq < i->j.last_seq) + goto next_set; + + if (j->seq > i->j.seq) { + where = &i->list; + goto add; + } + } + + where = list; +add: + i = kmalloc(offsetof(struct journal_replay, j) + + bytes, GFP_KERNEL); + if (!i) + return -ENOMEM; + memcpy(&i->j, j, bytes); + list_add(&i->list, where); + ret = 1; + + ja->seq[bucket_index] = j->seq; +next_set: + offset += blocks * ca->sb.block_size; + len -= blocks * ca->sb.block_size; + j = ((void *) j) + blocks * block_bytes(ca); + } + } + + return ret; +} + +int bch_journal_read(struct cache_set *c, struct list_head *list, + struct btree_op *op) +{ +#define read_bucket(b) \ + ({ \ + int ret = journal_read_bucket(ca, list, op, b); \ + __set_bit(b, bitmap); \ + if (ret < 0) \ + return ret; \ + ret; \ + }) + + struct cache *ca; + unsigned iter; + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG]; + unsigned i, l, r, m; + uint64_t seq; + + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets", ca->sb.njournal_buckets); + + /* Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { + l = (i * 2654435769U) % ca->sb.njournal_buckets; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search"); + + for (l = 0; l < ca->sb.njournal_buckets; l++) { + if (test_bit(l, bitmap)) + continue; + + if (read_bucket(l)) + goto bsearch; + } +bsearch: + /* Binary search */ + m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u", l, r); + + while (l + 1 < r) { + m = (l + r) >> 1; + + if (read_bucket(m)) + l = m; + else + r = m; + } + + /* Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up"); + l = m; + + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; + + if (l == m) + break; + + if (test_bit(l, bitmap)) + continue; + + if (!read_bucket(l)) + break; + } + + seq = 0; + + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + ja->cur_idx = ja->discard_idx = + ja->last_idx = i; + + } + } + + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; + + return 0; +#undef read_bucket +} + +void bch_journal_mark(struct cache_set *c, struct list_head *list) +{ + atomic_t p = { 0 }; + struct bkey *k; + struct journal_replay *i; + struct journal *j = &c->journal; + uint64_t last = j->seq; + + /* + * journal.pin should never fill up - we never write a journal + * entry when it would fill up. But if for some reason it does, we + * iterate over the list in reverse order so that we can just skip that + * refcount instead of bugging. + */ + + list_for_each_entry_reverse(i, list, list) { + BUG_ON(last < i->j.seq); + i->pin = NULL; + + while (last-- != i->j.seq) + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + atomic_set(&fifo_front(&j->pin), 0); + } + + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + i->pin = &fifo_front(&j->pin); + atomic_set(i->pin, 1); + } + + for (k = i->j.start; + k < end(&i->j); + k = bkey_next(k)) { + unsigned j; + + for (j = 0; j < KEY_PTRS(k); j++) { + struct bucket *g = PTR_BUCKET(c, k, j); + atomic_inc(&g->pin); + + if (g->prio == BTREE_PRIO && + !ptr_stale(c, k, j)) + g->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, 0, k); + } + } +} + +int bch_journal_replay(struct cache_set *s, struct list_head *list, + struct btree_op *op) +{ + int ret = 0, keys = 0, entries = 0; + struct bkey *k; + struct journal_replay *i = + list_entry(list->prev, struct journal_replay, list); + + uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + + list_for_each_entry(i, list, list) { + BUG_ON(i->pin && atomic_read(i->pin) != 1); + + if (n != i->j.seq) + pr_err( + "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + + for (k = i->j.start; + k < end(&i->j); + k = bkey_next(k)) { + pr_debug("%s", pkey(k)); + bkey_copy(op->keys.top, k); + bch_keylist_push(&op->keys); + + op->journal = i->pin; + atomic_inc(op->journal); + + ret = bch_btree_insert(op, s); + if (ret) + goto err; + + BUG_ON(!bch_keylist_empty(&op->keys)); + keys++; + + cond_resched(); + } + + if (i->pin) + atomic_dec(i->pin); + n = i->j.seq + 1; + entries++; + } + + pr_info("journal replay done, %i keys in %i entries, seq %llu", + keys, entries, end); + + while (!list_empty(list)) { + i = list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kfree(i); + } +err: + closure_sync(&op->cl); + return ret; +} + +/* Journalling */ + +static void btree_flush_write(struct cache_set *c) +{ + /* + * Try to find the btree node with that references the oldest journal + * entry, best is our current candidate and is locked if non NULL: + */ + struct btree *b, *best = NULL; + unsigned iter; + + for_each_cached_btree(b, c, iter) { + if (!down_write_trylock(&b->lock)) + continue; + + if (!btree_node_dirty(b) || + !btree_current_write(b)->journal) { + rw_unlock(true, b); + continue; + } + + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best), + btree_current_write(b))) { + rw_unlock(true, best); + best = b; + } else + rw_unlock(true, b); + } + + if (best) + goto out; + + /* We can't find the best btree node, just pick the first */ + list_for_each_entry(b, &c->btree_cache, list) + if (!b->level && btree_node_dirty(b)) { + best = b; + rw_lock(true, best, best->level); + goto found; + } + +out: + if (!best) + return; +found: + if (btree_node_dirty(best)) + bch_btree_write(best, true, NULL); + rw_unlock(true, best); +} + +#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) + +static void journal_discard_endio(struct bio *bio, int error) +{ + struct journal_device *ja = + container_of(bio, struct journal_device, discard_bio); + struct cache *ca = container_of(ja, struct cache, journal); + + atomic_set(&ja->discard_in_flight, DISCARD_DONE); + + closure_wake_up(&ca->set->journal.wait); + closure_put(&ca->set->cl); +} + +static void journal_discard_work(struct work_struct *work) +{ + struct journal_device *ja = + container_of(work, struct journal_device, discard_work); + + submit_bio(0, &ja->discard_bio); +} + +static void do_journal_discard(struct cache *ca) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->discard_bio; + + if (!ca->discard) { + ja->discard_idx = ja->last_idx; + return; + } + + switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { + case DISCARD_IN_FLIGHT: + return; + + case DISCARD_DONE: + ja->discard_idx = (ja->discard_idx + 1) % + ca->sb.njournal_buckets; + + atomic_set(&ja->discard_in_flight, DISCARD_READY); + /* fallthrough */ + + case DISCARD_READY: + if (ja->discard_idx == ja->last_idx) + return; + + atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); + + bio_init(bio); + bio->bi_sector = bucket_to_sector(ca->set, + ca->sb.d[ja->discard_idx]); + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_WRITE|REQ_DISCARD; + bio->bi_max_vecs = 1; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_size = bucket_bytes(ca); + bio->bi_end_io = journal_discard_endio; + + closure_get(&ca->set->cl); + INIT_WORK(&ja->discard_work, journal_discard_work); + schedule_work(&ja->discard_work); + } +} + +static void journal_reclaim(struct cache_set *c) +{ + struct bkey *k = &c->journal.key; + struct cache *ca; + uint64_t last_seq; + unsigned iter, n = 0; + atomic_t p; + + while (!atomic_read(&fifo_front(&c->journal.pin))) + fifo_pop(&c->journal.pin, p); + + last_seq = last_seq(&c->journal); + + /* Update last_idx */ + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < last_seq) + ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; + } + + for_each_cache(ca, c, iter) + do_journal_discard(ca); + + if (c->journal.blocks_free) + return; + + /* + * Allocate: + * XXX: Sort by free journal space + */ + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + + /* No space available on this device */ + if (next == ja->discard_idx) + continue; + + ja->cur_idx = next; + k->ptr[n++] = PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + } + + bkey_init(k); + SET_KEY_PTRS(k, n); + + if (n) + c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; + + if (!journal_full(&c->journal)) + __closure_wake_up(&c->journal.wait); +} + +void bch_journal_next(struct journal *j) +{ + atomic_t p = { 1 }; + + j->cur = (j->cur == j->w) + ? &j->w[1] + : &j->w[0]; + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for last_seq() to be calculated correctly + */ + BUG_ON(!fifo_push(&j->pin, p)); + atomic_set(&fifo_back(&j->pin), 1); + + j->cur->data->seq = ++j->seq; + j->cur->need_write = false; + j->cur->data->keys = 0; + + if (fifo_full(&j->pin)) + pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); +} + +static void journal_write_endio(struct bio *bio, int error) +{ + struct journal_write *w = bio->bi_private; + + cache_set_err_on(error, w->c, "journal io error"); + closure_put(&w->c->journal.io.cl); +} + +static void journal_write(struct closure *); + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io.cl); + struct cache_set *c = container_of(j, struct cache_set, journal); + + struct journal_write *w = (j->cur == j->w) + ? &j->w[1] + : &j->w[0]; + + __closure_wake_up(&w->wait); + + if (c->journal_delay_ms) + closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); + + continue_at(cl, journal_write, system_wq); +} + +static void journal_write_unlocked(struct closure *cl) + __releases(c->journal.lock) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache *ca; + struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; + unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; + + struct bio *bio; + struct bio_list list; + bio_list_init(&list); + + if (!w->need_write) { + /* + * XXX: have to unlock closure before we unlock journal lock, + * else we race with bch_journal(). But this way we race + * against cache set unregister. Doh. + */ + set_closure_fn(cl, NULL, NULL); + closure_sub(cl, CLOSURE_RUNNING + 1); + spin_unlock(&c->journal.lock); + return; + } else if (journal_full(&c->journal)) { + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + continue_at(cl, journal_write, system_wq); + } + + c->journal.blocks_free -= set_blocks(w->data, c); + + w->data->btree_level = c->root->level; + + bkey_copy(&w->data->btree_root, &c->root->key); + bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); + + for_each_cache(ca, c, i) + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + + w->data->magic = jset_magic(c); + w->data->version = BCACHE_JSET_VERSION; + w->data->last_seq = last_seq(&c->journal); + w->data->csum = csum_set(w->data); + + for (i = 0; i < KEY_PTRS(k); i++) { + ca = PTR_CACHE(c, k, i); + bio = &ca->journal.bio; + + atomic_long_add(sectors, &ca->meta_sectors_written); + + bio_reset(bio); + bio->bi_sector = PTR_OFFSET(k, i); + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; + bio->bi_size = sectors << 9; + + bio->bi_end_io = journal_write_endio; + bio->bi_private = w; + bch_bio_map(bio, w->data); + + trace_bcache_journal_write(bio); + bio_list_add(&list, bio); + + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } + + atomic_dec_bug(&fifo_back(&c->journal.pin)); + bch_journal_next(&c->journal); + journal_reclaim(c); + + spin_unlock(&c->journal.lock); + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(bio, cl, c->cache[0]); + + continue_at(cl, journal_write_done, NULL); +} + +static void journal_write(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + + spin_lock(&c->journal.lock); + journal_write_unlocked(cl); +} + +static void __journal_try_write(struct cache_set *c, bool noflush) + __releases(c->journal.lock) +{ + struct closure *cl = &c->journal.io.cl; + + if (!closure_trylock(cl, &c->cl)) + spin_unlock(&c->journal.lock); + else if (noflush && journal_full(&c->journal)) { + spin_unlock(&c->journal.lock); + continue_at(cl, journal_write, system_wq); + } else + journal_write_unlocked(cl); +} + +#define journal_try_write(c) __journal_try_write(c, false) + +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct journal_write *w; + + if (CACHE_SYNC(&c->sb)) { + spin_lock(&c->journal.lock); + + w = c->journal.cur; + w->need_write = true; + + if (cl) + BUG_ON(!closure_wait(&w->wait, cl)); + + __journal_try_write(c, true); + } +} + +/* + * Entry point to the journalling code - bio_insert() and btree_invalidate() + * pass bch_journal() a list of keys to be journalled, and then + * bch_journal() hands those same keys off to btree_insert_async() + */ + +void bch_journal(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct cache_set *c = op->c; + struct journal_write *w; + size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; + + if (op->type != BTREE_INSERT || + !CACHE_SYNC(&c->sb)) + goto out; + + /* + * If we're looping because we errored, might already be waiting on + * another journal write: + */ + while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) + closure_sync(cl->parent); + + spin_lock(&c->journal.lock); + + if (journal_full(&c->journal)) { + /* XXX: tracepoint */ + closure_wait(&c->journal.wait, cl); + + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + continue_at(cl, bch_journal, bcache_wq); + } + + w = c->journal.cur; + w->need_write = true; + b = __set_blocks(w->data, w->data->keys + n, c); + + if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || + b > c->journal.blocks_free) { + /* XXX: If we were inserting so many keys that they won't fit in + * an _empty_ journal write, we'll deadlock. For now, handle + * this in bch_keylist_realloc() - but something to think about. + */ + BUG_ON(!w->data->keys); + + /* XXX: tracepoint */ + BUG_ON(!closure_wait(&w->wait, cl)); + + closure_flush(&c->journal.io); + + journal_try_write(c); + continue_at(cl, bch_journal, bcache_wq); + } + + memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); + w->data->keys += n; + + op->journal = &fifo_back(&c->journal.pin); + atomic_inc(op->journal); + + if (op->flush_journal) { + closure_flush(&c->journal.io); + closure_wait(&w->wait, cl->parent); + } + + journal_try_write(c); +out: + bch_btree_insert_async(cl); +} + +void bch_journal_free(struct cache_set *c) +{ + free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); + free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); + free_fifo(&c->journal.pin); +} + +int bch_journal_alloc(struct cache_set *c) +{ + struct journal *j = &c->journal; + + closure_init_unlocked(&j->io); + spin_lock_init(&j->lock); + + c->journal_delay_ms = 100; + + j->w[0].c = c; + j->w[1].c = c; + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h new file mode 100644 index 00000000000..3d7851274b0 --- /dev/null +++ b/drivers/md/bcache/journal.h @@ -0,0 +1,215 @@ +#ifndef _BCACHE_JOURNAL_H +#define _BCACHE_JOURNAL_H + +/* + * THE JOURNAL: + * + * The journal is treated as a circular buffer of buckets - a journal entry + * never spans two buckets. This means (not implemented yet) we can resize the + * journal at runtime, and will be needed for bcache on raw flash support. + * + * Journal entries contain a list of keys, ordered by the time they were + * inserted; thus journal replay just has to reinsert the keys. + * + * We also keep some things in the journal header that are logically part of the + * superblock - all the things that are frequently updated. This is for future + * bcache on raw flash support; the superblock (which will become another + * journal) can't be moved or wear leveled, so it contains just enough + * information to find the main journal, and the superblock only has to be + * rewritten when we want to move/wear level the main journal. + * + * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be + * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions + * from cache misses, which don't have to be journaled, and for writeback and + * moving gc we work around it by flushing the btree to disk before updating the + * gc information. But it is a potential issue with incremental garbage + * collection, and it's fragile. + * + * OPEN JOURNAL ENTRIES: + * + * Each journal entry contains, in the header, the sequence number of the last + * journal entry still open - i.e. that has keys that haven't been flushed to + * disk in the btree. + * + * We track this by maintaining a refcount for every open journal entry, in a + * fifo; each entry in the fifo corresponds to a particular journal + * entry/sequence number. When the refcount at the tail of the fifo goes to + * zero, we pop it off - thus, the size of the fifo tells us the number of open + * journal entries + * + * We take a refcount on a journal entry when we add some keys to a journal + * entry that we're going to insert (held by struct btree_op), and then when we + * insert those keys into the btree the btree write we're setting up takes a + * copy of that refcount (held by struct btree_write). That refcount is dropped + * when the btree write completes. + * + * A struct btree_write can only hold a refcount on a single journal entry, but + * might contain keys for many journal entries - we handle this by making sure + * it always has a refcount on the _oldest_ journal entry of all the journal + * entries it has keys for. + * + * JOURNAL RECLAIM: + * + * As mentioned previously, our fifo of refcounts tells us the number of open + * journal entries; from that and the current journal sequence number we compute + * last_seq - the oldest journal entry we still need. We write last_seq in each + * journal entry, and we also have to keep track of where it exists on disk so + * we don't overwrite it when we loop around the journal. + * + * To do that we track, for each journal bucket, the sequence number of the + * newest journal entry it contains - if we don't need that journal entry we + * don't need anything in that bucket anymore. From that we track the last + * journal bucket we still need; all this is tracked in struct journal_device + * and updated by journal_reclaim(). + * + * JOURNAL FILLING UP: + * + * There are two ways the journal could fill up; either we could run out of + * space to write to, or we could have too many open journal entries and run out + * of room in the fifo of refcounts. Since those refcounts are decremented + * without any locking we can't safely resize that fifo, so we handle it the + * same way. + * + * If the journal fills up, we start flushing dirty btree nodes until we can + * allocate space for a journal write again - preferentially flushing btree + * nodes that are pinning the oldest journal entries first. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +/* Always latest UUID format */ +#define BCACHE_JSET_VERSION_UUID 1 +#define BCACHE_JSET_VERSION 1 + +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ +struct jset { + uint64_t csum; + uint64_t magic; + uint64_t seq; + uint32_t version; + uint32_t keys; + + uint64_t last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + uint16_t btree_level; + uint16_t pad[3]; + + uint64_t prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + uint64_t d[0]; + }; +}; + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + atomic_t *pin; + struct jset j; +}; + +/* + * We put two of these in struct journal; we used them for writes to the + * journal that are being staged or in flight. + */ +struct journal_write { + struct jset *data; +#define JSET_BITS 3 + + struct cache_set *c; + struct closure_waitlist wait; + bool need_write; +}; + +/* Embedded in struct cache_set */ +struct journal { + spinlock_t lock; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure_with_timer io; + + /* Number of blocks free in the bucket(s) we're currently writing to */ + unsigned blocks_free; + uint64_t seq; + DECLARE_FIFO(atomic_t, pin); + + BKEY_PADDED(key); + + struct journal_write w[2], *cur; +}; + +/* + * Embedded in struct cache. First three fields refer to the array of journal + * buckets, in cache_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. + */ + uint64_t seq[SB_JOURNAL_BUCKETS]; + + /* Journal bucket we're currently writing to */ + unsigned cur_idx; + + /* Last journal bucket that still contains an open journal entry */ + unsigned last_idx; + + /* Next journal bucket to be discarded */ + unsigned discard_idx; + +#define DISCARD_READY 0 +#define DISCARD_IN_FLIGHT 1 +#define DISCARD_DONE 2 + /* 1 - discard in flight, -1 - discard completed */ + atomic_t discard_in_flight; + + struct work_struct discard_work; + struct bio discard_bio; + struct bio_vec discard_bv; + + /* Bio for journal reads/writes to this device */ + struct bio bio; + struct bio_vec bv[8]; +}; + +#define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)->journal) > \ + fifo_idx(&(c)->journal.pin, (r)->journal)) + +#define JOURNAL_PIN 20000 + +#define journal_full(j) \ + (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) + +struct closure; +struct cache_set; +struct btree_op; + +void bch_journal(struct closure *); +void bch_journal_next(struct journal *); +void bch_journal_mark(struct cache_set *, struct list_head *); +void bch_journal_meta(struct cache_set *, struct closure *); +int bch_journal_read(struct cache_set *, struct list_head *, + struct btree_op *); +int bch_journal_replay(struct cache_set *, struct list_head *, + struct btree_op *); + +void bch_journal_free(struct cache_set *); +int bch_journal_alloc(struct cache_set *); + +#endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c new file mode 100644 index 00000000000..8589512c972 --- /dev/null +++ b/drivers/md/bcache/movinggc.c @@ -0,0 +1,254 @@ +/* + * Moving/copying garbage collector + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +struct moving_io { + struct keybuf_key *w; + struct search s; + struct bbio bio; +}; + +static bool moving_pred(struct keybuf *buf, struct bkey *k) +{ + struct cache_set *c = container_of(buf, struct cache_set, + moving_gc_keys); + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) { + struct cache *ca = PTR_CACHE(c, k, i); + struct bucket *g = PTR_BUCKET(c, k, i); + + if (GC_SECTORS_USED(g) < ca->gc_move_threshold) + return true; + } + + return false; +} + +/* Moving GC - IO loop */ + +static void moving_io_destructor(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, s.cl); + kfree(io); +} + +static void write_moving_finish(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct bio *bio = &io->bio.bio; + struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); + + while (bv-- != bio->bi_io_vec) + __free_page(bv->bv_page); + + pr_debug("%s %s", io->s.op.insert_collision + ? "collision moving" : "moved", + pkey(&io->w->key)); + + bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); + + atomic_dec_bug(&io->s.op.c->in_flight); + closure_wake_up(&io->s.op.c->moving_gc_wait); + + closure_return_with_destructor(cl, moving_io_destructor); +} + +static void read_moving_endio(struct bio *bio, int error) +{ + struct moving_io *io = container_of(bio->bi_private, + struct moving_io, s.cl); + + if (error) + io->s.error = error; + + bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); +} + +static void moving_init(struct moving_io *io) +{ + struct bio *bio = &io->bio.bio; + + bio_init(bio); + bio_get(bio); + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_size = KEY_SIZE(&io->w->key) << 9; + bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), + PAGE_SECTORS); + bio->bi_private = &io->s.cl; + bio->bi_io_vec = bio->bi_inline_vecs; + bch_bio_map(bio, NULL); +} + +static void write_moving(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct moving_io *io = container_of(s, struct moving_io, s); + + if (!s->error) { + trace_bcache_write_moving(&io->bio.bio); + + moving_init(io); + + io->bio.bio.bi_sector = KEY_START(&io->w->key); + s->op.lock = -1; + s->op.write_prio = 1; + s->op.cache_bio = &io->bio.bio; + + s->writeback = KEY_DIRTY(&io->w->key); + s->op.csum = KEY_CSUM(&io->w->key); + + s->op.type = BTREE_REPLACE; + bkey_copy(&s->op.replace, &io->w->key); + + closure_init(&s->op.cl, cl); + bch_insert_data(&s->op.cl); + } + + continue_at(cl, write_moving_finish, NULL); +} + +static void read_moving_submit(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct moving_io *io = container_of(s, struct moving_io, s); + struct bio *bio = &io->bio.bio; + + trace_bcache_read_moving(bio); + bch_submit_bbio(bio, s->op.c, &io->w->key, 0); + + continue_at(cl, write_moving, bch_gc_wq); +} + +static void read_moving(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, moving_gc); + struct keybuf_key *w; + struct moving_io *io; + struct bio *bio; + + /* XXX: if we error, background writeback could stall indefinitely */ + + while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); + if (!w) + break; + + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->w = w; + io->s.op.inode = KEY_INODE(&w->key); + io->s.op.c = c; + + moving_init(io); + bio = &io->bio.bio; + + bio->bi_rw = READ; + bio->bi_end_io = read_moving_endio; + + if (bch_bio_alloc_pages(bio, GFP_KERNEL)) + goto err; + + pr_debug("%s", pkey(&w->key)); + + closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); + + if (atomic_inc_return(&c->in_flight) >= 64) { + closure_wait_event(&c->moving_gc_wait, cl, + atomic_read(&c->in_flight) < 64); + continue_at(cl, read_moving, bch_gc_wq); + } + } + + if (0) { +err: if (!IS_ERR_OR_NULL(w->private)) + kfree(w->private); + + bch_keybuf_del(&c->moving_gc_keys, w); + } + + closure_return(cl); +} + +static bool bucket_cmp(struct bucket *l, struct bucket *r) +{ + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); +} + +static unsigned bucket_heap_top(struct cache *ca) +{ + return GC_SECTORS_USED(heap_peek(&ca->heap)); +} + +void bch_moving_gc(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, gc.cl); + struct cache *ca; + struct bucket *b; + unsigned i; + + if (!c->copy_gc_enabled) + closure_return(cl); + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) { + unsigned sectors_to_move = 0; + unsigned reserve_sectors = ca->sb.bucket_size * + min(fifo_used(&ca->free), ca->free.size / 2); + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (!GC_SECTORS_USED(b)) + continue; + + if (!heap_full(&ca->heap)) { + sectors_to_move += GC_SECTORS_USED(b); + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + sectors_to_move -= bucket_heap_top(ca); + sectors_to_move += GC_SECTORS_USED(b); + + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_cmp); + } + } + + while (sectors_to_move > reserve_sectors) { + heap_pop(&ca->heap, b, bucket_cmp); + sectors_to_move -= GC_SECTORS_USED(b); + } + + ca->gc_move_threshold = bucket_heap_top(ca); + + pr_debug("threshold %u", ca->gc_move_threshold); + } + + mutex_unlock(&c->bucket_lock); + + c->moving_gc_keys.last_scanned = ZERO_KEY; + + closure_init(&c->moving_gc, cl); + read_moving(&c->moving_gc); + + closure_return(cl); +} + +void bch_moving_init_cache_set(struct cache_set *c) +{ + bch_keybuf_init(&c->moving_gc_keys, moving_pred); +} diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c new file mode 100644 index 00000000000..e5ff12e52d5 --- /dev/null +++ b/drivers/md/bcache/request.c @@ -0,0 +1,1411 @@ +/* + * Main bcache entry point - handle a read or a write request and decide what to + * do with it; the make_request functions are called by the block layer. + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include <linux/cgroup.h> +#include <linux/module.h> +#include <linux/hash.h> +#include <linux/random.h> +#include "blk-cgroup.h" + +#include <trace/events/bcache.h> + +#define CUTOFF_CACHE_ADD 95 +#define CUTOFF_CACHE_READA 90 +#define CUTOFF_WRITEBACK 50 +#define CUTOFF_WRITEBACK_SYNC 75 + +struct kmem_cache *bch_search_cache; + +static void check_should_skip(struct cached_dev *, struct search *); + +/* Cgroup interface */ + +#ifdef CONFIG_CGROUP_BCACHE +static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; + +static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) +{ + struct cgroup_subsys_state *css; + return cgroup && + (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) + ? container_of(css, struct bch_cgroup, css) + : &bcache_default_cgroup; +} + +struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) +{ + struct cgroup_subsys_state *css = bio->bi_css + ? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id) + : task_subsys_state(current, bcache_subsys_id); + + return css + ? container_of(css, struct bch_cgroup, css) + : &bcache_default_cgroup; +} + +static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) +{ + char tmp[1024]; + int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes, + cgroup_to_bcache(cgrp)->cache_mode + 1); + + if (len < 0) + return len; + + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); +} + +static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + int v = bch_read_string_list(buf, bch_cache_modes); + if (v < 0) + return v; + + cgroup_to_bcache(cgrp)->cache_mode = v - 1; + return 0; +} + +static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft) +{ + return cgroup_to_bcache(cgrp)->verify; +} + +static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + cgroup_to_bcache(cgrp)->verify = val; + return 0; +} + +static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); + return atomic_read(&bcachecg->stats.cache_hits); +} + +static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); + return atomic_read(&bcachecg->stats.cache_misses); +} + +static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp, + struct cftype *cft) +{ + struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); + return atomic_read(&bcachecg->stats.cache_bypass_hits); +} + +static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp, + struct cftype *cft) +{ + struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp); + return atomic_read(&bcachecg->stats.cache_bypass_misses); +} + +static struct cftype bch_files[] = { + { + .name = "cache_mode", + .read = cache_mode_read, + .write_string = cache_mode_write, + }, + { + .name = "verify", + .read_u64 = bch_verify_read, + .write_u64 = bch_verify_write, + }, + { + .name = "cache_hits", + .read_u64 = bch_cache_hits_read, + }, + { + .name = "cache_misses", + .read_u64 = bch_cache_misses_read, + }, + { + .name = "cache_bypass_hits", + .read_u64 = bch_cache_bypass_hits_read, + }, + { + .name = "cache_bypass_misses", + .read_u64 = bch_cache_bypass_misses_read, + }, + { } /* terminate */ +}; + +static void init_bch_cgroup(struct bch_cgroup *cg) +{ + cg->cache_mode = -1; +} + +static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) +{ + struct bch_cgroup *cg; + + cg = kzalloc(sizeof(*cg), GFP_KERNEL); + if (!cg) + return ERR_PTR(-ENOMEM); + init_bch_cgroup(cg); + return &cg->css; +} + +static void bcachecg_destroy(struct cgroup *cgroup) +{ + struct bch_cgroup *cg = cgroup_to_bcache(cgroup); + free_css_id(&bcache_subsys, &cg->css); + kfree(cg); +} + +struct cgroup_subsys bcache_subsys = { + .create = bcachecg_create, + .destroy = bcachecg_destroy, + .subsys_id = bcache_subsys_id, + .name = "bcache", + .module = THIS_MODULE, +}; +EXPORT_SYMBOL_GPL(bcache_subsys); +#endif + +static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) +{ +#ifdef CONFIG_CGROUP_BCACHE + int r = bch_bio_to_cgroup(bio)->cache_mode; + if (r >= 0) + return r; +#endif + return BDEV_CACHE_MODE(&dc->sb); +} + +static bool verify(struct cached_dev *dc, struct bio *bio) +{ +#ifdef CONFIG_CGROUP_BCACHE + if (bch_bio_to_cgroup(bio)->verify) + return true; +#endif + return dc->verify; +} + +static void bio_csum(struct bio *bio, struct bkey *k) +{ + struct bio_vec *bv; + uint64_t csum = 0; + int i; + + bio_for_each_segment(bv, bio, i) { + void *d = kmap(bv->bv_page) + bv->bv_offset; + csum = bch_crc64_update(csum, d, bv->bv_len); + kunmap(bv->bv_page); + } + + k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); +} + +/* Insert data into cache */ + +static void bio_invalidate(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct bio *bio = op->cache_bio; + + pr_debug("invalidating %i sectors from %llu", + bio_sectors(bio), (uint64_t) bio->bi_sector); + + while (bio_sectors(bio)) { + unsigned len = min(bio_sectors(bio), 1U << 14); + + if (bch_keylist_realloc(&op->keys, 0, op->c)) + goto out; + + bio->bi_sector += len; + bio->bi_size -= len << 9; + + bch_keylist_add(&op->keys, + &KEY(op->inode, bio->bi_sector, len)); + } + + op->insert_data_done = true; + bio_put(bio); +out: + continue_at(cl, bch_journal, bcache_wq); +} + +struct open_bucket { + struct list_head list; + struct task_struct *last; + unsigned sectors_free; + BKEY_PADDED(key); +}; + +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; + + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} + +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < 6; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + + list_add(&b->list, &c->data_buckets); + } + + return 0; +} + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we look for a bucket where + * the last write to it was sequential with the current write, and failing that + * we look for a bucket that was last used by the same task. + * + * The ideas is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, say you've starting Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. + */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + struct task_struct *task, + struct bkey *alloc) +{ + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last == task) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } + + if (!ret->sectors_free) + ret = NULL; + + return ret; +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If s->writeback is true, will not fail. + */ +static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, + struct search *s) +{ + struct cache_set *c = s->op.c; + struct open_bucket *b; + BKEY_PADDED(key) alloc; + struct closure cl, *w = NULL; + unsigned i; + + if (s->writeback) { + closure_init_stack(&cl); + w = &cl; + } + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ + + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { + unsigned watermark = s->op.write_prio + ? WATERMARK_MOVINGGC + : WATERMARK_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) + return false; + + spin_lock(&c->data_bucket_lock); + } + + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call find_data_bucket(). If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + __bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last = s->task; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &PTR_CACHE(c, &b->key, i)->sectors_written); + } + + if (b->sectors_free < c->sb.block_size) + b->sectors_free = 0; + + /* + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. + */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); + + spin_unlock(&c->data_bucket_lock); + return true; +} + +static void bch_insert_data_error(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + + /* + * Our data write just errored, which means we've got a bunch of keys to + * insert that point to data that wasn't succesfully written. + * + * We don't have to insert those keys but we still have to invalidate + * that region of the cache - so, if we just strip off all the pointers + * from the keys we'll accomplish just that. + */ + + struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; + + while (src != op->keys.top) { + struct bkey *n = bkey_next(src); + + SET_KEY_PTRS(src, 0); + bkey_copy(dst, src); + + dst = bkey_next(dst); + src = n; + } + + op->keys.top = dst; + + bch_journal(cl); +} + +static void bch_insert_data_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + + if (error) { + /* TODO: We could try to recover from this. */ + if (s->writeback) + s->error = error; + else if (s->write) + set_closure_fn(cl, bch_insert_data_error, bcache_wq); + else + set_closure_fn(cl, NULL, NULL); + } + + bch_bbio_endio(op->c, bio, error, "writing data to cache"); +} + +static void bch_insert_data_loop(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + struct bio *bio = op->cache_bio, *n; + + if (op->skip) + return bio_invalidate(cl); + + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { + set_gc_sectors(op->c); + bch_queue_gc(op->c); + } + + do { + unsigned i; + struct bkey *k; + struct bio_set *split = s->d + ? s->d->bio_split : op->c->bio_split; + + /* 1 for the device pointer and 1 for the chksum */ + if (bch_keylist_realloc(&op->keys, + 1 + (op->csum ? 1 : 0), + op->c)) + continue_at(cl, bch_journal, bcache_wq); + + k = op->keys.top; + bkey_init(k); + SET_KEY_INODE(k, op->inode); + SET_KEY_OFFSET(k, bio->bi_sector); + + if (!bch_alloc_sectors(k, bio_sectors(bio), s)) + goto err; + + n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); + if (!n) { + __bkey_put(op->c, k); + continue_at(cl, bch_insert_data_loop, bcache_wq); + } + + n->bi_end_io = bch_insert_data_endio; + n->bi_private = cl; + + if (s->writeback) { + SET_KEY_DIRTY(k, true); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_GC_MARK(PTR_BUCKET(op->c, k, i), + GC_MARK_DIRTY); + } + + SET_KEY_CSUM(k, op->csum); + if (KEY_CSUM(k)) + bio_csum(n, k); + + pr_debug("%s", pkey(k)); + bch_keylist_push(&op->keys); + + trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); + n->bi_rw |= REQ_WRITE; + bch_submit_bbio(n, op->c, k, 0); + } while (n != bio); + + op->insert_data_done = true; + continue_at(cl, bch_journal, bcache_wq); +err: + /* bch_alloc_sectors() blocks if s->writeback = true */ + BUG_ON(s->writeback); + + /* + * But if it's not a writeback write we'd rather just bail out if + * there aren't any buckets ready to write to - it might take awhile and + * we might be starving btree writes for gc or something. + */ + + if (s->write) { + /* + * Writethrough write: We can't complete the write until we've + * updated the index. But we don't want to delay the write while + * we wait for buckets to be freed up, so just invalidate the + * rest of the write. + */ + op->skip = true; + return bio_invalidate(cl); + } else { + /* + * From a cache miss, we can just insert the keys for the data + * we have written or bail out if we didn't do anything. + */ + op->insert_data_done = true; + bio_put(bio); + + if (!bch_keylist_empty(&op->keys)) + continue_at(cl, bch_journal, bcache_wq); + else + closure_return(cl); + } +} + +/** + * bch_insert_data - stick some data in the cache + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data had to be fragmented there will be multiple keys); after the + * data is written it calls bch_journal, and after the keys have been added to + * the next journal write they're inserted into the btree. + * + * It inserts the data in op->cache_bio; bi_sector is used for the key offset, + * and op->inode is used for the key inode. + * + * If op->skip is true, instead of inserting the data it invalidates the region + * of the cache represented by op->cache_bio and op->inode. + */ +void bch_insert_data(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + + bch_keylist_init(&op->keys); + bio_get(op->cache_bio); + bch_insert_data_loop(cl); +} + +void bch_btree_insert_async(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + + if (bch_btree_insert(op, op->c)) { + s->error = -ENOMEM; + op->insert_data_done = true; + } + + if (op->insert_data_done) { + bch_keylist_free(&op->keys); + closure_return(cl); + } else + continue_at(cl, bch_insert_data_loop, bcache_wq); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + + if (error) { + struct search *s = container_of(cl, struct search, cl); + s->error = error; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); +} + +void bch_cache_read_endio(struct bio *bio, int error) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct closure *cl = bio->bi_private; + struct search *s = container_of(cl, struct search, cl); + + /* + * If the bucket was reused while our bio was in flight, we might have + * read the wrong data. Set s->error but not error so it doesn't get + * counted against the cache device, but we'll still reread the data + * from the backing device. + */ + + if (error) + s->error = error; + else if (ptr_stale(s->op.c, &b->key, 0)) { + atomic_long_inc(&s->op.c->cache_read_races); + s->error = -EINTR; + } + + bch_bbio_endio(s->op.c, bio, error, "reading from cache"); +} + +static void bio_complete(struct search *s) +{ + if (s->orig_bio) { + int cpu, rw = bio_data_dir(s->orig_bio); + unsigned long duration = jiffies - s->start_time; + + cpu = part_stat_lock(); + part_round_stats(cpu, &s->d->disk->part0); + part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); + part_stat_unlock(); + + trace_bcache_request_end(s, s->orig_bio); + bio_endio(s->orig_bio, s->error); + s->orig_bio = NULL; + } +} + +static void do_bio_hook(struct search *s) +{ + struct bio *bio = &s->bio.bio; + memcpy(bio, s->orig_bio, sizeof(struct bio)); + + bio->bi_end_io = request_endio; + bio->bi_private = &s->cl; + atomic_set(&bio->bi_cnt, 3); +} + +static void search_free(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + bio_complete(s); + + if (s->op.cache_bio) + bio_put(s->op.cache_bio); + + if (s->unaligned_bvec) + mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); + + closure_debug_destroy(cl); + mempool_free(s, s->d->c->search); +} + +static struct search *search_alloc(struct bio *bio, struct bcache_device *d) +{ + struct bio_vec *bv; + struct search *s = mempool_alloc(d->c->search, GFP_NOIO); + memset(s, 0, offsetof(struct search, op.keys)); + + __closure_init(&s->cl, NULL); + + s->op.inode = d->id; + s->op.c = d->c; + s->d = d; + s->op.lock = -1; + s->task = current; + s->orig_bio = bio; + s->write = (bio->bi_rw & REQ_WRITE) != 0; + s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; + s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; + s->recoverable = 1; + s->start_time = jiffies; + do_bio_hook(s); + + if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { + bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); + memcpy(bv, bio_iovec(bio), + sizeof(struct bio_vec) * bio_segments(bio)); + + s->bio.bio.bi_io_vec = bv; + s->unaligned_bvec = 1; + } + + return s; +} + +static void btree_read_async(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + + int ret = btree_root(search_recurse, op->c, op); + + if (ret == -EAGAIN) + continue_at(cl, btree_read_async, bcache_wq); + + closure_return(cl); +} + +/* Cached devices */ + +static void cached_dev_bio_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + search_free(cl); + cached_dev_put(dc); +} + +/* Process reads */ + +static void cached_dev_read_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->op.insert_collision) + bch_mark_cache_miss_collision(s); + + if (s->op.cache_bio) { + int i; + struct bio_vec *bv; + + __bio_for_each_segment(bv, s->op.cache_bio, i, 0) + __free_page(bv->bv_page); + } + + cached_dev_bio_complete(cl); +} + +static void request_read_error(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bio_vec *bv; + int i; + + if (s->recoverable) { + /* The cache read failed, but we can retry from the backing + * device. + */ + pr_debug("recovering at sector %llu", + (uint64_t) s->orig_bio->bi_sector); + + s->error = 0; + bv = s->bio.bio.bi_io_vec; + do_bio_hook(s); + s->bio.bio.bi_io_vec = bv; + + if (!s->unaligned_bvec) + bio_for_each_segment(bv, s->orig_bio, i) + bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; + else + memcpy(s->bio.bio.bi_io_vec, + bio_iovec(s->orig_bio), + sizeof(struct bio_vec) * + bio_segments(s->orig_bio)); + + /* XXX: invalidate cache */ + + trace_bcache_read_retry(&s->bio.bio); + closure_bio_submit(&s->bio.bio, &s->cl, s->d); + } + + continue_at(cl, cached_dev_read_complete, NULL); +} + +static void request_read_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + /* + * s->cache_bio != NULL implies that we had a cache miss; cache_bio now + * contains data ready to be inserted into the cache. + * + * First, we copy the data we just read from cache_bio's bounce buffers + * to the buffers the original bio pointed to: + */ + + if (s->op.cache_bio) { + struct bio_vec *src, *dst; + unsigned src_offset, dst_offset, bytes; + void *dst_ptr; + + bio_reset(s->op.cache_bio); + s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; + s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; + s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + bch_bio_map(s->op.cache_bio, NULL); + + src = bio_iovec(s->op.cache_bio); + dst = bio_iovec(s->cache_miss); + src_offset = src->bv_offset; + dst_offset = dst->bv_offset; + dst_ptr = kmap(dst->bv_page); + + while (1) { + if (dst_offset == dst->bv_offset + dst->bv_len) { + kunmap(dst->bv_page); + dst++; + if (dst == bio_iovec_idx(s->cache_miss, + s->cache_miss->bi_vcnt)) + break; + + dst_offset = dst->bv_offset; + dst_ptr = kmap(dst->bv_page); + } + + if (src_offset == src->bv_offset + src->bv_len) { + src++; + if (src == bio_iovec_idx(s->op.cache_bio, + s->op.cache_bio->bi_vcnt)) + BUG(); + + src_offset = src->bv_offset; + } + + bytes = min(dst->bv_offset + dst->bv_len - dst_offset, + src->bv_offset + src->bv_len - src_offset); + + memcpy(dst_ptr + dst_offset, + page_address(src->bv_page) + src_offset, + bytes); + + src_offset += bytes; + dst_offset += bytes; + } + + bio_put(s->cache_miss); + s->cache_miss = NULL; + } + + if (verify(dc, &s->bio.bio) && s->recoverable) + bch_data_verify(s); + + bio_complete(s); + + if (s->op.cache_bio && + !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { + s->op.type = BTREE_REPLACE; + closure_call(&s->op.cl, bch_insert_data, NULL, cl); + } + + continue_at(cl, cached_dev_read_complete, NULL); +} + +static void request_read_done_bh(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); + + if (s->error) + continue_at_nobarrier(cl, request_read_error, bcache_wq); + else if (s->op.cache_bio || verify(dc, &s->bio.bio)) + continue_at_nobarrier(cl, request_read_done, bcache_wq); + else + continue_at_nobarrier(cl, cached_dev_read_complete, NULL); +} + +static int cached_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned sectors) +{ + int ret = 0; + unsigned reada; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct bio *miss; + + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (!miss) + return -EAGAIN; + + if (miss == bio) + s->op.lookup_done = true; + + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; + + if (s->cache_miss || s->op.skip) + goto out_submit; + + if (miss != bio || + (bio->bi_rw & REQ_RAHEAD) || + (bio->bi_rw & REQ_META) || + s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) + reada = 0; + else { + reada = min(dc->readahead >> 9, + sectors - bio_sectors(miss)); + + if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) + reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); + } + + s->cache_bio_sectors = bio_sectors(miss) + reada; + s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, + DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), + dc->disk.bio_split); + + if (!s->op.cache_bio) + goto out_submit; + + s->op.cache_bio->bi_sector = miss->bi_sector; + s->op.cache_bio->bi_bdev = miss->bi_bdev; + s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + + s->op.cache_bio->bi_end_io = request_endio; + s->op.cache_bio->bi_private = &s->cl; + + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = -EINTR; + if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) + goto out_put; + + bch_bio_map(s->op.cache_bio, NULL); + if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + goto out_put; + + s->cache_miss = miss; + bio_get(s->op.cache_bio); + + trace_bcache_cache_miss(s->orig_bio); + closure_bio_submit(s->op.cache_bio, &s->cl, s->d); + + return ret; +out_put: + bio_put(s->op.cache_bio); + s->op.cache_bio = NULL; +out_submit: + closure_bio_submit(miss, &s->cl, s->d); + return ret; +} + +static void request_read(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + + check_should_skip(dc, s); + closure_call(&s->op.cl, btree_read_async, NULL, cl); + + continue_at(cl, request_read_done_bh, NULL); +} + +/* Process writes */ + +static void cached_dev_write_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + up_read_non_owner(&dc->writeback_lock); + cached_dev_bio_complete(cl); +} + +static bool should_writeback(struct cached_dev *dc, struct bio *bio) +{ + unsigned threshold = (bio->bi_rw & REQ_SYNC) + ? CUTOFF_WRITEBACK_SYNC + : CUTOFF_WRITEBACK; + + return !atomic_read(&dc->disk.detaching) && + cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && + dc->disk.c->gc_stats.in_use < threshold; +} + +static void request_write(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + struct bio *bio = &s->bio.bio; + struct bkey start, end; + start = KEY(dc->disk.id, bio->bi_sector, 0); + end = KEY(dc->disk.id, bio_end(bio), 0); + + bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); + + check_should_skip(dc, s); + down_read_non_owner(&dc->writeback_lock); + + if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { + s->op.skip = false; + s->writeback = true; + } + + if (bio->bi_rw & REQ_DISCARD) + goto skip; + + if (s->op.skip) + goto skip; + + if (should_writeback(dc, s->orig_bio)) + s->writeback = true; + + if (!s->writeback) { + s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); + + trace_bcache_writethrough(s->orig_bio); + closure_bio_submit(bio, cl, s->d); + } else { + s->op.cache_bio = bio; + trace_bcache_writeback(s->orig_bio); + bch_writeback_add(dc, bio_sectors(bio)); + } +out: + closure_call(&s->op.cl, bch_insert_data, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); +skip: + s->op.skip = true; + s->op.cache_bio = s->orig_bio; + bio_get(s->op.cache_bio); + trace_bcache_write_skip(s->orig_bio); + + if ((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + goto out; + + closure_bio_submit(bio, cl, s->d); + goto out; +} + +static void request_nodata(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + struct bio *bio = &s->bio.bio; + + if (bio->bi_rw & REQ_DISCARD) { + request_write(dc, s); + return; + } + + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + + closure_bio_submit(bio, cl, s->d); + + continue_at(cl, cached_dev_bio_complete, NULL); +} + +/* Cached devices - read & write stuff */ + +int bch_get_congested(struct cache_set *c) +{ + int i; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + return i <= 0 ? 1 : fract_exp_two(i, 6); +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +static void check_should_skip(struct cached_dev *dc, struct search *s) +{ + struct cache_set *c = s->op.c; + struct bio *bio = &s->bio.bio; + + long rand; + int cutoff = bch_get_congested(c); + unsigned mode = cache_mode(dc, bio); + + if (atomic_read(&dc->disk.detaching) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (!cutoff) { + cutoff = dc->sequential_cutoff >> 9; + + if (!cutoff) + goto rescale; + + if (mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + } + + if (dc->sequential_merge) { + struct io *i; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(s->task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; + + i->last = bio_end(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + s->task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + } else { + s->task->sequential_io = bio->bi_size; + + add_sequential(s->task); + } + + rand = get_random_int(); + cutoff -= bitmap_weight(&rand, BITS_PER_LONG); + + if (cutoff <= (int) (max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9)) + goto skip; + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return; +skip: + bch_mark_sectors_bypassed(s, bio_sectors(bio)); + s->op.skip = true; +} + +static void cached_dev_make_request(struct request_queue *q, struct bio *bio) +{ + struct search *s; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + int cpu, rw = bio_data_dir(bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &d->disk->part0, ios[rw]); + part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + bio->bi_bdev = dc->bdev; + bio->bi_sector += dc->sb.data_offset; + + if (cached_dev_get(dc)) { + s = search_alloc(bio, d); + trace_bcache_request_start(s, bio); + + if (!bio_has_data(bio)) + request_nodata(dc, s); + else if (rw) + request_write(dc, s); + else + request_read(dc, s); + } else { + if ((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + bio_endio(bio, 0); + else + bch_generic_make_request(bio, &d->bio_split_hook); + } +} + +static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); +} + +static int cached_dev_congested(void *data, int bits) +{ + struct bcache_device *d = data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + struct request_queue *q = bdev_get_queue(dc->bdev); + int ret = 0; + + if (bdi_congested(&q->backing_dev_info, bits)) + return 1; + + if (cached_dev_get(dc)) { + unsigned i; + struct cache *ca; + + for_each_cache(ca, d->c, i) { + q = bdev_get_queue(ca->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + + cached_dev_put(dc); + } + + return ret; +} + +void bch_cached_dev_request_init(struct cached_dev *dc) +{ + struct gendisk *g = dc->disk.disk; + + g->queue->make_request_fn = cached_dev_make_request; + g->queue->backing_dev_info.congested_fn = cached_dev_congested; + dc->disk.cache_miss = cached_dev_cache_miss; + dc->disk.ioctl = cached_dev_ioctl; +} + +/* Flash backed devices */ + +static int flash_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned sectors) +{ + /* Zero fill bio */ + + while (bio->bi_idx != bio->bi_vcnt) { + struct bio_vec *bv = bio_iovec(bio); + unsigned j = min(bv->bv_len >> 9, sectors); + + void *p = kmap(bv->bv_page); + memset(p + bv->bv_offset, 0, j << 9); + kunmap(bv->bv_page); + + bv->bv_len -= j << 9; + bv->bv_offset += j << 9; + + if (bv->bv_len) + return 0; + + bio->bi_sector += j; + bio->bi_size -= j << 9; + + bio->bi_idx++; + sectors -= j; + } + + s->op.lookup_done = true; + + return 0; +} + +static void flash_dev_make_request(struct request_queue *q, struct bio *bio) +{ + struct search *s; + struct closure *cl; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + int cpu, rw = bio_data_dir(bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &d->disk->part0, ios[rw]); + part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + s = search_alloc(bio, d); + cl = &s->cl; + bio = &s->bio.bio; + + trace_bcache_request_start(s, bio); + + if (bio_has_data(bio) && !rw) { + closure_call(&s->op.cl, btree_read_async, NULL, cl); + } else if (bio_has_data(bio) || s->op.skip) { + bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, + &KEY(d->id, bio->bi_sector, 0), + &KEY(d->id, bio_end(bio), 0)); + + s->writeback = true; + s->op.cache_bio = bio; + + closure_call(&s->op.cl, bch_insert_data, NULL, cl); + } else { + /* No data - probably a cache flush */ + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + } + + continue_at(cl, search_free, NULL); +} + +static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} + +static int flash_dev_congested(void *data, int bits) +{ + struct bcache_device *d = data; + struct request_queue *q; + struct cache *ca; + unsigned i; + int ret = 0; + + for_each_cache(ca, d->c, i) { + q = bdev_get_queue(ca->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + + return ret; +} + +void bch_flash_dev_request_init(struct bcache_device *d) +{ + struct gendisk *g = d->disk; + + g->queue->make_request_fn = flash_dev_make_request; + g->queue->backing_dev_info.congested_fn = flash_dev_congested; + d->cache_miss = flash_dev_cache_miss; + d->ioctl = flash_dev_ioctl; +} + +void bch_request_exit(void) +{ +#ifdef CONFIG_CGROUP_BCACHE + cgroup_unload_subsys(&bcache_subsys); +#endif + if (bch_search_cache) + kmem_cache_destroy(bch_search_cache); +} + +int __init bch_request_init(void) +{ + bch_search_cache = KMEM_CACHE(search, 0); + if (!bch_search_cache) + return -ENOMEM; + +#ifdef CONFIG_CGROUP_BCACHE + cgroup_load_subsys(&bcache_subsys); + init_bch_cgroup(&bcache_default_cgroup); + + cgroup_add_cftypes(&bcache_subsys, bch_files); +#endif + return 0; +} diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h new file mode 100644 index 00000000000..254d9ab5707 --- /dev/null +++ b/drivers/md/bcache/request.h @@ -0,0 +1,62 @@ +#ifndef _BCACHE_REQUEST_H_ +#define _BCACHE_REQUEST_H_ + +#include <linux/cgroup.h> + +struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bcache_device *d; + struct task_struct *task; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + unsigned cache_bio_sectors; + + unsigned recoverable:1; + unsigned unaligned_bvec:1; + + unsigned write:1; + unsigned writeback:1; + + /* IO error returned to s->bio */ + short error; + unsigned long start_time; + + /* Anything past op->keys won't get zeroed in do_bio_hook */ + struct btree_op op; +}; + +void bch_cache_read_endio(struct bio *, int); +int bch_get_congested(struct cache_set *); +void bch_insert_data(struct closure *cl); +void bch_btree_insert_async(struct closure *); +void bch_cache_read_endio(struct bio *, int); + +void bch_open_buckets_free(struct cache_set *); +int bch_open_buckets_alloc(struct cache_set *); + +void bch_cached_dev_request_init(struct cached_dev *dc); +void bch_flash_dev_request_init(struct bcache_device *d); + +extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; + +struct bch_cgroup { +#ifdef CONFIG_CGROUP_BCACHE + struct cgroup_subsys_state css; +#endif + /* + * We subtract one from the index into bch_cache_modes[], so that + * default == -1; this makes it so the rest match up with d->cache_mode, + * and we use d->cache_mode if cgrp->cache_mode < 0 + */ + short cache_mode; + bool verify; + struct cache_stat_collector stats; +}; + +struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio); + +#endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c new file mode 100644 index 00000000000..64e679449c2 --- /dev/null +++ b/drivers/md/bcache/stats.c @@ -0,0 +1,246 @@ +/* + * bcache stats code + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "stats.h" +#include "btree.h" +#include "request.h" +#include "sysfs.h" + +/* + * We keep absolute totals of various statistics, and addionally a set of three + * rolling averages. + * + * Every so often, a timer goes off and rescales the rolling averages. + * accounting_rescale[] is how many times the timer has to go off before we + * rescale each set of numbers; that gets us half lives of 5 minutes, one hour, + * and one day. + * + * accounting_delay is how often the timer goes off - 22 times in 5 minutes, + * and accounting_weight is what we use to rescale: + * + * pow(31 / 32, 22) ~= 1/2 + * + * So that we don't have to increment each set of numbers every time we (say) + * get a cache hit, we increment a single atomic_t in acc->collector, and when + * the rescale function runs it resets the atomic counter to 0 and adds its + * old value to each of the exported numbers. + * + * To reduce rounding error, the numbers in struct cache_stats are all + * stored left shifted by 16, and scaled back in the sysfs show() function. + */ + +static const unsigned DAY_RESCALE = 288; +static const unsigned HOUR_RESCALE = 12; +static const unsigned FIVE_MINUTE_RESCALE = 1; +static const unsigned accounting_delay = (HZ * 300) / 22; +static const unsigned accounting_weight = 32; + +/* sysfs reading/writing */ + +read_attribute(cache_hits); +read_attribute(cache_misses); +read_attribute(cache_bypass_hits); +read_attribute(cache_bypass_misses); +read_attribute(cache_hit_ratio); +read_attribute(cache_readaheads); +read_attribute(cache_miss_collisions); +read_attribute(bypassed); + +SHOW(bch_stats) +{ + struct cache_stats *s = + container_of(kobj, struct cache_stats, kobj); +#define var(stat) (s->stat >> 16) + var_print(cache_hits); + var_print(cache_misses); + var_print(cache_bypass_hits); + var_print(cache_bypass_misses); + + sysfs_print(cache_hit_ratio, + DIV_SAFE(var(cache_hits) * 100, + var(cache_hits) + var(cache_misses))); + + var_print(cache_readaheads); + var_print(cache_miss_collisions); + sysfs_hprint(bypassed, var(sectors_bypassed) << 9); +#undef var + return 0; +} + +STORE(bch_stats) +{ + return size; +} + +static void bch_stats_release(struct kobject *k) +{ +} + +static struct attribute *bch_stats_files[] = { + &sysfs_cache_hits, + &sysfs_cache_misses, + &sysfs_cache_bypass_hits, + &sysfs_cache_bypass_misses, + &sysfs_cache_hit_ratio, + &sysfs_cache_readaheads, + &sysfs_cache_miss_collisions, + &sysfs_bypassed, + NULL +}; +static KTYPE(bch_stats); + +static void scale_accounting(unsigned long data); + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent) +{ + kobject_init(&acc->total.kobj, &bch_stats_ktype); + kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); + kobject_init(&acc->hour.kobj, &bch_stats_ktype); + kobject_init(&acc->day.kobj, &bch_stats_ktype); + + closure_init(&acc->cl, parent); + init_timer(&acc->timer); + acc->timer.expires = jiffies + accounting_delay; + acc->timer.data = (unsigned long) acc; + acc->timer.function = scale_accounting; + add_timer(&acc->timer); +} + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent) +{ + int ret = kobject_add(&acc->total.kobj, parent, + "stats_total"); + ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, + "stats_five_minute"); + ret = ret ?: kobject_add(&acc->hour.kobj, parent, + "stats_hour"); + ret = ret ?: kobject_add(&acc->day.kobj, parent, + "stats_day"); + return ret; +} + +void bch_cache_accounting_clear(struct cache_accounting *acc) +{ + memset(&acc->total.cache_hits, + 0, + sizeof(unsigned long) * 7); +} + +void bch_cache_accounting_destroy(struct cache_accounting *acc) +{ + kobject_put(&acc->total.kobj); + kobject_put(&acc->five_minute.kobj); + kobject_put(&acc->hour.kobj); + kobject_put(&acc->day.kobj); + + atomic_set(&acc->closing, 1); + if (del_timer_sync(&acc->timer)) + closure_return(&acc->cl); +} + +/* EWMA scaling */ + +static void scale_stat(unsigned long *stat) +{ + *stat = ewma_add(*stat, 0, accounting_weight, 0); +} + +static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) +{ + if (++stats->rescale == rescale_at) { + stats->rescale = 0; + scale_stat(&stats->cache_hits); + scale_stat(&stats->cache_misses); + scale_stat(&stats->cache_bypass_hits); + scale_stat(&stats->cache_bypass_misses); + scale_stat(&stats->cache_readaheads); + scale_stat(&stats->cache_miss_collisions); + scale_stat(&stats->sectors_bypassed); + } +} + +static void scale_accounting(unsigned long data) +{ + struct cache_accounting *acc = (struct cache_accounting *) data; + +#define move_stat(name) do { \ + unsigned t = atomic_xchg(&acc->collector.name, 0); \ + t <<= 16; \ + acc->five_minute.name += t; \ + acc->hour.name += t; \ + acc->day.name += t; \ + acc->total.name += t; \ +} while (0) + + move_stat(cache_hits); + move_stat(cache_misses); + move_stat(cache_bypass_hits); + move_stat(cache_bypass_misses); + move_stat(cache_readaheads); + move_stat(cache_miss_collisions); + move_stat(sectors_bypassed); + + scale_stats(&acc->total, 0); + scale_stats(&acc->day, DAY_RESCALE); + scale_stats(&acc->hour, HOUR_RESCALE); + scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE); + + acc->timer.expires += accounting_delay; + + if (!atomic_read(&acc->closing)) + add_timer(&acc->timer); + else + closure_return(&acc->cl); +} + +static void mark_cache_stats(struct cache_stat_collector *stats, + bool hit, bool bypass) +{ + if (!bypass) + if (hit) + atomic_inc(&stats->cache_hits); + else + atomic_inc(&stats->cache_misses); + else + if (hit) + atomic_inc(&stats->cache_bypass_hits); + else + atomic_inc(&stats->cache_bypass_misses); +} + +void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + mark_cache_stats(&dc->accounting.collector, hit, bypass); + mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); +#ifdef CONFIG_CGROUP_BCACHE + mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); +#endif +} + +void bch_mark_cache_readahead(struct search *s) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_readaheads); + atomic_inc(&s->op.c->accounting.collector.cache_readaheads); +} + +void bch_mark_cache_miss_collision(struct search *s) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_miss_collisions); + atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); +} + +void bch_mark_sectors_bypassed(struct search *s, int sectors) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); + atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); +} diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h new file mode 100644 index 00000000000..c7c7a8fd29f --- /dev/null +++ b/drivers/md/bcache/stats.h @@ -0,0 +1,58 @@ +#ifndef _BCACHE_STATS_H_ +#define _BCACHE_STATS_H_ + +struct cache_stat_collector { + atomic_t cache_hits; + atomic_t cache_misses; + atomic_t cache_bypass_hits; + atomic_t cache_bypass_misses; + atomic_t cache_readaheads; + atomic_t cache_miss_collisions; + atomic_t sectors_bypassed; +}; + +struct cache_stats { + struct kobject kobj; + + unsigned long cache_hits; + unsigned long cache_misses; + unsigned long cache_bypass_hits; + unsigned long cache_bypass_misses; + unsigned long cache_readaheads; + unsigned long cache_miss_collisions; + unsigned long sectors_bypassed; + + unsigned rescale; +}; + +struct cache_accounting { + struct closure cl; + struct timer_list timer; + atomic_t closing; + + struct cache_stat_collector collector; + + struct cache_stats total; + struct cache_stats five_minute; + struct cache_stats hour; + struct cache_stats day; +}; + +struct search; + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent); + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent); + +void bch_cache_accounting_clear(struct cache_accounting *acc); + +void bch_cache_accounting_destroy(struct cache_accounting *acc); + +void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); +void bch_mark_cache_readahead(struct search *s); +void bch_mark_cache_miss_collision(struct search *s); +void bch_mark_sectors_bypassed(struct search *s, int sectors); + +#endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c new file mode 100644 index 00000000000..c8046bc4aa5 --- /dev/null +++ b/drivers/md/bcache/super.c @@ -0,0 +1,1987 @@ +/* + * bcache setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include <linux/buffer_head.h> +#include <linux/debugfs.h> +#include <linux/genhd.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/reboot.h> +#include <linux/sysfs.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); + +static const char bcache_magic[] = { + 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 +}; + +static const char invalid_uuid[] = { + 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78, + 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99 +}; + +/* Default is -1; we skip past it for struct cached_dev's cache mode */ +const char * const bch_cache_modes[] = { + "default", + "writethrough", + "writeback", + "writearound", + "none", + NULL +}; + +struct uuid_entry_v0 { + uint8_t uuid[16]; + uint8_t label[32]; + uint32_t first_reg; + uint32_t last_reg; + uint32_t invalidated; + uint32_t pad; +}; + +static struct kobject *bcache_kobj; +struct mutex bch_register_lock; +LIST_HEAD(bch_cache_sets); +static LIST_HEAD(uncached_devices); + +static int bcache_major, bcache_minor; +static wait_queue_head_t unregister_wait; +struct workqueue_struct *bcache_wq; + +#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) + +static void bio_split_pool_free(struct bio_split_pool *p) +{ + if (p->bio_split_hook) + mempool_destroy(p->bio_split_hook); + + if (p->bio_split) + bioset_free(p->bio_split); +} + +static int bio_split_pool_init(struct bio_split_pool *p) +{ + p->bio_split = bioset_create(4, 0); + if (!p->bio_split) + return -ENOMEM; + + p->bio_split_hook = mempool_create_kmalloc_pool(4, + sizeof(struct bio_split_hook)); + if (!p->bio_split_hook) + return -ENOMEM; + + return 0; +} + +/* Superblock */ + +static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + struct page **res) +{ + const char *err; + struct cache_sb *s; + struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); + unsigned i; + + if (!bh) + return "IO error"; + + s = (struct cache_sb *) bh->b_data; + + sb->offset = le64_to_cpu(s->offset); + sb->version = le64_to_cpu(s->version); + + memcpy(sb->magic, s->magic, 16); + memcpy(sb->uuid, s->uuid, 16); + memcpy(sb->set_uuid, s->set_uuid, 16); + memcpy(sb->label, s->label, SB_LABEL_SIZE); + + sb->flags = le64_to_cpu(s->flags); + sb->seq = le64_to_cpu(s->seq); + sb->last_mount = le32_to_cpu(s->last_mount); + sb->first_bucket = le16_to_cpu(s->first_bucket); + sb->keys = le16_to_cpu(s->keys); + + for (i = 0; i < SB_JOURNAL_BUCKETS; i++) + sb->d[i] = le64_to_cpu(s->d[i]); + + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", + sb->version, sb->flags, sb->seq, sb->keys); + + err = "Not a bcache superblock"; + if (sb->offset != SB_SECTOR) + goto err; + + if (memcmp(sb->magic, bcache_magic, 16)) + goto err; + + err = "Too many journal buckets"; + if (sb->keys > SB_JOURNAL_BUCKETS) + goto err; + + err = "Bad checksum"; + if (s->csum != csum_set(s)) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->uuid, 16)) + goto err; + + sb->block_size = le16_to_cpu(s->block_size); + + err = "Superblock block size smaller than device block size"; + if (sb->block_size << 9 < bdev_logical_block_size(bdev)) + goto err; + + switch (sb->version) { + case BCACHE_SB_VERSION_BDEV: + sb->data_offset = BDEV_DATA_START_DEFAULT; + break; + case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + sb->data_offset = le64_to_cpu(s->data_offset); + + err = "Bad data offset"; + if (sb->data_offset < BDEV_DATA_START_DEFAULT) + goto err; + + break; + case BCACHE_SB_VERSION_CDEV: + case BCACHE_SB_VERSION_CDEV_WITH_UUID: + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->block_size = le16_to_cpu(s->block_size); + sb->bucket_size = le16_to_cpu(s->bucket_size); + + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); + + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; + + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; + + err = "Bad block/bucket size"; + if (!is_power_of_2(sb->block_size) || + sb->block_size > PAGE_SECTORS || + !is_power_of_2(sb->bucket_size) || + sb->bucket_size < PAGE_SECTORS) + goto err; + + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; + + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; + + break; + default: + err = "Unsupported superblock version"; + goto err; + } + + sb->last_mount = get_seconds(); + err = NULL; + + get_page(bh->b_page); + *res = bh->b_page; +err: + put_bh(bh); + return err; +} + +static void write_bdev_super_endio(struct bio *bio, int error) +{ + struct cached_dev *dc = bio->bi_private; + /* XXX: error checking */ + + closure_put(&dc->sb_write.cl); +} + +static void __write_super(struct cache_sb *sb, struct bio *bio) +{ + struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); + unsigned i; + + bio->bi_sector = SB_SECTOR; + bio->bi_rw = REQ_SYNC|REQ_META; + bio->bi_size = SB_SIZE; + bch_bio_map(bio, NULL); + + out->offset = cpu_to_le64(sb->offset); + out->version = cpu_to_le64(sb->version); + + memcpy(out->uuid, sb->uuid, 16); + memcpy(out->set_uuid, sb->set_uuid, 16); + memcpy(out->label, sb->label, SB_LABEL_SIZE); + + out->flags = cpu_to_le64(sb->flags); + out->seq = cpu_to_le64(sb->seq); + + out->last_mount = cpu_to_le32(sb->last_mount); + out->first_bucket = cpu_to_le16(sb->first_bucket); + out->keys = cpu_to_le16(sb->keys); + + for (i = 0; i < sb->keys; i++) + out->d[i] = cpu_to_le64(sb->d[i]); + + out->csum = csum_set(out); + + pr_debug("ver %llu, flags %llu, seq %llu", + sb->version, sb->flags, sb->seq); + + submit_bio(REQ_WRITE, bio); +} + +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) +{ + struct closure *cl = &dc->sb_write.cl; + struct bio *bio = &dc->sb_bio; + + closure_lock(&dc->sb_write, parent); + + bio_reset(bio); + bio->bi_bdev = dc->bdev; + bio->bi_end_io = write_bdev_super_endio; + bio->bi_private = dc; + + closure_get(cl); + __write_super(&dc->sb, bio); + + closure_return(cl); +} + +static void write_super_endio(struct bio *bio, int error) +{ + struct cache *ca = bio->bi_private; + + bch_count_io_errors(ca, error, "writing superblock"); + closure_put(&ca->set->sb_write.cl); +} + +void bcache_write_super(struct cache_set *c) +{ + struct closure *cl = &c->sb_write.cl; + struct cache *ca; + unsigned i; + + closure_lock(&c->sb_write, &c->cl); + + c->sb.seq++; + + for_each_cache(ca, c, i) { + struct bio *bio = &ca->sb_bio; + + ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + ca->sb.seq = c->sb.seq; + ca->sb.last_mount = c->sb.last_mount; + + SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); + + bio_reset(bio); + bio->bi_bdev = ca->bdev; + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + + closure_get(cl); + __write_super(&ca->sb, bio); + } + + closure_return(cl); +} + +/* UUID io */ + +static void uuid_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); + + cache_set_err_on(error, c, "accessing uuids"); + bch_bbio_free(bio, c); + closure_put(cl); +} + +static void uuid_io(struct cache_set *c, unsigned long rw, + struct bkey *k, struct closure *parent) +{ + struct closure *cl = &c->uuid_write.cl; + struct uuid_entry *u; + unsigned i; + + BUG_ON(!parent); + closure_lock(&c->uuid_write, parent); + + for (i = 0; i < KEY_PTRS(k); i++) { + struct bio *bio = bch_bbio_alloc(c); + + bio->bi_rw = REQ_SYNC|REQ_META|rw; + bio->bi_size = KEY_SIZE(k) << 9; + + bio->bi_end_io = uuid_endio; + bio->bi_private = cl; + bch_bio_map(bio, c->uuids); + + bch_submit_bbio(bio, c, k, i); + + if (!(rw & WRITE)) + break; + } + + pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", + pkey(&c->uuid_bucket)); + + for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) + if (!bch_is_zero(u->uuid, 16)) + pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u", + u - c->uuids, u->uuid, u->label, + u->first_reg, u->last_reg, u->invalidated); + + closure_return(cl); +} + +static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) +{ + struct bkey *k = &j->uuid_bucket; + + if (__bch_ptr_invalid(c, 1, k)) + return "bad uuid pointer"; + + bkey_copy(&c->uuid_bucket, k); + uuid_io(c, READ_SYNC, k, cl); + + if (j->version < BCACHE_JSET_VERSION_UUIDv1) { + struct uuid_entry_v0 *u0 = (void *) c->uuids; + struct uuid_entry *u1 = (void *) c->uuids; + int i; + + closure_sync(cl); + + /* + * Since the new uuid entry is bigger than the old, we have to + * convert starting at the highest memory address and work down + * in order to do it in place + */ + + for (i = c->nr_uuids - 1; + i >= 0; + --i) { + memcpy(u1[i].uuid, u0[i].uuid, 16); + memcpy(u1[i].label, u0[i].label, 32); + + u1[i].first_reg = u0[i].first_reg; + u1[i].last_reg = u0[i].last_reg; + u1[i].invalidated = u0[i].invalidated; + + u1[i].flags = 0; + u1[i].sectors = 0; + } + } + + return NULL; +} + +static int __uuid_write(struct cache_set *c) +{ + BKEY_PADDED(key) k; + struct closure cl; + closure_init_stack(&cl); + + lockdep_assert_held(&bch_register_lock); + + if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) + return 1; + + SET_KEY_SIZE(&k.key, c->sb.bucket_size); + uuid_io(c, REQ_WRITE, &k.key, &cl); + closure_sync(&cl); + + bkey_copy(&c->uuid_bucket, &k.key); + __bkey_put(c, &k.key); + return 0; +} + +int bch_uuid_write(struct cache_set *c) +{ + int ret = __uuid_write(c); + + if (!ret) + bch_journal_meta(c, NULL); + + return ret; +} + +static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) +{ + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids; u++) + if (!memcmp(u->uuid, uuid, 16)) + return u; + + return NULL; +} + +static struct uuid_entry *uuid_find_empty(struct cache_set *c) +{ + static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + return uuid_find(c, zero_uuid); +} + +/* + * Bucket priorities/gens: + * + * For each bucket, we store on disk its + * 8 bit gen + * 16 bit priority + * + * See alloc.c for an explanation of the gen. The priority is used to implement + * lru (and in the future other) cache replacement policies; for most purposes + * it's just an opaque integer. + * + * The gens and the priorities don't have a whole lot to do with each other, and + * it's actually the gens that must be written out at specific times - it's no + * big deal if the priorities don't get written, if we lose them we just reuse + * buckets in suboptimal order. + * + * On disk they're stored in a packed array, and in as many buckets are required + * to fit them all. The buckets we use to store them form a list; the journal + * header points to the first bucket, the first bucket points to the second + * bucket, et cetera. + * + * This code is used by the allocation code; periodically (whenever it runs out + * of buckets to allocate from) the allocation code will invalidate some + * buckets, but it can't use those buckets until their new gens are safely on + * disk. + */ + +static void prio_endio(struct bio *bio, int error) +{ + struct cache *ca = bio->bi_private; + + cache_set_err_on(error, ca->set, "accessing priorities"); + bch_bbio_free(bio, ca->set); + closure_put(&ca->prio); +} + +static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) +{ + struct closure *cl = &ca->prio; + struct bio *bio = bch_bbio_alloc(ca->set); + + closure_init_stack(cl); + + bio->bi_sector = bucket * ca->sb.bucket_size; + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_SYNC|REQ_META|rw; + bio->bi_size = bucket_bytes(ca); + + bio->bi_end_io = prio_endio; + bio->bi_private = ca; + bch_bio_map(bio, ca->disk_buckets); + + closure_bio_submit(bio, &ca->prio, ca); + closure_sync(cl); +} + +#define buckets_free(c) "free %zu, free_inc %zu, unused %zu", \ + fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused) + +void bch_prio_write(struct cache *ca) +{ + int i; + struct bucket *b; + struct closure cl; + + closure_init_stack(&cl); + + lockdep_assert_held(&ca->set->bucket_lock); + + for (b = ca->buckets; + b < ca->buckets + ca->sb.nbuckets; b++) + b->disk_gen = b->gen; + + ca->disk_buckets->seq++; + + atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), + &ca->meta_sectors_written); + + pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), + fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + blktrace_msg(ca, "Starting priorities: " buckets_free(ca)); + + for (i = prio_buckets(ca) - 1; i >= 0; --i) { + long bucket; + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data; + struct bucket_disk *end = d + prios_per_bucket(ca); + + for (b = ca->buckets + i * prios_per_bucket(ca); + b < ca->buckets + ca->sb.nbuckets && d < end; + b++, d++) { + d->prio = cpu_to_le16(b->prio); + d->gen = b->gen; + } + + p->next_bucket = ca->prio_buckets[i + 1]; + p->magic = pset_magic(ca); + p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); + + bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); + BUG_ON(bucket == -1); + + mutex_unlock(&ca->set->bucket_lock); + prio_io(ca, bucket, REQ_WRITE); + mutex_lock(&ca->set->bucket_lock); + + ca->prio_buckets[i] = bucket; + atomic_dec_bug(&ca->buckets[bucket].pin); + } + + mutex_unlock(&ca->set->bucket_lock); + + bch_journal_meta(ca->set, &cl); + closure_sync(&cl); + + mutex_lock(&ca->set->bucket_lock); + + ca->need_save_prio = 0; + + /* + * Don't want the old priorities to get garbage collected until after we + * finish writing the new ones, and they're journalled + */ + for (i = 0; i < prio_buckets(ca); i++) + ca->prio_last_buckets[i] = ca->prio_buckets[i]; +} + +static void prio_read(struct cache *ca, uint64_t bucket) +{ + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; + struct bucket *b; + unsigned bucket_nr = 0; + + for (b = ca->buckets; + b < ca->buckets + ca->sb.nbuckets; + b++, d++) { + if (d == end) { + ca->prio_buckets[bucket_nr] = bucket; + ca->prio_last_buckets[bucket_nr] = bucket; + bucket_nr++; + + prio_io(ca, bucket, READ_SYNC); + + if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) + pr_warn("bad csum reading priorities"); + + if (p->magic != pset_magic(ca)) + pr_warn("bad magic reading priorities"); + + bucket = p->next_bucket; + d = p->data; + } + + b->prio = le16_to_cpu(d->prio); + b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; + } +} + +/* Bcache device */ + +static int open_dev(struct block_device *b, fmode_t mode) +{ + struct bcache_device *d = b->bd_disk->private_data; + if (atomic_read(&d->closing)) + return -ENXIO; + + closure_get(&d->cl); + return 0; +} + +static int release_dev(struct gendisk *b, fmode_t mode) +{ + struct bcache_device *d = b->private_data; + closure_put(&d->cl); + return 0; +} + +static int ioctl_dev(struct block_device *b, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct bcache_device *d = b->bd_disk->private_data; + return d->ioctl(d, mode, cmd, arg); +} + +static const struct block_device_operations bcache_ops = { + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +void bcache_device_stop(struct bcache_device *d) +{ + if (!atomic_xchg(&d->closing, 1)) + closure_queue(&d->cl); +} + +static void bcache_device_unlink(struct bcache_device *d) +{ + unsigned i; + struct cache *ca; + + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); +} + +static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + const char *name) +{ + unsigned i; + struct cache *ca; + + for_each_cache(ca, d->c, i) + bd_link_disk_holder(ca->bdev, d->disk); + + snprintf(d->name, BCACHEDEVNAME_SIZE, + "%s%u", name, d->id); + + WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || + sysfs_create_link(&c->kobj, &d->kobj, d->name), + "Couldn't create device <-> cache set symlinks"); +} + +static void bcache_device_detach(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + if (atomic_read(&d->detaching)) { + struct uuid_entry *u = d->c->uuids + d->id; + + SET_UUID_FLASH_ONLY(u, 0); + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32(get_seconds()); + bch_uuid_write(d->c); + + atomic_set(&d->detaching, 0); + } + + bcache_device_unlink(d); + + d->c->devices[d->id] = NULL; + closure_put(&d->c->caching); + d->c = NULL; +} + +static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, + unsigned id) +{ + BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags)); + + d->id = id; + d->c = c; + c->devices[id] = d; + + closure_get(&c->caching); +} + +static void bcache_device_free(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + pr_info("%s stopped", d->disk->disk_name); + + if (d->c) + bcache_device_detach(d); + + if (d->disk) + del_gendisk(d->disk); + if (d->disk && d->disk->queue) + blk_cleanup_queue(d->disk->queue); + if (d->disk) + put_disk(d->disk); + + bio_split_pool_free(&d->bio_split_hook); + if (d->unaligned_bvec) + mempool_destroy(d->unaligned_bvec); + if (d->bio_split) + bioset_free(d->bio_split); + + closure_debug_destroy(&d->cl); +} + +static int bcache_device_init(struct bcache_device *d, unsigned block_size) +{ + struct request_queue *q; + + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, + sizeof(struct bio_vec) * BIO_MAX_PAGES)) || + bio_split_pool_init(&d->bio_split_hook)) + + return -ENOMEM; + + d->disk = alloc_disk(1); + if (!d->disk) + return -ENOMEM; + + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); + + d->disk->major = bcache_major; + d->disk->first_minor = bcache_minor++; + d->disk->fops = &bcache_ops; + d->disk->private_data = d; + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return -ENOMEM; + + blk_queue_make_request(q, NULL); + d->disk->queue = q; + q->queuedata = d; + q->backing_dev_info.congested_data = d; + q->limits.max_hw_sectors = UINT_MAX; + q->limits.max_sectors = UINT_MAX; + q->limits.max_segment_size = UINT_MAX; + q->limits.max_segments = BIO_MAX_PAGES; + q->limits.max_discard_sectors = UINT_MAX; + q->limits.io_min = block_size; + q->limits.logical_block_size = block_size; + q->limits.physical_block_size = block_size; + set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); + set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + + return 0; +} + +/* Cached device */ + +static void calc_cached_dev_sectors(struct cache_set *c) +{ + uint64_t sectors = 0; + struct cached_dev *dc; + + list_for_each_entry(dc, &c->cached_devs, list) + sectors += bdev_sectors(dc->bdev); + + c->cached_dev_sectors = sectors; +} + +void bch_cached_dev_run(struct cached_dev *dc) +{ + struct bcache_device *d = &dc->disk; + + if (atomic_xchg(&dc->running, 1)) + return; + + if (!d->c && + BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { + struct closure cl; + closure_init_stack(&cl); + + SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE); + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } + + add_disk(d->disk); + bd_link_disk_holder(dc->bdev, dc->disk.disk); +#if 0 + char *env[] = { "SYMLINK=label" , NULL }; + kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); +#endif + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + pr_debug("error creating sysfs link"); +} + +static void cached_dev_detach_finish(struct work_struct *w) +{ + struct cached_dev *dc = container_of(w, struct cached_dev, detach); + char buf[BDEVNAME_SIZE]; + struct closure cl; + closure_init_stack(&cl); + + BUG_ON(!atomic_read(&dc->disk.detaching)); + BUG_ON(atomic_read(&dc->count)); + + mutex_lock(&bch_register_lock); + + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + + bcache_device_detach(&dc->disk); + list_move(&dc->list, &uncached_devices); + + mutex_unlock(&bch_register_lock); + + pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); + + /* Drop ref we took in cached_dev_detach() */ + closure_put(&dc->disk.cl); +} + +void bch_cached_dev_detach(struct cached_dev *dc) +{ + lockdep_assert_held(&bch_register_lock); + + if (atomic_read(&dc->disk.closing)) + return; + + if (atomic_xchg(&dc->disk.detaching, 1)) + return; + + /* + * Block the device from being closed and freed until we're finished + * detaching + */ + closure_get(&dc->disk.cl); + + bch_writeback_queue(dc); + cached_dev_put(dc); +} + +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) +{ + uint32_t rtime = cpu_to_le32(get_seconds()); + struct uuid_entry *u; + char buf[BDEVNAME_SIZE]; + + bdevname(dc->bdev, buf); + + if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)) + return -ENOENT; + + if (dc->disk.c) { + pr_err("Can't attach %s: already attached", buf); + return -EINVAL; + } + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) { + pr_err("Can't attach %s: shutting down", buf); + return -EINVAL; + } + + if (dc->sb.block_size < c->sb.block_size) { + /* Will die */ + pr_err("Couldn't attach %s: block size less than set's block size", + buf); + return -EINVAL; + } + + u = uuid_find(c, dc->sb.uuid); + + if (u && + (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || + BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32(get_seconds()); + u = NULL; + } + + if (!u) { + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + pr_err("Couldn't find uuid for %s in set", buf); + return -ENOENT; + } + + u = uuid_find_empty(c); + if (!u) { + pr_err("Not caching %s, no room for UUID", buf); + return -EINVAL; + } + } + + /* Deadlocks since we're called via sysfs... + sysfs_remove_file(&dc->kobj, &sysfs_attach); + */ + + if (bch_is_zero(u->uuid, 16)) { + struct closure cl; + closure_init_stack(&cl); + + memcpy(u->uuid, dc->sb.uuid, 16); + memcpy(u->label, dc->sb.label, SB_LABEL_SIZE); + u->first_reg = u->last_reg = rtime; + bch_uuid_write(c); + + memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } else { + u->last_reg = rtime; + bch_uuid_write(c); + } + + bcache_device_attach(&dc->disk, c, u - c->uuids); + list_move(&dc->list, &c->cached_devs); + calc_cached_dev_sectors(c); + + smp_wmb(); + /* + * dc->c must be set before dc->count != 0 - paired with the mb in + * cached_dev_get() + */ + atomic_set(&dc->count, 1); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + atomic_set(&dc->has_dirty, 1); + atomic_inc(&dc->count); + bch_writeback_queue(dc); + } + + bch_cached_dev_run(dc); + bcache_device_link(&dc->disk, c, "bdev"); + + pr_info("Caching %s as %s on set %pU", + bdevname(dc->bdev, buf), dc->disk.disk->disk_name, + dc->disk.c->sb.set_uuid); + return 0; +} + +void bch_cached_dev_release(struct kobject *kobj) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + kfree(dc); + module_put(THIS_MODULE); +} + +static void cached_dev_free(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + + cancel_delayed_work_sync(&dc->writeback_rate_update); + + mutex_lock(&bch_register_lock); + + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + bcache_device_free(&dc->disk); + list_del(&dc->list); + + mutex_unlock(&bch_register_lock); + + if (!IS_ERR_OR_NULL(dc->bdev)) { + blk_sync_queue(bdev_get_queue(dc->bdev)); + blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + } + + wake_up(&unregister_wait); + + kobject_put(&dc->disk.kobj); +} + +static void cached_dev_flush(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + struct bcache_device *d = &dc->disk; + + bch_cache_accounting_destroy(&dc->accounting); + kobject_del(&d->kobj); + + continue_at(cl, cached_dev_free, system_wq); +} + +static int cached_dev_init(struct cached_dev *dc, unsigned block_size) +{ + int err; + struct io *io; + + closure_init(&dc->disk.cl, NULL); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + + __module_get(THIS_MODULE); + INIT_LIST_HEAD(&dc->list); + kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); + + bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); + + err = bcache_device_init(&dc->disk, block_size); + if (err) + goto err; + + spin_lock_init(&dc->io_lock); + closure_init_unlocked(&dc->sb_write); + INIT_WORK(&dc->detach, cached_dev_detach_finish); + + dc->sequential_merge = true; + dc->sequential_cutoff = 4 << 20; + + INIT_LIST_HEAD(&dc->io_lru); + dc->sb_bio.bi_max_vecs = 1; + dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; + + for (io = dc->io; io < dc->io + RECENT_IO; io++) { + list_add(&io->lru, &dc->io_lru); + hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); + } + + bch_writeback_init_cached_dev(dc); + return 0; +err: + bcache_device_stop(&dc->disk); + return err; +} + +/* Cached device - bcache superblock */ + +static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, + struct block_device *bdev, + struct cached_dev *dc) +{ + char name[BDEVNAME_SIZE]; + const char *err = "cannot allocate memory"; + struct gendisk *g; + struct cache_set *c; + + if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0) + return err; + + memcpy(&dc->sb, sb, sizeof(struct cache_sb)); + dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + dc->bdev = bdev; + dc->bdev->bd_holder = dc; + + g = dc->disk.disk; + + set_capacity(g, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + + g->queue->backing_dev_info.ra_pages = + max(g->queue->backing_dev_info.ra_pages, + bdev->bd_queue->backing_dev_info.ra_pages); + + bch_cached_dev_request_init(dc); + + err = "error creating kobject"; + if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, + "bcache")) + goto err; + if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) + goto err; + + list_add(&dc->list, &uncached_devices); + list_for_each_entry(c, &bch_cache_sets, list) + bch_cached_dev_attach(dc, c); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) + bch_cached_dev_run(dc); + + return NULL; +err: + kobject_put(&dc->disk.kobj); + pr_notice("error opening %s: %s", bdevname(bdev, name), err); + /* + * Return NULL instead of an error because kobject_put() cleans + * everything up + */ + return NULL; +} + +/* Flash only volumes */ + +void bch_flash_dev_release(struct kobject *kobj) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + kfree(d); +} + +static void flash_dev_free(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + bcache_device_free(d); + kobject_put(&d->kobj); +} + +static void flash_dev_flush(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + + bcache_device_unlink(d); + kobject_del(&d->kobj); + continue_at(cl, flash_dev_free, system_wq); +} + +static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) +{ + struct bcache_device *d = kzalloc(sizeof(struct bcache_device), + GFP_KERNEL); + if (!d) + return -ENOMEM; + + closure_init(&d->cl, NULL); + set_closure_fn(&d->cl, flash_dev_flush, system_wq); + + kobject_init(&d->kobj, &bch_flash_dev_ktype); + + if (bcache_device_init(d, block_bytes(c))) + goto err; + + bcache_device_attach(d, c, u - c->uuids); + set_capacity(d->disk, u->sectors); + bch_flash_dev_request_init(d); + add_disk(d->disk); + + if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) + goto err; + + bcache_device_link(d, c, "volume"); + + return 0; +err: + kobject_put(&d->kobj); + return -ENOMEM; +} + +static int flash_devs_run(struct cache_set *c) +{ + int ret = 0; + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids && !ret; + u++) + if (UUID_FLASH_ONLY(u)) + ret = flash_dev_run(c, u); + + return ret; +} + +int bch_flash_dev_create(struct cache_set *c, uint64_t size) +{ + struct uuid_entry *u; + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) + return -EINTR; + + u = uuid_find_empty(c); + if (!u) { + pr_err("Can't create volume, no room for UUID"); + return -EINVAL; + } + + get_random_bytes(u->uuid, 16); + memset(u->label, 0, 32); + u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); + + SET_UUID_FLASH_ONLY(u, 1); + u->sectors = size >> 9; + + bch_uuid_write(c); + + return flash_dev_run(c, u); +} + +/* Cache set */ + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) +{ + va_list args; + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) + return false; + + /* XXX: we can be called from atomic context + acquire_console_sem(); + */ + + printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + printk(", disabling caching\n"); + + bch_cache_set_unregister(c); + return true; +} + +void bch_cache_set_release(struct kobject *kobj) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + kfree(c); + module_put(THIS_MODULE); +} + +static void cache_set_free(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, cl); + struct cache *ca; + unsigned i; + + if (!IS_ERR_OR_NULL(c->debug)) + debugfs_remove(c->debug); + + bch_open_buckets_free(c); + bch_btree_cache_free(c); + bch_journal_free(c); + + for_each_cache(ca, c, i) + if (ca) + kobject_put(&ca->kobj); + + free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); + + kfree(c->fill_iter); + if (c->bio_split) + bioset_free(c->bio_split); + if (c->bio_meta) + mempool_destroy(c->bio_meta); + if (c->search) + mempool_destroy(c->search); + kfree(c->devices); + + mutex_lock(&bch_register_lock); + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + pr_info("Cache set %pU unregistered", c->sb.set_uuid); + wake_up(&unregister_wait); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void cache_set_flush(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct btree *b; + + /* Shut down allocator threads */ + set_bit(CACHE_SET_STOPPING_2, &c->flags); + wake_up(&c->alloc_wait); + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->internal); + kobject_del(&c->kobj); + + if (!IS_ERR_OR_NULL(c->root)) + list_add(&c->root->list, &c->btree_cache); + + /* Should skip this if we're unregistering because of an error */ + list_for_each_entry(b, &c->btree_cache, list) + if (btree_node_dirty(b)) + bch_btree_write(b, true, NULL); + + closure_return(cl); +} + +static void __cache_set_unregister(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cached_dev *dc, *t; + size_t i; + + mutex_lock(&bch_register_lock); + + if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + bch_cached_dev_detach(dc); + + for (i = 0; i < c->nr_uuids; i++) + if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) + bcache_device_stop(c->devices[i]); + + mutex_unlock(&bch_register_lock); + + continue_at(cl, cache_set_flush, system_wq); +} + +void bch_cache_set_stop(struct cache_set *c) +{ + if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + closure_queue(&c->caching); +} + +void bch_cache_set_unregister(struct cache_set *c) +{ + set_bit(CACHE_SET_UNREGISTERING, &c->flags); + bch_cache_set_stop(c); +} + +#define alloc_bucket_pages(gfp, c) \ + ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) + +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) +{ + int iter_size; + struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); + if (!c) + return NULL; + + __module_get(THIS_MODULE); + closure_init(&c->cl, NULL); + set_closure_fn(&c->cl, cache_set_free, system_wq); + + closure_init(&c->caching, &c->cl); + set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + + /* Maybe create continue_at_noreturn() and use it here? */ + closure_set_stopped(&c->cl); + closure_put(&c->cl); + + kobject_init(&c->kobj, &bch_cache_set_ktype); + kobject_init(&c->internal, &bch_cache_set_internal_ktype); + + bch_cache_accounting_init(&c->accounting, &c->cl); + + memcpy(c->sb.set_uuid, sb->set_uuid, 16); + c->sb.block_size = sb->block_size; + c->sb.bucket_size = sb->bucket_size; + c->sb.nr_in_set = sb->nr_in_set; + c->sb.last_mount = sb->last_mount; + c->bucket_bits = ilog2(sb->bucket_size); + c->block_bits = ilog2(sb->block_size); + c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); + + c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; + if (c->btree_pages > BTREE_MAX_PAGES) + c->btree_pages = max_t(int, c->btree_pages / 4, + BTREE_MAX_PAGES); + + init_waitqueue_head(&c->alloc_wait); + mutex_init(&c->bucket_lock); + mutex_init(&c->fill_lock); + mutex_init(&c->sort_lock); + spin_lock_init(&c->sort_time_lock); + closure_init_unlocked(&c->sb_write); + closure_init_unlocked(&c->uuid_write); + spin_lock_init(&c->btree_read_time_lock); + bch_moving_init_cache_set(c); + + INIT_LIST_HEAD(&c->list); + INIT_LIST_HEAD(&c->cached_devs); + INIT_LIST_HEAD(&c->btree_cache); + INIT_LIST_HEAD(&c->btree_cache_freeable); + INIT_LIST_HEAD(&c->btree_cache_freed); + INIT_LIST_HEAD(&c->data_buckets); + + c->search = mempool_create_slab_pool(32, bch_search_cache); + if (!c->search) + goto err; + + iter_size = (sb->bucket_size / sb->block_size + 1) * + sizeof(struct btree_iter_set); + + if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) || + !(c->bio_meta = mempool_create_kmalloc_pool(2, + sizeof(struct bbio) + sizeof(struct bio_vec) * + bucket_pages(c))) || + !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || + !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || + !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || + bch_journal_alloc(c) || + bch_btree_cache_alloc(c) || + bch_open_buckets_alloc(c)) + goto err; + + c->fill_iter->size = sb->bucket_size / sb->block_size; + + c->congested_read_threshold_us = 2000; + c->congested_write_threshold_us = 20000; + c->error_limit = 8 << IO_ERROR_SHIFT; + + return c; +err: + bch_cache_set_unregister(c); + return NULL; +} + +static void run_cache_set(struct cache_set *c) +{ + const char *err = "cannot allocate memory"; + struct cached_dev *dc, *t; + struct cache *ca; + unsigned i; + + struct btree_op op; + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; + + for_each_cache(ca, c, i) + c->nbuckets += ca->sb.nbuckets; + + if (CACHE_SYNC(&c->sb)) { + LIST_HEAD(journal); + struct bkey *k; + struct jset *j; + + err = "cannot allocate memory for journal"; + if (bch_journal_read(c, &journal, &op)) + goto err; + + pr_debug("btree_journal_read() done"); + + err = "no journal entries found"; + if (list_empty(&journal)) + goto err; + + j = &list_entry(journal.prev, struct journal_replay, list)->j; + + err = "IO error reading priorities"; + for_each_cache(ca, c, i) + prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]); + + /* + * If prio_read() fails it'll call cache_set_error and we'll + * tear everything down right away, but if we perhaps checked + * sooner we could avoid journal replay. + */ + + k = &j->btree_root; + + err = "bad btree root"; + if (__bch_ptr_invalid(c, j->btree_level + 1, k)) + goto err; + + err = "error reading btree root"; + c->root = bch_btree_node_get(c, k, j->btree_level, &op); + if (IS_ERR_OR_NULL(c->root)) + goto err; + + list_del_init(&c->root->list); + rw_unlock(true, c->root); + + err = uuid_read(c, j, &op.cl); + if (err) + goto err; + + err = "error in recovery"; + if (bch_btree_check(c, &op)) + goto err; + + bch_journal_mark(c, &journal); + bch_btree_gc_finish(c); + pr_debug("btree_check() done"); + + /* + * bcache_journal_next() can't happen sooner, or + * btree_gc_finish() will give spurious errors about last_gc > + * gc_gen - this is a hack but oh well. + */ + bch_journal_next(&c->journal); + + for_each_cache(ca, c, i) + closure_call(&ca->alloc, bch_allocator_thread, + system_wq, &c->cl); + + /* + * First place it's safe to allocate: btree_check() and + * btree_gc_finish() have to run before we have buckets to + * allocate, and bch_bucket_alloc_set() might cause a journal + * entry to be written so bcache_journal_next() has to be called + * first. + * + * If the uuids were in the old format we have to rewrite them + * before the next journal entry is written: + */ + if (j->version < BCACHE_JSET_VERSION_UUID) + __uuid_write(c); + + bch_journal_replay(c, &journal, &op); + } else { + pr_notice("invalidating existing data"); + /* Don't want invalidate_buckets() to queue a gc yet */ + closure_lock(&c->gc, NULL); + + for_each_cache(ca, c, i) { + unsigned j; + + ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, + 2, SB_JOURNAL_BUCKETS); + + for (j = 0; j < ca->sb.keys; j++) + ca->sb.d[j] = ca->sb.first_bucket + j; + } + + bch_btree_gc_finish(c); + + for_each_cache(ca, c, i) + closure_call(&ca->alloc, bch_allocator_thread, + ca->alloc_workqueue, &c->cl); + + mutex_lock(&c->bucket_lock); + for_each_cache(ca, c, i) + bch_prio_write(ca); + mutex_unlock(&c->bucket_lock); + + wake_up(&c->alloc_wait); + + err = "cannot allocate new UUID bucket"; + if (__uuid_write(c)) + goto err_unlock_gc; + + err = "cannot allocate new btree root"; + c->root = bch_btree_node_alloc(c, 0, &op.cl); + if (IS_ERR_OR_NULL(c->root)) + goto err_unlock_gc; + + bkey_copy_key(&c->root->key, &MAX_KEY); + bch_btree_write(c->root, true, &op); + + bch_btree_set_root(c->root); + rw_unlock(true, c->root); + + /* + * We don't want to write the first journal entry until + * everything is set up - fortunately journal entries won't be + * written until the SET_CACHE_SYNC() here: + */ + SET_CACHE_SYNC(&c->sb, true); + + bch_journal_next(&c->journal); + bch_journal_meta(c, &op.cl); + + /* Unlock */ + closure_set_stopped(&c->gc.cl); + closure_put(&c->gc.cl); + } + + closure_sync(&op.cl); + c->sb.last_mount = get_seconds(); + bcache_write_super(c); + + list_for_each_entry_safe(dc, t, &uncached_devices, list) + bch_cached_dev_attach(dc, c); + + flash_devs_run(c); + + return; +err_unlock_gc: + closure_set_stopped(&c->gc.cl); + closure_put(&c->gc.cl); +err: + closure_sync(&op.cl); + /* XXX: test this, it's broken */ + bch_cache_set_error(c, err); +} + +static bool can_attach_cache(struct cache *ca, struct cache_set *c) +{ + return ca->sb.block_size == c->sb.block_size && + ca->sb.bucket_size == c->sb.block_size && + ca->sb.nr_in_set == c->sb.nr_in_set; +} + +static const char *register_cache_set(struct cache *ca) +{ + char buf[12]; + const char *err = "cannot allocate memory"; + struct cache_set *c; + + list_for_each_entry(c, &bch_cache_sets, list) + if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) { + if (c->cache[ca->sb.nr_this_dev]) + return "duplicate cache set member"; + + if (!can_attach_cache(ca, c)) + return "cache sb does not match set"; + + if (!CACHE_SYNC(&ca->sb)) + SET_CACHE_SYNC(&c->sb, false); + + goto found; + } + + c = bch_cache_set_alloc(&ca->sb); + if (!c) + return err; + + err = "error creating kobject"; + if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) || + kobject_add(&c->internal, &c->kobj, "internal")) + goto err; + + if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) + goto err; + + bch_debug_init_cache_set(c); + + list_add(&c->list, &bch_cache_sets); +found: + sprintf(buf, "cache%i", ca->sb.nr_this_dev); + if (sysfs_create_link(&ca->kobj, &c->kobj, "set") || + sysfs_create_link(&c->kobj, &ca->kobj, buf)) + goto err; + + if (ca->sb.seq > c->sb.seq) { + c->sb.version = ca->sb.version; + memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); + c->sb.flags = ca->sb.flags; + c->sb.seq = ca->sb.seq; + pr_debug("set version = %llu", c->sb.version); + } + + ca->set = c; + ca->set->cache[ca->sb.nr_this_dev] = ca; + c->cache_by_alloc[c->caches_loaded++] = ca; + + if (c->caches_loaded == c->sb.nr_in_set) + run_cache_set(c); + + return NULL; +err: + bch_cache_set_unregister(c); + return err; +} + +/* Cache device */ + +void bch_cache_release(struct kobject *kobj) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + if (ca->set) + ca->set->cache[ca->sb.nr_this_dev] = NULL; + + bch_cache_allocator_exit(ca); + + bio_split_pool_free(&ca->bio_split_hook); + + if (ca->alloc_workqueue) + destroy_workqueue(ca->alloc_workqueue); + + free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + kfree(ca->prio_buckets); + vfree(ca->buckets); + + free_heap(&ca->heap); + free_fifo(&ca->unused); + free_fifo(&ca->free_inc); + free_fifo(&ca->free); + + if (ca->sb_bio.bi_inline_vecs[0].bv_page) + put_page(ca->sb_bio.bi_io_vec[0].bv_page); + + if (!IS_ERR_OR_NULL(ca->bdev)) { + blk_sync_queue(bdev_get_queue(ca->bdev)); + blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + } + + kfree(ca); + module_put(THIS_MODULE); +} + +static int cache_alloc(struct cache_sb *sb, struct cache *ca) +{ + size_t free; + struct bucket *b; + + if (!ca) + return -ENOMEM; + + __module_get(THIS_MODULE); + kobject_init(&ca->kobj, &bch_cache_ktype); + + memcpy(&ca->sb, sb, sizeof(struct cache_sb)); + + INIT_LIST_HEAD(&ca->discards); + + bio_init(&ca->sb_bio); + ca->sb_bio.bi_max_vecs = 1; + ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; + + bio_init(&ca->journal.bio); + ca->journal.bio.bi_max_vecs = 8; + ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; + + free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; + free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); + + if (!init_fifo(&ca->free, free, GFP_KERNEL) || + !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || + !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || + !init_heap(&ca->heap, free << 3, GFP_KERNEL) || + !(ca->buckets = vmalloc(sizeof(struct bucket) * + ca->sb.nbuckets)) || + !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * + 2, GFP_KERNEL)) || + !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || + !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || + bio_split_pool_init(&ca->bio_split_hook)) + goto err; + + ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); + + memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket)); + for_each_bucket(b, ca) + atomic_set(&b->pin, 0); + + if (bch_cache_allocator_init(ca)) + goto err; + + return 0; +err: + kobject_put(&ca->kobj); + return -ENOMEM; +} + +static const char *register_cache(struct cache_sb *sb, struct page *sb_page, + struct block_device *bdev, struct cache *ca) +{ + char name[BDEVNAME_SIZE]; + const char *err = "cannot allocate memory"; + + if (cache_alloc(sb, ca) != 0) + return err; + + ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + ca->bdev = bdev; + ca->bdev->bd_holder = ca; + + if (blk_queue_discard(bdev_get_queue(ca->bdev))) + ca->discard = CACHE_DISCARD(&ca->sb); + + err = "error creating kobject"; + if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) + goto err; + + err = register_cache_set(ca); + if (err) + goto err; + + pr_info("registered cache device %s", bdevname(bdev, name)); + + return NULL; +err: + kobject_put(&ca->kobj); + pr_info("error opening %s: %s", bdevname(bdev, name), err); + /* Return NULL instead of an error because kobject_put() cleans + * everything up + */ + return NULL; +} + +/* Global interfaces/init */ + +static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, + const char *, size_t); + +kobj_attribute_write(register, register_bcache); +kobj_attribute_write(register_quiet, register_bcache); + +static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) +{ + ssize_t ret = size; + const char *err = "cannot allocate memory"; + char *path = NULL; + struct cache_sb *sb = NULL; + struct block_device *bdev = NULL; + struct page *sb_page = NULL; + + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + mutex_lock(&bch_register_lock); + + if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || + !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) + goto err; + + err = "failed to open device"; + bdev = blkdev_get_by_path(strim(path), + FMODE_READ|FMODE_WRITE|FMODE_EXCL, + sb); + if (bdev == ERR_PTR(-EBUSY)) + err = "device busy"; + + if (IS_ERR(bdev) || + set_blocksize(bdev, 4096)) + goto err; + + err = read_super(sb, bdev, &sb_page); + if (err) + goto err_close; + + if (SB_IS_BDEV(sb)) { + struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + + err = register_bdev(sb, sb_page, bdev, dc); + } else { + struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + err = register_cache(sb, sb_page, bdev, ca); + } + + if (err) { + /* register_(bdev|cache) will only return an error if they + * didn't get far enough to create the kobject - if they did, + * the kobject destructor will do this cleanup. + */ + put_page(sb_page); +err_close: + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +err: + if (attr != &ksysfs_register_quiet) + pr_info("error opening %s: %s", path, err); + ret = -EINVAL; + } + + kfree(sb); + kfree(path); + mutex_unlock(&bch_register_lock); + module_put(THIS_MODULE); + return ret; +} + +static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + if (code == SYS_DOWN || + code == SYS_HALT || + code == SYS_POWER_OFF) { + DEFINE_WAIT(wait); + unsigned long start = jiffies; + bool stopped = false; + + struct cache_set *c, *tc; + struct cached_dev *dc, *tdc; + + mutex_lock(&bch_register_lock); + + if (list_empty(&bch_cache_sets) && + list_empty(&uncached_devices)) + goto out; + + pr_info("Stopping all devices:"); + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + + /* What's a condition variable? */ + while (1) { + long timeout = start + 2 * HZ - jiffies; + + stopped = list_empty(&bch_cache_sets) && + list_empty(&uncached_devices); + + if (timeout < 0 || stopped) + break; + + prepare_to_wait(&unregister_wait, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&bch_register_lock); + schedule_timeout(timeout); + mutex_lock(&bch_register_lock); + } + + finish_wait(&unregister_wait, &wait); + + if (stopped) + pr_info("All devices stopped"); + else + pr_notice("Timeout waiting for devices to be closed"); +out: + mutex_unlock(&bch_register_lock); + } + + return NOTIFY_DONE; +} + +static struct notifier_block reboot = { + .notifier_call = bcache_reboot, + .priority = INT_MAX, /* before any real devices */ +}; + +static void bcache_exit(void) +{ + bch_debug_exit(); + bch_writeback_exit(); + bch_request_exit(); + bch_btree_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) + destroy_workqueue(bcache_wq); + unregister_blkdev(bcache_major, "bcache"); + unregister_reboot_notifier(&reboot); +} + +static int __init bcache_init(void) +{ + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, + NULL + }; + + mutex_init(&bch_register_lock); + init_waitqueue_head(&unregister_wait); + register_reboot_notifier(&reboot); + closure_debug_init(); + + bcache_major = register_blkdev(0, "bcache"); + if (bcache_major < 0) + return bcache_major; + + if (!(bcache_wq = create_workqueue("bcache")) || + !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || + sysfs_create_files(bcache_kobj, files) || + bch_btree_init() || + bch_request_init() || + bch_writeback_init() || + bch_debug_init(bcache_kobj)) + goto err; + + return 0; +err: + bcache_exit(); + return -ENOMEM; +} + +module_exit(bcache_exit); +module_init(bcache_init); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c new file mode 100644 index 00000000000..4d9cca47e4c --- /dev/null +++ b/drivers/md/bcache/sysfs.c @@ -0,0 +1,817 @@ +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "sysfs.h" +#include "btree.h" +#include "request.h" + +#include <linux/sort.h> + +static const char * const cache_replacement_policies[] = { + "lru", + "fifo", + "random", + NULL +}; + +write_attribute(attach); +write_attribute(detach); +write_attribute(unregister); +write_attribute(stop); +write_attribute(clear_stats); +write_attribute(trigger_gc); +write_attribute(prune_cache); +write_attribute(flash_vol_create); + +read_attribute(bucket_size); +read_attribute(block_size); +read_attribute(nbuckets); +read_attribute(tree_depth); +read_attribute(root_usage_percent); +read_attribute(priority_stats); +read_attribute(btree_cache_size); +read_attribute(btree_cache_max_chain); +read_attribute(cache_available_percent); +read_attribute(written); +read_attribute(btree_written); +read_attribute(metadata_written); +read_attribute(active_journal_entries); + +sysfs_time_stats_attribute(btree_gc, sec, ms); +sysfs_time_stats_attribute(btree_split, sec, us); +sysfs_time_stats_attribute(btree_sort, ms, us); +sysfs_time_stats_attribute(btree_read, ms, us); +sysfs_time_stats_attribute(try_harder, ms, us); + +read_attribute(btree_nodes); +read_attribute(btree_used_percent); +read_attribute(average_key_size); +read_attribute(dirty_data); +read_attribute(bset_tree_stats); + +read_attribute(state); +read_attribute(cache_read_races); +read_attribute(writeback_keys_done); +read_attribute(writeback_keys_failed); +read_attribute(io_errors); +read_attribute(congested); +rw_attribute(congested_read_threshold_us); +rw_attribute(congested_write_threshold_us); + +rw_attribute(sequential_cutoff); +rw_attribute(sequential_merge); +rw_attribute(data_csum); +rw_attribute(cache_mode); +rw_attribute(writeback_metadata); +rw_attribute(writeback_running); +rw_attribute(writeback_percent); +rw_attribute(writeback_delay); +rw_attribute(writeback_rate); + +rw_attribute(writeback_rate_update_seconds); +rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_d_smooth); +read_attribute(writeback_rate_debug); + +rw_attribute(synchronous); +rw_attribute(journal_delay_ms); +rw_attribute(discard); +rw_attribute(running); +rw_attribute(label); +rw_attribute(readahead); +rw_attribute(io_error_limit); +rw_attribute(io_error_halflife); +rw_attribute(verify); +rw_attribute(key_merging_disabled); +rw_attribute(gc_always_rewrite); +rw_attribute(freelist_percent); +rw_attribute(cache_replacement_policy); +rw_attribute(btree_shrinker_disabled); +rw_attribute(copy_gc_enabled); +rw_attribute(size); + +SHOW(__bch_cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + +#define var(stat) (dc->stat) + + if (attr == &sysfs_cache_mode) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_cache_modes + 1, + BDEV_CACHE_MODE(&dc->sb)); + + sysfs_printf(data_csum, "%i", dc->disk.data_csum); + var_printf(verify, "%i"); + var_printf(writeback_metadata, "%i"); + var_printf(writeback_running, "%i"); + var_print(writeback_delay); + var_print(writeback_percent); + sysfs_print(writeback_rate, dc->writeback_rate.rate); + + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_d_term); + var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_d_smooth); + + if (attr == &sysfs_writeback_rate_debug) { + char dirty[20]; + char derivative[20]; + char target[20]; + bch_hprint(dirty, + atomic_long_read(&dc->disk.sectors_dirty) << 9); + bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(target, dc->writeback_rate_target << 9); + + return sprintf(buf, + "rate:\t\t%u\n" + "change:\t\t%i\n" + "dirty:\t\t%s\n" + "derivative:\t%s\n" + "target:\t\t%s\n", + dc->writeback_rate.rate, + dc->writeback_rate_change, + dirty, derivative, target); + } + + sysfs_hprint(dirty_data, + atomic_long_read(&dc->disk.sectors_dirty) << 9); + + var_printf(sequential_merge, "%i"); + var_hprint(sequential_cutoff); + var_hprint(readahead); + + sysfs_print(running, atomic_read(&dc->running)); + sysfs_print(state, states[BDEV_STATE(&dc->sb)]); + + if (attr == &sysfs_label) { + memcpy(buf, dc->sb.label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE + 1] = '\0'; + strcat(buf, "\n"); + return strlen(buf); + } + +#undef var + return 0; +} +SHOW_LOCKED(bch_cached_dev) + +STORE(__cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + unsigned v = size; + struct cache_set *c; + +#define d_strtoul(var) sysfs_strtoul(var, dc->var) +#define d_strtoi_h(var) sysfs_hatoi(var, dc->var) + + sysfs_strtoul(data_csum, dc->disk.data_csum); + d_strtoul(verify); + d_strtoul(writeback_metadata); + d_strtoul(writeback_running); + d_strtoul(writeback_delay); + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, 1000000); + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); + + d_strtoul(writeback_rate_update_seconds); + d_strtoul(writeback_rate_d_term); + d_strtoul(writeback_rate_p_term_inverse); + sysfs_strtoul_clamp(writeback_rate_p_term_inverse, + dc->writeback_rate_p_term_inverse, 1, INT_MAX); + d_strtoul(writeback_rate_d_smooth); + + d_strtoul(sequential_merge); + d_strtoi_h(sequential_cutoff); + d_strtoi_h(readahead); + + if (attr == &sysfs_clear_stats) + bch_cache_accounting_clear(&dc->accounting); + + if (attr == &sysfs_running && + strtoul_or_return(buf)) + bch_cached_dev_run(dc); + + if (attr == &sysfs_cache_mode) { + ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + + if (v < 0) + return v; + + if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { + SET_BDEV_CACHE_MODE(&dc->sb, v); + bch_write_bdev_super(dc, NULL); + } + } + + if (attr == &sysfs_label) { + memcpy(dc->sb.label, buf, SB_LABEL_SIZE); + bch_write_bdev_super(dc, NULL); + if (dc->disk.c) { + memcpy(dc->disk.c->uuids[dc->disk.id].label, + buf, SB_LABEL_SIZE); + bch_uuid_write(dc->disk.c); + } + } + + if (attr == &sysfs_attach) { + if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) + return -EINVAL; + + list_for_each_entry(c, &bch_cache_sets, list) { + v = bch_cached_dev_attach(dc, c); + if (!v) + return size; + } + + pr_err("Can't attach %s: cache set not found", buf); + size = v; + } + + if (attr == &sysfs_detach && dc->disk.c) + bch_cached_dev_detach(dc); + + if (attr == &sysfs_stop) + bcache_device_stop(&dc->disk); + + return size; +} + +STORE(bch_cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + + mutex_lock(&bch_register_lock); + size = __cached_dev_store(kobj, attr, buf, size); + + if (attr == &sysfs_writeback_running) + bch_writeback_queue(dc); + + if (attr == &sysfs_writeback_percent) + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + mutex_unlock(&bch_register_lock); + return size; +} + +static struct attribute *bch_cached_dev_files[] = { + &sysfs_attach, + &sysfs_detach, + &sysfs_stop, +#if 0 + &sysfs_data_csum, +#endif + &sysfs_cache_mode, + &sysfs_writeback_metadata, + &sysfs_writeback_running, + &sysfs_writeback_delay, + &sysfs_writeback_percent, + &sysfs_writeback_rate, + &sysfs_writeback_rate_update_seconds, + &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_d_smooth, + &sysfs_writeback_rate_debug, + &sysfs_dirty_data, + &sysfs_sequential_cutoff, + &sysfs_sequential_merge, + &sysfs_clear_stats, + &sysfs_running, + &sysfs_state, + &sysfs_label, + &sysfs_readahead, +#ifdef CONFIG_BCACHE_DEBUG + &sysfs_verify, +#endif + NULL +}; +KTYPE(bch_cached_dev); + +SHOW(bch_flash_dev) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + + sysfs_printf(data_csum, "%i", d->data_csum); + sysfs_hprint(size, u->sectors << 9); + + if (attr == &sysfs_label) { + memcpy(buf, u->label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE + 1] = '\0'; + strcat(buf, "\n"); + return strlen(buf); + } + + return 0; +} + +STORE(__bch_flash_dev) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + + sysfs_strtoul(data_csum, d->data_csum); + + if (attr == &sysfs_size) { + uint64_t v; + strtoi_h_or_return(buf, v); + + u->sectors = v >> 9; + bch_uuid_write(d->c); + set_capacity(d->disk, u->sectors); + } + + if (attr == &sysfs_label) { + memcpy(u->label, buf, SB_LABEL_SIZE); + bch_uuid_write(d->c); + } + + if (attr == &sysfs_unregister) { + atomic_set(&d->detaching, 1); + bcache_device_stop(d); + } + + return size; +} +STORE_LOCKED(bch_flash_dev) + +static struct attribute *bch_flash_dev_files[] = { + &sysfs_unregister, +#if 0 + &sysfs_data_csum, +#endif + &sysfs_label, + &sysfs_size, + NULL +}; +KTYPE(bch_flash_dev); + +SHOW(__bch_cache_set) +{ + unsigned root_usage(struct cache_set *c) + { + unsigned bytes = 0; + struct bkey *k; + struct btree *b; + struct btree_iter iter; + + goto lock_root; + + do { + rw_unlock(false, b); +lock_root: + b = c->root; + rw_lock(false, b, b->level); + } while (b != c->root); + + for_each_key_filter(b, k, &iter, bch_ptr_bad) + bytes += bkey_bytes(k); + + rw_unlock(false, b); + + return (bytes * 100) / btree_bytes(c); + } + + size_t cache_size(struct cache_set *c) + { + size_t ret = 0; + struct btree *b; + + mutex_lock(&c->bucket_lock); + list_for_each_entry(b, &c->btree_cache, list) + ret += 1 << (b->page_order + PAGE_SHIFT); + + mutex_unlock(&c->bucket_lock); + return ret; + } + + unsigned cache_max_chain(struct cache_set *c) + { + unsigned ret = 0; + struct hlist_head *h; + + mutex_lock(&c->bucket_lock); + + for (h = c->bucket_hash; + h < c->bucket_hash + (1 << BUCKET_HASH_BITS); + h++) { + unsigned i = 0; + struct hlist_node *p; + + hlist_for_each(p, h) + i++; + + ret = max(ret, i); + } + + mutex_unlock(&c->bucket_lock); + return ret; + } + + unsigned btree_used(struct cache_set *c) + { + return div64_u64(c->gc_stats.key_bytes * 100, + (c->gc_stats.nodes ?: 1) * btree_bytes(c)); + } + + unsigned average_key_size(struct cache_set *c) + { + return c->gc_stats.nkeys + ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; + } + + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + sysfs_print(synchronous, CACHE_SYNC(&c->sb)); + sysfs_print(journal_delay_ms, c->journal_delay_ms); + sysfs_hprint(bucket_size, bucket_bytes(c)); + sysfs_hprint(block_size, block_bytes(c)); + sysfs_print(tree_depth, c->root->level); + sysfs_print(root_usage_percent, root_usage(c)); + + sysfs_hprint(btree_cache_size, cache_size(c)); + sysfs_print(btree_cache_max_chain, cache_max_chain(c)); + sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); + + sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); + sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); + sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); + sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); + sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); + + sysfs_print(btree_used_percent, btree_used(c)); + sysfs_print(btree_nodes, c->gc_stats.nodes); + sysfs_hprint(dirty_data, c->gc_stats.dirty); + sysfs_hprint(average_key_size, average_key_size(c)); + + sysfs_print(cache_read_races, + atomic_long_read(&c->cache_read_races)); + + sysfs_print(writeback_keys_done, + atomic_long_read(&c->writeback_keys_done)); + sysfs_print(writeback_keys_failed, + atomic_long_read(&c->writeback_keys_failed)); + + /* See count_io_errors for why 88 */ + sysfs_print(io_error_halflife, c->error_decay * 88); + sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); + + sysfs_hprint(congested, + ((uint64_t) bch_get_congested(c)) << 9); + sysfs_print(congested_read_threshold_us, + c->congested_read_threshold_us); + sysfs_print(congested_write_threshold_us, + c->congested_write_threshold_us); + + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); + sysfs_printf(verify, "%i", c->verify); + sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); + sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + + if (attr == &sysfs_bset_tree_stats) + return bch_bset_print_stats(c, buf); + + return 0; +} +SHOW_LOCKED(bch_cache_set) + +STORE(__bch_cache_set) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + if (attr == &sysfs_unregister) + bch_cache_set_unregister(c); + + if (attr == &sysfs_stop) + bch_cache_set_stop(c); + + if (attr == &sysfs_synchronous) { + bool sync = strtoul_or_return(buf); + + if (sync != CACHE_SYNC(&c->sb)) { + SET_CACHE_SYNC(&c->sb, sync); + bcache_write_super(c); + } + } + + if (attr == &sysfs_flash_vol_create) { + int r; + uint64_t v; + strtoi_h_or_return(buf, v); + + r = bch_flash_dev_create(c, v); + if (r) + return r; + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&c->writeback_keys_done, 0); + atomic_long_set(&c->writeback_keys_failed, 0); + + memset(&c->gc_stats, 0, sizeof(struct gc_stat)); + bch_cache_accounting_clear(&c->accounting); + } + + if (attr == &sysfs_trigger_gc) + bch_queue_gc(c); + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->shrink.shrink(&c->shrink, &sc); + } + + sysfs_strtoul(congested_read_threshold_us, + c->congested_read_threshold_us); + sysfs_strtoul(congested_write_threshold_us, + c->congested_write_threshold_us); + + if (attr == &sysfs_io_error_limit) + c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; + + /* See count_io_errors() for why 88 */ + if (attr == &sysfs_io_error_halflife) + c->error_decay = strtoul_or_return(buf) / 88; + + sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); + sysfs_strtoul(verify, c->verify); + sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); + sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); + sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + + return size; +} +STORE_LOCKED(bch_cache_set) + +SHOW(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_show(&c->kobj, attr, buf); +} + +STORE(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_store(&c->kobj, attr, buf, size); +} + +static void bch_cache_set_internal_release(struct kobject *k) +{ +} + +static struct attribute *bch_cache_set_files[] = { + &sysfs_unregister, + &sysfs_stop, + &sysfs_synchronous, + &sysfs_journal_delay_ms, + &sysfs_flash_vol_create, + + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_tree_depth, + &sysfs_root_usage_percent, + &sysfs_btree_cache_size, + &sysfs_cache_available_percent, + + &sysfs_average_key_size, + &sysfs_dirty_data, + + &sysfs_io_error_limit, + &sysfs_io_error_halflife, + &sysfs_congested, + &sysfs_congested_read_threshold_us, + &sysfs_congested_write_threshold_us, + &sysfs_clear_stats, + NULL +}; +KTYPE(bch_cache_set); + +static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_active_journal_entries, + + sysfs_time_stats_attribute_list(btree_gc, sec, ms) + sysfs_time_stats_attribute_list(btree_split, sec, us) + sysfs_time_stats_attribute_list(btree_sort, ms, us) + sysfs_time_stats_attribute_list(btree_read, ms, us) + sysfs_time_stats_attribute_list(try_harder, ms, us) + + &sysfs_btree_nodes, + &sysfs_btree_used_percent, + &sysfs_btree_cache_max_chain, + + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, + + &sysfs_trigger_gc, + &sysfs_prune_cache, +#ifdef CONFIG_BCACHE_DEBUG + &sysfs_verify, + &sysfs_key_merging_disabled, +#endif + &sysfs_gc_always_rewrite, + &sysfs_btree_shrinker_disabled, + &sysfs_copy_gc_enabled, + NULL +}; +KTYPE(bch_cache_set_internal); + +SHOW(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + sysfs_hprint(bucket_size, bucket_bytes(ca)); + sysfs_hprint(block_size, block_bytes(ca)); + sysfs_print(nbuckets, ca->sb.nbuckets); + sysfs_print(discard, ca->discard); + sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); + sysfs_hprint(btree_written, + atomic_long_read(&ca->btree_sectors_written) << 9); + sysfs_hprint(metadata_written, + (atomic_long_read(&ca->meta_sectors_written) + + atomic_long_read(&ca->btree_sectors_written)) << 9); + + sysfs_print(io_errors, + atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); + + sysfs_print(freelist_percent, ca->free.size * 100 / + ((size_t) ca->sb.nbuckets)); + + if (attr == &sysfs_cache_replacement_policy) + return bch_snprint_string_list(buf, PAGE_SIZE, + cache_replacement_policies, + CACHE_REPLACEMENT(&ca->sb)); + + if (attr == &sysfs_priority_stats) { + int cmp(const void *l, const void *r) + { return *((uint16_t *) r) - *((uint16_t *) l); } + + /* Number of quantiles we compute */ + const unsigned nq = 31; + + size_t n = ca->sb.nbuckets, i, unused, btree; + uint64_t sum = 0; + uint16_t q[nq], *p, *cached; + ssize_t ret; + + cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); + if (!p) + return -ENOMEM; + + mutex_lock(&ca->set->bucket_lock); + for (i = ca->sb.first_bucket; i < n; i++) + p[i] = ca->buckets[i].prio; + mutex_unlock(&ca->set->bucket_lock); + + sort(p, n, sizeof(uint16_t), cmp, NULL); + + while (n && + !cached[n - 1]) + --n; + + unused = ca->sb.nbuckets - n; + + while (cached < p + n && + *cached == BTREE_PRIO) + cached++; + + btree = cached - p; + n -= btree; + + for (i = 0; i < n; i++) + sum += INITIAL_PRIO - cached[i]; + + if (n) + do_div(sum, n); + + for (i = 0; i < nq; i++) + q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; + + vfree(p); + + ret = snprintf(buf, PAGE_SIZE, + "Unused: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + btree * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (nq + 1)); + + for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) + ret += snprintf(buf + ret, PAGE_SIZE - ret, + i < nq - 1 ? "%u " : "%u]\n", q[i]); + + buf[PAGE_SIZE - 1] = '\0'; + return ret; + } + + return 0; +} +SHOW_LOCKED(bch_cache) + +STORE(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + if (blk_queue_discard(bdev_get_queue(ca->bdev))) + ca->discard = v; + + if (v != CACHE_DISCARD(&ca->sb)) { + SET_CACHE_DISCARD(&ca->sb, v); + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_cache_replacement_policy) { + ssize_t v = bch_read_string_list(buf, cache_replacement_policies); + + if (v < 0) + return v; + + if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { + mutex_lock(&ca->set->bucket_lock); + SET_CACHE_REPLACEMENT(&ca->sb, v); + mutex_unlock(&ca->set->bucket_lock); + + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_freelist_percent) { + DECLARE_FIFO(long, free); + long i; + size_t p = strtoul_or_return(buf); + + p = clamp_t(size_t, + ((size_t) ca->sb.nbuckets * p) / 100, + roundup_pow_of_two(ca->sb.nbuckets) >> 9, + ca->sb.nbuckets / 2); + + if (!init_fifo_exact(&free, p, GFP_KERNEL)) + return -ENOMEM; + + mutex_lock(&ca->set->bucket_lock); + + fifo_move(&free, &ca->free); + fifo_swap(&free, &ca->free); + + mutex_unlock(&ca->set->bucket_lock); + + while (fifo_pop(&free, i)) + atomic_dec(&ca->buckets[i].pin); + + free_fifo(&free); + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&ca->sectors_written, 0); + atomic_long_set(&ca->btree_sectors_written, 0); + atomic_long_set(&ca->meta_sectors_written, 0); + atomic_set(&ca->io_count, 0); + atomic_set(&ca->io_errors, 0); + } + + return size; +} +STORE_LOCKED(bch_cache) + +static struct attribute *bch_cache_files[] = { + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_nbuckets, + &sysfs_priority_stats, + &sysfs_discard, + &sysfs_written, + &sysfs_btree_written, + &sysfs_metadata_written, + &sysfs_io_errors, + &sysfs_clear_stats, + &sysfs_freelist_percent, + &sysfs_cache_replacement_policy, + NULL +}; +KTYPE(bch_cache); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h new file mode 100644 index 00000000000..0526fe92a68 --- /dev/null +++ b/drivers/md/bcache/sysfs.h @@ -0,0 +1,110 @@ +#ifndef _BCACHE_SYSFS_H_ +#define _BCACHE_SYSFS_H_ + +#define KTYPE(type) \ +struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &((const struct sysfs_ops) { \ + .show = type ## _show, \ + .store = type ## _store \ + }), \ + .default_attrs = type ## _files \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ + +#define SHOW_LOCKED(fn) \ +SHOW(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _show(kobj, attr, buf); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define STORE_LOCKED(fn) \ +STORE(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _store(kobj, attr, buf, size); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) +#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) +#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) + +#define sysfs_printf(file, fmt, ...) \ +do { \ + if (attr == &sysfs_ ## file) \ + return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return snprint(buf, PAGE_SIZE, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ + if (attr == &sysfs_ ## file) { \ + ssize_t ret = bch_hprint(buf, val); \ + strcat(buf, "\n"); \ + return ret + 1; \ + } \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) +#define var_print(_var) sysfs_print(_var, var(_var)) +#define var_hprint(_var) sysfs_hprint(_var, var(_var)) + +#define sysfs_strtoul(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe(buf, var) ?: (ssize_t) size; \ +} while (0) + +#define sysfs_strtoul_clamp(file, var, min, max) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe_clamp(buf, var, min, max) \ + ?: (ssize_t) size; \ +} while (0) + +#define strtoul_or_return(cp) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define strtoi_h_or_return(cp, v) \ +do { \ + int _r = strtoi_h(cp, &v); \ + if (_r) \ + return _r; \ +} while (0) + +#define sysfs_hatoi(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + +#endif /* _BCACHE_SYSFS_H_ */ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c new file mode 100644 index 00000000000..983f9bb411b --- /dev/null +++ b/drivers/md/bcache/trace.c @@ -0,0 +1,26 @@ +#include "bcache.h" +#include "btree.h" +#include "request.h" + +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/bcache.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c new file mode 100644 index 00000000000..da3a99e85b1 --- /dev/null +++ b/drivers/md/bcache/util.c @@ -0,0 +1,377 @@ +/* + * random utiility code, for bcache but in theory not specific to bcache + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/types.h> + +#include "util.h" + +#define simple_strtoint(c, end, base) simple_strtol(c, end, base) +#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) + +#define STRTO_H(name, type) \ +int bch_ ## name ## _h(const char *cp, type *res) \ +{ \ + int u = 0; \ + char *e; \ + type i = simple_ ## name(cp, &e, 10); \ + \ + switch (tolower(*e)) { \ + default: \ + return -EINVAL; \ + case 'y': \ + case 'z': \ + u++; \ + case 'e': \ + u++; \ + case 'p': \ + u++; \ + case 't': \ + u++; \ + case 'g': \ + u++; \ + case 'm': \ + u++; \ + case 'k': \ + u++; \ + if (e++ == cp) \ + return -EINVAL; \ + case '\n': \ + case '\0': \ + if (*e == '\n') \ + e++; \ + } \ + \ + if (*e) \ + return -EINVAL; \ + \ + while (u--) { \ + if ((type) ~0 > 0 && \ + (type) ~0 / 1024 <= i) \ + return -EINVAL; \ + if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ + (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ + return -EINVAL; \ + i *= 1024; \ + } \ + \ + *res = i; \ + return 0; \ +} \ + +STRTO_H(strtoint, int) +STRTO_H(strtouint, unsigned int) +STRTO_H(strtoll, long long) +STRTO_H(strtoull, unsigned long long) + +ssize_t bch_hprint(char *buf, int64_t v) +{ + static const char units[] = "?kMGTPEZY"; + char dec[4] = ""; + int u, t = 0; + + for (u = 0; v >= 1024 || v <= -1024; u++) { + t = v & ~(~0 << 10); + v >>= 10; + } + + if (!u) + return sprintf(buf, "%llu", v); + + if (v < 100 && v > -100) + snprintf(dec, sizeof(dec), ".%i", t / 100); + + return sprintf(buf, "%lli%s%c", v, dec, units[u]); +} + +ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected) +{ + char *out = buf; + size_t i; + + for (i = 0; list[i]; i++) + out += snprintf(out, buf + size - out, + i == selected ? "[%s] " : "%s ", list[i]); + + out[-1] = '\n'; + return out - buf; +} + +ssize_t bch_read_string_list(const char *buf, const char * const list[]) +{ + size_t i; + char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); + if (!d) + return -ENOMEM; + + s = strim(d); + + for (i = 0; list[i]; i++) + if (!strcmp(list[i], s)) + break; + + kfree(d); + + if (!list[i]) + return -EINVAL; + + return i; +} + +bool bch_is_zero(const char *p, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + if (p[i]) + return false; + return true; +} + +int bch_parse_uuid(const char *s, char *uuid) +{ + size_t i, j, x; + memset(uuid, 0, 16); + + for (i = 0, j = 0; + i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; + i++) { + x = s[i] | 32; + + switch (x) { + case '0'...'9': + x -= '0'; + break; + case 'a'...'f': + x -= 'a' - 10; + break; + default: + continue; + } + + if (!(j & 1)) + x <<= 4; + uuid[j++ >> 1] |= x; + } + return i; +} + +void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) +{ + uint64_t now = local_clock(); + uint64_t duration = time_after64(now, start_time) + ? now - start_time : 0; + uint64_t last = time_after64(now, stats->last) + ? now - stats->last : 0; + + stats->max_duration = max(stats->max_duration, duration); + + if (stats->last) { + ewma_add(stats->average_duration, duration, 8, 8); + + if (stats->average_frequency) + ewma_add(stats->average_frequency, last, 8, 8); + else + stats->average_frequency = last << 8; + } else { + stats->average_duration = duration << 8; + } + + stats->last = now ?: 1; +} + +unsigned bch_next_delay(struct ratelimit *d, uint64_t done) +{ + uint64_t now = local_clock(); + + d->next += div_u64(done, d->rate); + + return time_after64(d->next, now) + ? div_u64(d->next - now, NSEC_PER_SEC / HZ) + : 0; +} + +void bch_bio_map(struct bio *bio, void *base) +{ + size_t size = bio->bi_size; + struct bio_vec *bv = bio->bi_io_vec; + + BUG_ON(!bio->bi_size); + BUG_ON(bio->bi_vcnt); + + bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; + goto start; + + for (; size; bio->bi_vcnt++, bv++) { + bv->bv_offset = 0; +start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, + size); + if (base) { + bv->bv_page = is_vmalloc_addr(base) + ? vmalloc_to_page(base) + : virt_to_page(base); + + base += bv->bv_len; + } + + size -= bv->bv_len; + } +} + +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment(bv, bio, i) { + bv->bv_page = alloc_page(gfp); + if (!bv->bv_page) { + while (bv-- != bio->bi_io_vec + bio->bi_idx) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} + +/* + * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any + * use permitted, subject to terms of PostgreSQL license; see.) + + * If we have a 64-bit integer type, then a 64-bit CRC looks just like the + * usual sort of implementation. (See Ross Williams' excellent introduction + * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from + * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) + * If we have no working 64-bit type, then fake it with two 32-bit registers. + * + * The present implementation is a normal (not "reflected", in Williams' + * terms) 64-bit CRC, using initial all-ones register contents and a final + * bit inversion. The chosen polynomial is borrowed from the DLT1 spec + * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 +*/ + +static const uint64_t crc_table[256] = { + 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, + 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, + 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, + 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, + 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, + 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, + 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, + 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, + 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, + 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, + 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, + 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, + 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, + 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, + 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, + 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, + 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, + 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, + 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, + 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, + 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, + 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, + 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, + 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, + 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, + 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, + 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, + 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, + 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, + 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, + 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, + 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, + 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, + 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, + 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, + 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, + 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, + 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, + 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, + 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, + 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, + 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, + 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, + 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, + 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, + 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, + 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, + 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, + 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, + 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, + 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, + 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, + 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, + 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, + 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, + 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, + 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, + 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, + 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, + 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, + 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, + 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, + 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, + 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, + 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, + 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, + 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, + 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, + 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, + 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, + 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, + 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, + 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, + 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, + 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, + 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, + 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, + 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, + 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, + 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, + 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, + 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, + 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, + 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, + 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, + 0x9AFCE626CE85B507ULL, +}; + +uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) +{ + const unsigned char *data = _data; + + while (len--) { + int i = ((int) (crc >> 56) ^ *data++) & 0xFF; + crc = crc_table[i] ^ (crc << 8); + } + + return crc; +} + +uint64_t bch_crc64(const void *data, size_t len) +{ + uint64_t crc = 0xffffffffffffffffULL; + + crc = bch_crc64_update(crc, data, len); + + return crc ^ 0xffffffffffffffffULL; +} diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h new file mode 100644 index 00000000000..577393e38c3 --- /dev/null +++ b/drivers/md/bcache/util.h @@ -0,0 +1,589 @@ + +#ifndef _BCACHE_UTIL_H +#define _BCACHE_UTIL_H + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/llist.h> +#include <linux/ratelimit.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +#include "closure.h" + +#define PAGE_SECTORS (PAGE_SIZE / 512) + +struct closure; + +#include <trace/events/bcache.h> + +#ifdef CONFIG_BCACHE_EDEBUG + +#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) +#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) + +#else /* EDEBUG */ + +#define atomic_dec_bug(v) atomic_dec(v) +#define atomic_inc_bug(v, i) atomic_inc(v) + +#endif + +#define BITMASK(name, type, field, offset, size) \ +static inline uint64_t name(const type *k) \ +{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ + \ +static inline void SET_##name(type *k, uint64_t v) \ +{ \ + k->field &= ~(~((uint64_t) ~0 << size) << offset); \ + k->field |= v << offset; \ +} + +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + +#define init_heap(heap, _size, gfp) \ +({ \ + size_t _bytes; \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + _bytes = (heap)->size * sizeof(*(heap)->data); \ + (heap)->data = NULL; \ + if (_bytes < KMALLOC_MAX_SIZE) \ + (heap)->data = kmalloc(_bytes, (gfp)); \ + if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ + (heap)->data = vmalloc(_bytes); \ + (heap)->data; \ +}) + +#define free_heap(heap) \ +do { \ + if (is_vmalloc_addr((heap)->data)) \ + vfree((heap)->data); \ + else \ + kfree((heap)->data); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + +#define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ + type *data; \ + } name + +#define fifo_for_each(c, fifo, iter) \ + for (iter = (fifo)->front; \ + c = (fifo)->data[iter], iter != (fifo)->back; \ + iter = (iter + 1) & (fifo)->mask) + +#define __init_fifo(fifo, gfp) \ +({ \ + size_t _allocated_size, _bytes; \ + BUG_ON(!(fifo)->size); \ + \ + _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ + _bytes = _allocated_size * sizeof(*(fifo)->data); \ + \ + (fifo)->mask = _allocated_size - 1; \ + (fifo)->front = (fifo)->back = 0; \ + (fifo)->data = NULL; \ + \ + if (_bytes < KMALLOC_MAX_SIZE) \ + (fifo)->data = kmalloc(_bytes, (gfp)); \ + if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ + (fifo)->data = vmalloc(_bytes); \ + (fifo)->data; \ +}) + +#define init_fifo_exact(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + __init_fifo(fifo, gfp); \ +}) + +#define init_fifo(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + if ((fifo)->size > 4) \ + (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ + __init_fifo(fifo, gfp); \ +}) + +#define free_fifo(fifo) \ +do { \ + if (is_vmalloc_addr((fifo)->data)) \ + vfree((fifo)->data); \ + else \ + kfree((fifo)->data); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) (!fifo_used(fifo)) +#define fifo_full(fifo) (!fifo_free(fifo)) + +#define fifo_front(fifo) ((fifo)->data[(fifo)->front]) +#define fifo_back(fifo) \ + ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) + +#define fifo_push_back(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) { \ + (fifo)->data[(fifo)->back++] = (i); \ + (fifo)->back &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + (i) = (fifo)->data[(fifo)->front++]; \ + (fifo)->front &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_push_front(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) { \ + --(fifo)->front; \ + (fifo)->front &= (fifo)->mask; \ + (fifo)->data[(fifo)->front] = (i); \ + } \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + --(fifo)->back; \ + (fifo)->back &= (fifo)->mask; \ + (i) = (fifo)->data[(fifo)->back] \ + } \ + _r; \ +}) + +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +/* + * Simple array based allocator - preallocates a number of elements and you can + * never allocate more than that, also has no locking. + * + * Handy because if you know you only need a fixed number of elements you don't + * have to worry about memory allocation failure, and sometimes a mempool isn't + * what you want. + * + * We treat the free elements as entries in a singly linked list, and the + * freelist as a stack - allocating and freeing push and pop off the freelist. + */ + +#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ + struct { \ + type *freelist; \ + type data[size]; \ + } name + +#define array_alloc(array) \ +({ \ + typeof((array)->freelist) _ret = (array)->freelist; \ + \ + if (_ret) \ + (array)->freelist = *((typeof((array)->freelist) *) _ret);\ + \ + _ret; \ +}) + +#define array_free(array, ptr) \ +do { \ + typeof((array)->freelist) _ptr = ptr; \ + \ + *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ + (array)->freelist = _ptr; \ +} while (0) + +#define array_allocator_init(array) \ +do { \ + typeof((array)->freelist) _i; \ + \ + BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ + (array)->freelist = NULL; \ + \ + for (_i = (array)->data; \ + _i < (array)->data + ARRAY_SIZE((array)->data); \ + _i++) \ + array_free(array, _i); \ +} while (0) + +#define array_freelist_empty(array) ((array)->freelist == NULL) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +int bch_strtoint_h(const char *, int *); +int bch_strtouint_h(const char *, unsigned int *); +int bch_strtoll_h(const char *, long long *); +int bch_strtoull_h(const char *, unsigned long long *); + +static inline int bch_strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch_strtoint_h(cp, (int *) res); +#else + return bch_strtoll_h(cp, (long long *) res); +#endif +} + +static inline int bch_strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch_strtouint_h(cp, (unsigned int *) res); +#else + return bch_strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + (__builtin_types_compatible_p(typeof(*res), int) \ + ? bch_strtoint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long) \ + ? bch_strtol_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long long) \ + ? bch_strtoll_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned int) \ + ? bch_strtouint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long) \ + ? bch_strtoul_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ + ? bch_strtoull_h(cp, (void *) res) : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +#define snprint(buf, size, var) \ + snprintf(buf, size, \ + __builtin_types_compatible_p(typeof(var), int) \ + ? "%i\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned) \ + ? "%u\n" : \ + __builtin_types_compatible_p(typeof(var), long) \ + ? "%li\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned long)\ + ? "%lu\n" : \ + __builtin_types_compatible_p(typeof(var), int64_t) \ + ? "%lli\n" : \ + __builtin_types_compatible_p(typeof(var), uint64_t) \ + ? "%llu\n" : \ + __builtin_types_compatible_p(typeof(var), const char *) \ + ? "%s\n" : "%i\n", var) + +ssize_t bch_hprint(char *buf, int64_t v); + +bool bch_is_zero(const char *p, size_t n); +int bch_parse_uuid(const char *s, char *uuid); + +ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected); + +ssize_t bch_read_string_list(const char *buf, const char * const list[]); + +struct time_stats { + /* + * all fields are in nanoseconds, averages are ewmas stored left shifted + * by 8 + */ + uint64_t max_duration; + uint64_t average_duration; + uint64_t average_frequency; + uint64_t last; +}; + +void bch_time_stats_update(struct time_stats *stats, uint64_t time); + +#define NSEC_PER_ns 1L +#define NSEC_PER_us NSEC_PER_USEC +#define NSEC_PER_ms NSEC_PER_MSEC +#define NSEC_PER_sec NSEC_PER_SEC + +#define __print_time_stat(stats, name, stat, units) \ + sysfs_print(name ## _ ## stat ## _ ## units, \ + div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) + +#define sysfs_print_time_stats(stats, name, \ + frequency_units, \ + duration_units) \ +do { \ + __print_time_stat(stats, name, \ + average_frequency, frequency_units); \ + __print_time_stat(stats, name, \ + average_duration, duration_units); \ + __print_time_stat(stats, name, \ + max_duration, duration_units); \ + \ + sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ + ? div_s64(local_clock() - (stats)->last, \ + NSEC_PER_ ## frequency_units) \ + : -1LL); \ +} while (0) + +#define sysfs_time_stats_attribute(name, \ + frequency_units, \ + duration_units) \ +read_attribute(name ## _average_frequency_ ## frequency_units); \ +read_attribute(name ## _average_duration_ ## duration_units); \ +read_attribute(name ## _max_duration_ ## duration_units); \ +read_attribute(name ## _last_ ## frequency_units) + +#define sysfs_time_stats_attribute_list(name, \ + frequency_units, \ + duration_units) \ +&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ +&sysfs_ ## name ## _average_duration_ ## duration_units, \ +&sysfs_ ## name ## _max_duration_ ## duration_units, \ +&sysfs_ ## name ## _last_ ## frequency_units, + +#define ewma_add(ewma, val, weight, factor) \ +({ \ + (ewma) *= (weight) - 1; \ + (ewma) += (val) << factor; \ + (ewma) /= (weight); \ + (ewma) >> factor; \ +}) + +struct ratelimit { + uint64_t next; + unsigned rate; +}; + +static inline void ratelimit_reset(struct ratelimit *d) +{ + d->next = local_clock(); +} + +unsigned bch_next_delay(struct ratelimit *d, uint64_t done); + +#define __DIV_SAFE(n, d, zero) \ +({ \ + typeof(n) _n = (n); \ + typeof(d) _d = (d); \ + _d ? _n / _d : zero; \ +}) + +#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? container_of(_ptr, type, member) : NULL; \ +}) + +#define RB_INSERT(root, new, member, cmp) \ +({ \ + __label__ dup; \ + struct rb_node **n = &(root)->rb_node, *parent = NULL; \ + typeof(new) this; \ + int res, ret = -1; \ + \ + while (*n) { \ + parent = *n; \ + this = container_of(*n, typeof(*(new)), member); \ + res = cmp(new, this); \ + if (!res) \ + goto dup; \ + n = res < 0 \ + ? &(*n)->rb_left \ + : &(*n)->rb_right; \ + } \ + \ + rb_link_node(&(new)->member, parent, n); \ + rb_insert_color(&(new)->member, root); \ + ret = 0; \ +dup: \ + ret; \ +}) + +#define RB_SEARCH(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (!res) { \ + ret = this; \ + break; \ + } \ + n = res < 0 \ + ? n->rb_left \ + : n->rb_right; \ + } \ + ret; \ +}) + +#define RB_GREATER(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (res < 0) { \ + ret = this; \ + n = n->rb_left; \ + } else \ + n = n->rb_right; \ + } \ + ret; \ +}) + +#define RB_FIRST(root, type, member) \ + container_of_or_null(rb_first(root), type, member) + +#define RB_LAST(root, type, member) \ + container_of_or_null(rb_last(root), type, member) + +#define RB_NEXT(ptr, member) \ + container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) + +#define RB_PREV(ptr, member) \ + container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) + +/* Does linear interpolation between powers of two */ +static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +{ + unsigned fract = x & ~(~0 << fract_bits); + + x >>= fract_bits; + x = 1 << x; + x += (x * fract) >> fract_bits; + + return x; +} + +#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) + +void bch_bio_map(struct bio *bio, void *base); + +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> 9; +} + +#define closure_bio_submit(bio, cl, dev) \ +do { \ + closure_get(cl); \ + bch_generic_make_request(bio, &(dev)->bio_split_hook); \ +} while (0) + +uint64_t bch_crc64_update(uint64_t, const void *, size_t); +uint64_t bch_crc64(const void *, size_t); + +#endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c new file mode 100644 index 00000000000..93e7e31a4bd --- /dev/null +++ b/drivers/md/bcache/writeback.c @@ -0,0 +1,414 @@ +/* + * background writeback - scan btree for dirty data and write it to the backing + * device + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" + +static struct workqueue_struct *dirty_wq; + +static void read_dirty(struct closure *); + +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; + +/* Rate limiting */ + +static void __update_writeback_rate(struct cached_dev *dc) +{ + struct cache_set *c = dc->disk.c; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_dirty_target = + div_u64(cache_sectors * dc->writeback_percent, 100); + + int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), + c->cached_dev_sectors); + + /* PD controller */ + + int change = 0; + int64_t error; + int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); + int64_t derivative = dirty - dc->disk.sectors_dirty_last; + + dc->disk.sectors_dirty_last = dirty; + + derivative *= dc->writeback_rate_d_term; + derivative = clamp(derivative, -dirty, dirty); + + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + dc->writeback_rate_d_smooth, 0); + + /* Avoid divide by zero */ + if (!target) + goto out; + + error = div64_s64((dirty + derivative - target) << 8, target); + + change = div_s64((dc->writeback_rate.rate * error) >> 8, + dc->writeback_rate_p_term_inverse); + + /* Don't increase writeback rate if the device isn't keeping up */ + if (change > 0 && + time_after64(local_clock(), + dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) + change = 0; + + dc->writeback_rate.rate = + clamp_t(int64_t, dc->writeback_rate.rate + change, + 1, NSEC_PER_MSEC); +out: + dc->writeback_rate_derivative = derivative; + dc->writeback_rate_change = change; + dc->writeback_rate_target = target; + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); +} + +static void update_writeback_rate(struct work_struct *work) +{ + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, + writeback_rate_update); + + down_read(&dc->writeback_lock); + + if (atomic_read(&dc->has_dirty) && + dc->writeback_percent) + __update_writeback_rate(dc); + + up_read(&dc->writeback_lock); +} + +static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +{ + if (atomic_read(&dc->disk.detaching) || + !dc->writeback_percent) + return 0; + + return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); +} + +/* Background writeback */ + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + return KEY_DIRTY(k); +} + +static void dirty_init(struct keybuf_key *w) +{ + struct dirty_io *io = w->private; + struct bio *bio = &io->bio; + + bio_init(bio); + if (!io->dc->writeback_percent) + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_size = KEY_SIZE(&w->key) << 9; + bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); + bio->bi_private = w; + bio->bi_io_vec = bio->bi_inline_vecs; + bch_bio_map(bio, NULL); +} + +static void refill_dirty(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, + writeback.cl); + struct keybuf *buf = &dc->writeback_keys; + bool searched_from_start = false; + struct bkey end = MAX_KEY; + SET_KEY_INODE(&end, dc->disk.id); + + if (!atomic_read(&dc->disk.detaching) && + !dc->writeback_running) + closure_return(cl); + + down_write(&dc->writeback_lock); + + if (!atomic_read(&dc->has_dirty)) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + + up_write(&dc->writeback_lock); + closure_return(cl); + } + + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { + buf->last_scanned = KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + bch_refill_keybuf(dc->disk.c, buf, &end); + + if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { + /* Searched the entire btree - delay awhile */ + + if (RB_EMPTY_ROOT(&buf->keys)) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + } + + if (!atomic_read(&dc->disk.detaching)) + closure_delay(&dc->writeback, dc->writeback_delay * HZ); + } + + up_write(&dc->writeback_lock); + + ratelimit_reset(&dc->writeback_rate); + + /* Punt to workqueue only so we don't recurse and blow the stack */ + continue_at(cl, read_dirty, dirty_wq); +} + +void bch_writeback_queue(struct cached_dev *dc) +{ + if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { + if (!atomic_read(&dc->disk.detaching)) + closure_delay(&dc->writeback, dc->writeback_delay * HZ); + + continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); + } +} + +void bch_writeback_add(struct cached_dev *dc, unsigned sectors) +{ + atomic_long_add(sectors, &dc->disk.sectors_dirty); + + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + atomic_inc(&dc->count); + + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + + if (dc->writeback_percent) + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } +} + +/* Background writeback - IO loop */ + +static void dirty_io_destructor(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + kfree(io); +} + +static void write_dirty_finish(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; + struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); + + while (bv-- != io->bio.bi_io_vec) + __free_page(bv->bv_page); + + /* This is kind of a dumb way of signalling errors. */ + if (KEY_DIRTY(&w->key)) { + unsigned i; + struct btree_op op; + bch_btree_op_init_stack(&op); + + op.type = BTREE_REPLACE; + bkey_copy(&op.replace, &w->key); + + SET_KEY_DIRTY(&w->key, false); + bch_keylist_add(&op.keys, &w->key); + + for (i = 0; i < KEY_PTRS(&w->key); i++) + atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); + + pr_debug("clearing %s", pkey(&w->key)); + bch_btree_insert(&op, dc->disk.c); + closure_sync(&op.cl); + + atomic_long_inc(op.insert_collision + ? &dc->disk.c->writeback_keys_failed + : &dc->disk.c->writeback_keys_done); + } + + bch_keybuf_del(&dc->writeback_keys, w); + atomic_dec_bug(&dc->in_flight); + + closure_wake_up(&dc->writeback_wait); + + closure_return_with_destructor(cl, dirty_io_destructor); +} + +static void dirty_endio(struct bio *bio, int error) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + if (error) + SET_KEY_DIRTY(&w->key, false); + + closure_put(&io->cl); +} + +static void write_dirty(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + + dirty_init(w); + io->bio.bi_rw = WRITE; + io->bio.bi_sector = KEY_START(&w->key); + io->bio.bi_bdev = io->dc->bdev; + io->bio.bi_end_io = dirty_endio; + + trace_bcache_write_dirty(&io->bio); + closure_bio_submit(&io->bio, cl, &io->dc->disk); + + continue_at(cl, write_dirty_finish, dirty_wq); +} + +static void read_dirty_endio(struct bio *bio, int error) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + error, "reading dirty data from cache"); + + dirty_endio(bio, error); +} + +static void read_dirty_submit(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + + trace_bcache_read_dirty(&io->bio); + closure_bio_submit(&io->bio, cl, &io->dc->disk); + + continue_at(cl, write_dirty, dirty_wq); +} + +static void read_dirty(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, + writeback.cl); + unsigned delay = writeback_delay(dc, 0); + struct keybuf_key *w; + struct dirty_io *io; + + /* + * XXX: if we error, background writeback just spins. Should use some + * mempools. + */ + + while (1) { + w = bch_keybuf_next(&dc->writeback_keys); + if (!w) + break; + + BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); + + if (delay > 0 && + (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50)) { + w->private = NULL; + + closure_delay(&dc->writeback, delay); + continue_at(cl, read_dirty, dirty_wq); + } + + dc->last_read = KEY_OFFSET(&w->key); + + io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->dc = dc; + + dirty_init(w); + io->bio.bi_sector = PTR_OFFSET(&w->key, 0); + io->bio.bi_bdev = PTR_CACHE(dc->disk.c, + &w->key, 0)->bdev; + io->bio.bi_rw = READ; + io->bio.bi_end_io = read_dirty_endio; + + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) + goto err_free; + + pr_debug("%s", pkey(&w->key)); + + closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); + + delay = writeback_delay(dc, KEY_SIZE(&w->key)); + + atomic_inc(&dc->in_flight); + + if (!closure_wait_event(&dc->writeback_wait, cl, + atomic_read(&dc->in_flight) < 64)) + continue_at(cl, read_dirty, dirty_wq); + } + + if (0) { +err_free: + kfree(w->private); +err: + bch_keybuf_del(&dc->writeback_keys, w); + } + + refill_dirty(cl); +} + +void bch_writeback_init_cached_dev(struct cached_dev *dc) +{ + closure_init_unlocked(&dc->writeback); + init_rwsem(&dc->writeback_lock); + + bch_keybuf_init(&dc->writeback_keys, dirty_pred); + + dc->writeback_metadata = true; + dc->writeback_running = true; + dc->writeback_percent = 10; + dc->writeback_delay = 30; + dc->writeback_rate.rate = 1024; + + dc->writeback_rate_update_seconds = 30; + dc->writeback_rate_d_term = 16; + dc->writeback_rate_p_term_inverse = 64; + dc->writeback_rate_d_smooth = 8; + + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); +} + +void bch_writeback_exit(void) +{ + if (dirty_wq) + destroy_workqueue(dirty_wq); +} + +int __init bch_writeback_init(void) +{ + dirty_wq = create_singlethread_workqueue("bcache_writeback"); + if (!dirty_wq) + return -ENOMEM; + + return 0; +} diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index f204a7a9cf3..6e7ec64b69a 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -78,3 +78,9 @@ SUBSYS(hugetlb) #endif /* */ + +#ifdef CONFIG_CGROUP_BCACHE +SUBSYS(bcache) +#endif + +/* */ diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 0c5a18ec322..1b4d4ee1168 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -52,7 +52,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.2" +#define REL_VERSION "8.4.3" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 @@ -319,7 +319,8 @@ enum drbd_state_rv { SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ SS_O_VOL_PEER_PRI = -20, - SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ + SS_OUTDATE_WO_CONN = -21, + SS_AFTER_LAST_ERROR = -22, /* Keep this at bottom */ }; /* from drbd_strings.c */ diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 1fa19c5f5e6..1fedf2b17cc 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -126,13 +126,12 @@ #define DRBD_RESYNC_RATE_DEF 250 #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ - /* less than 7 would hit performance unnecessarily. - * 919 slots context information per transaction, - * 32k activity log, 4k transaction size, - * one transaction in flight: - * 919 * 7 = 6433 */ + /* less than 7 would hit performance unnecessarily. */ #define DRBD_AL_EXTENTS_MIN 7 -#define DRBD_AL_EXTENTS_MAX 6433 + /* we use u16 as "slot number", (u16)~0 is "FREE". + * If you use >= 292 kB on-disk ring buffer, + * this is the maximum you can use: */ +#define DRBD_AL_EXTENTS_MAX 0xfffe #define DRBD_AL_EXTENTS_DEF 1237 #define DRBD_AL_EXTENTS_SCALE '1' diff --git a/include/linux/idr.h b/include/linux/idr.h index a470ac3ef49..871a213a847 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -124,11 +124,13 @@ static inline void *idr_find(struct idr *idr, int id) * @idp: idr handle * @entry: the type * to use as cursor * @id: id entry's key + * + * @entry and @id do not need to be initialized before the loop, and + * after normal terminatinon @entry is left with the value NULL. This + * is convenient for a "not found" value. */ -#define idr_for_each_entry(idp, entry, id) \ - for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ - entry != NULL; \ - ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) +#define idr_for_each_entry(idp, entry, id) \ + for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) /* * Don't use the following functions. These exist only to suppress diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 4019013c659..46262284de4 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc); extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); +extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8da67d625e1..0616ffe4570 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -133,10 +133,20 @@ do { \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0); +/* + * Take/release a lock when not the owner will release it. + * + * [ This API should be avoided as much as possible - the + * proper abstraction for this case is completions. ] + */ +extern void down_read_non_owner(struct rw_semaphore *sem); +extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) +# define down_read_non_owner(sem) down_read(sem) +# define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 022c085ac3c..caa8f4d0186 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1411,6 +1411,10 @@ struct task_struct { #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif +#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) + unsigned int sequential_io; + unsigned int sequential_io_avg; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h new file mode 100644 index 00000000000..3cc5a0b278c --- /dev/null +++ b/include/trace/events/bcache.h @@ -0,0 +1,271 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcache + +#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BCACHE_H + +#include <linux/tracepoint.h> + +struct search; + +DECLARE_EVENT_CLASS(bcache_request, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned int, orig_major ) + __field(unsigned int, orig_minor ) + __field(sector_t, sector ) + __field(dev_t, orig_sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->orig_major = s->d->disk->major; + __entry->orig_minor = s->d->disk->first_minor; + __entry->sector = bio->bi_sector; + __entry->orig_sector = bio->bi_sector - 16; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm, + __entry->orig_major, __entry->orig_minor, + (unsigned long long)__entry->orig_sector) +); + +DEFINE_EVENT(bcache_request, bcache_request_start, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio) +); + +DEFINE_EVENT(bcache_request, bcache_request_end, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio) +); + +DECLARE_EVENT_CLASS(bcache_bio, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) +); + + +DEFINE_EVENT(bcache_bio, bcache_passthrough, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_cache_hit, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_cache_miss, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_retry, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_writethrough, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_writeback, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_skip, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_btree_read, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_btree_write, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_dirty, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_dirty, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_moving, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_moving, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_journal_write, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DECLARE_EVENT_CLASS(bcache_cache_bio, + + TP_PROTO(struct bio *bio, + sector_t orig_sector, + struct block_device* orig_bdev), + + TP_ARGS(bio, orig_sector, orig_bdev), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(dev_t, orig_dev ) + __field(sector_t, sector ) + __field(sector_t, orig_sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->orig_dev = orig_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->orig_sector = orig_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm, + MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), + (unsigned long long)__entry->orig_sector) +); + +DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, + + TP_PROTO(struct bio *bio, + sector_t orig_sector, + struct block_device *orig_bdev), + + TP_ARGS(bio, orig_sector, orig_bdev) +); + +DECLARE_EVENT_CLASS(bcache_gc, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid), + + TP_STRUCT__entry( + __field(uint8_t *, uuid) + ), + + TP_fast_assign( + __entry->uuid = uuid; + ), + + TP_printk("%pU", __entry->uuid) +); + + +DEFINE_EVENT(bcache_gc, bcache_gc_start, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid) +); + +DEFINE_EVENT(bcache_gc, bcache_gc_end, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid) +); + +#endif /* _TRACE_BCACHE_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/fork.c b/kernel/fork.c index c509cc4a0d5..987b28a1f01 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1304,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif +#ifdef CONFIG_BCACHE + p->sequential_io = 0; + p->sequential_io_avg = 0; +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 6a3bccba7e7..1f3186b37fd 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; +EXPORT_SYMBOL_GPL(__lockdep_no_validate__); static int print_lock_nested_lock_not_held(struct task_struct *curr, diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b3c6c3fcd84..cfff1435bdf 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ed58a3216a6..b8b8560bfb9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } +EXPORT_SYMBOL_GPL(blk_fill_rwbs); #endif /* CONFIG_EVENT_TRACING */ diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 8335d39d2cc..4a83ecd0365 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } -static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) +/* used as internal flags to __lc_get */ +enum { + LC_GET_MAY_CHANGE = 1, + LC_GET_MAY_USE_UNCOMMITTED = 2, +}; + +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags) { struct lc_element *e; @@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool * this enr is currently being pulled in already, * and will be available once the pending transaction * has been committed. */ - if (e && e->lc_new_number == e->lc_number) { + if (e) { + if (e->lc_new_number != e->lc_number) { + /* It has been found above, but on the "to_be_changed" + * list, not yet committed. Don't pull it in twice, + * wait for the transaction, then try again... + */ + if (!(flags & LC_GET_MAY_USE_UNCOMMITTED)) + RETURN(NULL); + /* ... unless the caller is aware of the implications, + * probably preparing a cumulative transaction. */ + ++e->refcnt; + ++lc->hits; + RETURN(e); + } + /* else: lc_new_number == lc_number; a real hit. */ ++lc->hits; if (e->refcnt++ == 0) lc->used++; list_move(&e->list, &lc->in_use); /* Not evictable... */ RETURN(e); } + /* e == NULL */ ++lc->misses; - if (!may_change) - RETURN(NULL); - - /* It has been found above, but on the "to_be_changed" list, not yet - * committed. Don't pull it in twice, wait for the transaction, then - * try again */ - if (e) + if (!(flags & LC_GET_MAY_CHANGE)) RETURN(NULL); /* To avoid races with lc_try_lock(), first, mark us dirty @@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool */ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - return __lc_get(lc, enr, 1); + return __lc_get(lc, enr, LC_GET_MAY_CHANGE); +} + +/** + * lc_get_cumulative - like lc_get; also finds to-be-changed elements + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Unlike lc_get this also returns the element for @enr, if it is belonging to + * a pending transaction, so the return values are like for lc_get(), + * plus: + * + * pointer to an element already on the "to_be_changed" list. + * In this case, the cache was already marked %LC_DIRTY. + * + * Caller needs to make sure that the pending transaction is completed, + * before proceeding to actually use this element. + */ +struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED); } /** @@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); EXPORT_SYMBOL(lc_try_lock); EXPORT_SYMBOL(lc_is_used); +EXPORT_SYMBOL(lc_get_cumulative); |