diff options
Diffstat (limited to 'drivers/staging/ramster/zbud.c')
-rw-r--r-- | drivers/staging/ramster/zbud.c | 1060 |
1 files changed, 1060 insertions, 0 deletions
diff --git a/drivers/staging/ramster/zbud.c b/drivers/staging/ramster/zbud.c new file mode 100644 index 00000000000..a7c436127aa --- /dev/null +++ b/drivers/staging/ramster/zbud.c @@ -0,0 +1,1060 @@ +/* + * zbud.c - Compression buddies allocator + * + * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp. + * + * Compression buddies ("zbud") provides for efficiently packing two + * (or, possibly in the future, more) compressed pages ("zpages") into + * a single "raw" pageframe and for tracking both zpages and pageframes + * so that whole pageframes can be easily reclaimed in LRU-like order. + * It is designed to be used in conjunction with transcendent memory + * ("tmem"); for example separate LRU lists are maintained for persistent + * vs. ephemeral pages. + * + * A zbudpage is an overlay for a struct page and thus each zbudpage + * refers to a physical pageframe of RAM. When the caller passes a + * struct page from the kernel's page allocator, zbud "transforms" it + * to a zbudpage which sets/uses a different set of fields than the + * struct-page and thus must "untransform" it back by reinitializing + * certain fields before the struct-page can be freed. The fields + * of a zbudpage include a page lock for controlling access to the + * corresponding pageframe, and there is a size field for each zpage. + * Each zbudpage also lives on two linked lists: a "budlist" which is + * used to support efficient buddying of zpages; and an "lru" which + * is used for reclaiming pageframes in approximately least-recently-used + * order. + * + * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks" + * which contain the compressed data for zero, one, or two zbuds. Contained + * with the compressed data is a tmem_handle which is a key to allow + * the same data to be found via the tmem interface so the zpage can + * be invalidated (for ephemeral pages) or repatriated to the swap cache + * (for persistent pages). The contents of a zbudpageframe must never + * be accessed without holding the page lock for the corresponding + * zbudpage and, to accomodate highmem machines, the contents may + * only be examined or changes when kmapped. Thus, when in use, a + * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg". + * + * Note that the term "zbud" refers to the combination of a zpage and + * a tmem_handle that is stored as one of possibly two "buddied" zpages; + * it also generically refers to this allocator... sorry for any confusion. + * + * A zbudref is a pointer to a struct zbudpage (which can be cast to a + * struct page), with the LSB either cleared or set to indicate, respectively, + * the first or second zpage in the zbudpageframe. Since a zbudref can be + * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely + * references a stored tmem page and so is the only zbud data structure + * externally visible to zbud.c/zbud.h. + * + * Since we wish to reclaim entire pageframes but zpages may be randomly + * added and deleted to any given pageframe, we approximate LRU by + * promoting a pageframe to MRU when a zpage is added to it, but + * leaving it at the current place in the list when a zpage is deleted + * from it. As a side effect, zpages that are difficult to buddy (e.g. + * very large paages) will be reclaimed faster than average, which seems + * reasonable. + * + * In the current implementation, no more than two zpages may be stored in + * any pageframe and no zpage ever crosses a pageframe boundary. While + * other zpage allocation mechanisms may allow greater density, this two + * zpage-per-pageframe limit both ensures simple reclaim of pageframes + * (including garbage collection of references to the contents of those + * pageframes from tmem data structures) AND avoids the need for compaction. + * With additional complexity, zbud could be modified to support storing + * up to three zpages per pageframe or, to handle larger average zpages, + * up to three zpages per pair of pageframes, but it is not clear if the + * additional complexity would be worth it. So consider it an exercise + * for future developers. + * + * Note also that zbud does no page allocation or freeing. This is so + * that the caller has complete control over and, for accounting, visibility + * into if/when pages are allocated and freed. + * + * Finally, note that zbud limits the size of zpages it can store; the + * caller must check the zpage size with zbud_max_buddy_size before + * storing it, else BUGs will result. User beware. + */ + +#include <linux/module.h> +#include <linux/highmem.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/pagemap.h> +#include <linux/atomic.h> +#include <linux/bug.h> +#include "tmem.h" +#include "zcache.h" +#include "zbud.h" + +/* + * We need to ensure that a struct zbudpage is never larger than a + * struct page. This is checked with a BUG_ON in zbud_init. + * + * The unevictable field indicates that a zbud is being added to the + * zbudpage. Since this is a two-phase process (due to tmem locking), + * this field locks the zbudpage against eviction when a zbud match + * or creation is in process. Since this addition process may occur + * in parallel for two zbuds in one zbudpage, the field is a counter + * that must not exceed two. + */ +struct zbudpage { + union { + struct page page; + struct { + unsigned long space_for_flags; + struct { + unsigned zbud0_size:12; + unsigned zbud1_size:12; + unsigned unevictable:2; + }; + struct list_head budlist; + struct list_head lru; + }; + }; +}; + +struct zbudref { + union { + struct zbudpage *zbudpage; + unsigned long zbudref; + }; +}; + +#define CHUNK_SHIFT 6 +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define CHUNK_MASK (~(CHUNK_SIZE-1)) +#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define MAX_CHUNK (NCHUNKS-1) + +/* + * The following functions deal with the difference between struct + * page and struct zbudpage. Note the hack of using the pageflags + * from struct page; this is to avoid duplicating all the complex + * pageflag macros. + */ +static inline void zbudpage_spin_lock(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) { + do { + cpu_relax(); + } while (test_bit(PG_locked, &page->flags)); + } +} + +static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + clear_bit(PG_locked, &page->flags); +} + +static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage) +{ + return trylock_page((struct page *)zbudpage); +} + +static inline int zbudpage_is_locked(struct zbudpage *zbudpage) +{ + return PageLocked((struct page *)zbudpage); +} + +static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage) +{ + return kmap_atomic((struct page *)zbudpage); +} + +/* + * A dying zbudpage is an ephemeral page in the process of being evicted. + * Any data contained in the zbudpage is invalid and we are just waiting for + * the tmem pampds to be invalidated before freeing the page + */ +static inline int zbudpage_is_dying(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + return test_bit(PG_reclaim, &page->flags); +} + +static inline void zbudpage_set_dying(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + set_bit(PG_reclaim, &page->flags); +} + +static inline void zbudpage_clear_dying(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + clear_bit(PG_reclaim, &page->flags); +} + +/* + * A zombie zbudpage is a persistent page in the process of being evicted. + * The data contained in the zbudpage is valid and we are just waiting for + * the tmem pampds to be invalidated before freeing the page + */ +static inline int zbudpage_is_zombie(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + return test_bit(PG_dirty, &page->flags); +} + +static inline void zbudpage_set_zombie(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + set_bit(PG_dirty, &page->flags); +} + +static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage) +{ + struct page *page = (struct page *)zbudpage; + + clear_bit(PG_dirty, &page->flags); +} + +static inline void kunmap_zbudpage_atomic(void *zbpg) +{ + kunmap_atomic(zbpg); +} + +/* + * zbud "translation" and helper functions + */ + +static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref) +{ + unsigned long zbud = (unsigned long)zref; + zbud &= ~1UL; + return (struct zbudpage *)zbud; +} + +static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage, + unsigned budnum) +{ + unsigned long zbud = (unsigned long)zbudpage; + BUG_ON(budnum > 1); + zbud |= budnum; + return (struct zbudref *)zbud; +} + +static inline int zbudref_budnum(struct zbudref *zbudref) +{ + unsigned long zbud = (unsigned long)zbudref; + return zbud & 1UL; +} + +static inline unsigned zbud_max_size(void) +{ + return MAX_CHUNK << CHUNK_SHIFT; +} + +static inline unsigned zbud_size_to_chunks(unsigned size) +{ + BUG_ON(size == 0 || size > zbud_max_size()); + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */ +static inline char *zbud_data(void *zbpg, + unsigned budnum, unsigned size) +{ + char *p; + + BUG_ON(size == 0 || size > zbud_max_size()); + p = (char *)zbpg; + if (budnum == 1) + p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); + return p; +} + +/* + * These are all informative and exposed through debugfs... except for + * the arrays... anyone know how to do that? To avoid confusion for + * debugfs viewers, some of these should also be atomic_long_t, but + * I don't know how to expose atomics via debugfs either... + */ +static unsigned long zbud_eph_pageframes; +static unsigned long zbud_pers_pageframes; +static unsigned long zbud_eph_zpages; +static unsigned long zbud_pers_zpages; +static u64 zbud_eph_zbytes; +static u64 zbud_pers_zbytes; +static unsigned long zbud_eph_evicted_pageframes; +static unsigned long zbud_pers_evicted_pageframes; +static unsigned long zbud_eph_cumul_zpages; +static unsigned long zbud_pers_cumul_zpages; +static u64 zbud_eph_cumul_zbytes; +static u64 zbud_pers_cumul_zbytes; +static unsigned long zbud_eph_cumul_chunk_counts[NCHUNKS]; +static unsigned long zbud_pers_cumul_chunk_counts[NCHUNKS]; +static unsigned long zbud_eph_buddied_count; +static unsigned long zbud_pers_buddied_count; +static unsigned long zbud_eph_unbuddied_count; +static unsigned long zbud_pers_unbuddied_count; +static unsigned long zbud_eph_zombie_count; +static unsigned long zbud_pers_zombie_count; +static atomic_t zbud_eph_zombie_atomic; +static atomic_t zbud_pers_zombie_atomic; + +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#define zdfs debugfs_create_size_t +#define zdfs64 debugfs_create_u64 +static int zbud_debugfs_init(void) +{ + struct dentry *root = debugfs_create_dir("zbud", NULL); + if (root == NULL) + return -ENXIO; + + /* + * would be nice to dump the sizes of the unbuddied + * arrays, like was done with sysfs, but it doesn't + * look like debugfs is flexible enough to do that + */ + zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes); + zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes); + zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes); + zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes); + zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages); + zdfs("eph_evicted_pageframes", S_IRUGO, root, + &zbud_eph_evicted_pageframes); + zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages); + zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes); + zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count); + zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count); + zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages); + zdfs("pers_evicted_pageframes", S_IRUGO, root, + &zbud_pers_evicted_pageframes); + zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages); + zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes); + zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count); + zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count); + zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count); + return 0; +} +#undef zdfs +#undef zdfs64 +#endif + +/* protects the buddied list and all unbuddied lists */ +static DEFINE_SPINLOCK(zbud_eph_lists_lock); +static DEFINE_SPINLOCK(zbud_pers_lists_lock); + +struct zbud_unbuddied { + struct list_head list; + unsigned count; +}; + +/* list N contains pages with N chunks USED and NCHUNKS-N unused */ +/* element 0 is never used but optimizing that isn't worth it */ +static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS]; +static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS]; +static LIST_HEAD(zbud_eph_lru_list); +static LIST_HEAD(zbud_pers_lru_list); +static LIST_HEAD(zbud_eph_buddied_list); +static LIST_HEAD(zbud_pers_buddied_list); +static LIST_HEAD(zbud_eph_zombie_list); +static LIST_HEAD(zbud_pers_zombie_list); + +/* + * Given a struct page, transform it to a zbudpage so that it can be + * used by zbud and initialize fields as necessary. + */ +static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph) +{ + struct zbudpage *zbudpage = (struct zbudpage *)page; + + BUG_ON(page == NULL); + INIT_LIST_HEAD(&zbudpage->budlist); + INIT_LIST_HEAD(&zbudpage->lru); + zbudpage->zbud0_size = 0; + zbudpage->zbud1_size = 0; + zbudpage->unevictable = 0; + if (eph) + zbud_eph_pageframes++; + else + zbud_pers_pageframes++; + return zbudpage; +} + +/* "Transform" a zbudpage back to a struct page suitable to free. */ +static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage, + bool eph) +{ + struct page *page = (struct page *)zbudpage; + + BUG_ON(!list_empty(&zbudpage->budlist)); + BUG_ON(!list_empty(&zbudpage->lru)); + BUG_ON(zbudpage->zbud0_size != 0); + BUG_ON(zbudpage->zbud1_size != 0); + BUG_ON(!PageLocked(page)); + BUG_ON(zbudpage->unevictable != 0); + BUG_ON(zbudpage_is_dying(zbudpage)); + BUG_ON(zbudpage_is_zombie(zbudpage)); + if (eph) + zbud_eph_pageframes--; + else + zbud_pers_pageframes--; + zbudpage_spin_unlock(zbudpage); + reset_page_mapcount(page); + init_page_count(page); + page->index = 0; + return page; +} + +/* Mark a zbud as unused and do accounting */ +static inline void zbud_unuse_zbud(struct zbudpage *zbudpage, + int budnum, bool eph) +{ + unsigned size; + + BUG_ON(!zbudpage_is_locked(zbudpage)); + if (budnum == 0) { + size = zbudpage->zbud0_size; + zbudpage->zbud0_size = 0; + } else { + size = zbudpage->zbud1_size; + zbudpage->zbud1_size = 0; + } + if (eph) { + zbud_eph_zbytes -= size; + zbud_eph_zpages--; + } else { + zbud_pers_zbytes -= size; + zbud_pers_zpages--; + } +} + +/* + * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer + * to some data, set up the zbud appropriately including data copying + * and accounting. Note that if cdata is NULL, the data copying is + * skipped. (This is useful for lazy writes such as for RAMster.) + */ +static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th, + bool eph, void *cdata, + unsigned budnum, unsigned size) +{ + char *to; + void *zbpg; + struct tmem_handle *to_th; + unsigned nchunks = zbud_size_to_chunks(size); + + BUG_ON(!zbudpage_is_locked(zbudpage)); + zbpg = kmap_zbudpage_atomic(zbudpage); + to = zbud_data(zbpg, budnum, size); + to_th = (struct tmem_handle *)to; + to_th->index = th->index; + to_th->oid = th->oid; + to_th->pool_id = th->pool_id; + to_th->client_id = th->client_id; + to += sizeof(struct tmem_handle); + if (cdata != NULL) + memcpy(to, cdata, size - sizeof(struct tmem_handle)); + kunmap_zbudpage_atomic(zbpg); + if (budnum == 0) + zbudpage->zbud0_size = size; + else + zbudpage->zbud1_size = size; + if (eph) { + zbud_eph_cumul_chunk_counts[nchunks]++; + zbud_eph_zpages++; + zbud_eph_cumul_zpages++; + zbud_eph_zbytes += size; + zbud_eph_cumul_zbytes += size; + } else { + zbud_pers_cumul_chunk_counts[nchunks]++; + zbud_pers_zpages++; + zbud_pers_cumul_zpages++; + zbud_pers_zbytes += size; + zbud_pers_cumul_zbytes += size; + } +} + +/* + * Given a locked dying zbudpage, read out the tmem handles from the data, + * unlock the page, then use the handles to tell tmem to flush out its + * references + */ +static void zbud_evict_tmem(struct zbudpage *zbudpage) +{ + int i, j; + uint32_t pool_id[2], client_id[2]; + uint32_t index[2]; + struct tmem_oid oid[2]; + struct tmem_pool *pool; + void *zbpg; + struct tmem_handle *th; + unsigned size; + + /* read out the tmem handles from the data and set aside */ + zbpg = kmap_zbudpage_atomic(zbudpage); + for (i = 0, j = 0; i < 2; i++) { + size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size; + if (size) { + th = (struct tmem_handle *)zbud_data(zbpg, i, size); + client_id[j] = th->client_id; + pool_id[j] = th->pool_id; + oid[j] = th->oid; + index[j] = th->index; + j++; + zbud_unuse_zbud(zbudpage, i, true); + } + } + kunmap_zbudpage_atomic(zbpg); + zbudpage_spin_unlock(zbudpage); + /* zbudpage is now an unlocked dying... tell tmem to flush pointers */ + for (i = 0; i < j; i++) { + pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); + if (pool != NULL) { + tmem_flush_page(pool, &oid[i], index[i]); + zcache_put_pool(pool); + } + } +} + +/* + * Externally callable zbud handling routines. + */ + +/* + * Return the maximum size compressed page that can be stored (secretly + * setting aside space for the tmem handle. + */ +unsigned int zbud_max_buddy_size(void) +{ + return zbud_max_size() - sizeof(struct tmem_handle); +} + +/* + * Given a zbud reference, free the corresponding zbud from all lists, + * mark it as unused, do accounting, and if the freeing of the zbud + * frees up an entire pageframe, return it to the caller (else NULL). + */ +struct page *zbud_free_and_delist(struct zbudref *zref, bool eph, + unsigned int *zsize, unsigned int *zpages) +{ + unsigned long budnum = zbudref_budnum(zref); + struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); + struct page *page = NULL; + unsigned chunks, bud_size, other_bud_size; + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + struct zbud_unbuddied *unbud = + eph ? zbud_eph_unbuddied : zbud_pers_unbuddied; + + + spin_lock(lists_lock); + zbudpage_spin_lock(zbudpage); + if (zbudpage_is_dying(zbudpage)) { + /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); + *zpages = 0; + *zsize = 0; + goto out; + } + if (budnum == 0) { + bud_size = zbudpage->zbud0_size; + other_bud_size = zbudpage->zbud1_size; + } else { + bud_size = zbudpage->zbud1_size; + other_bud_size = zbudpage->zbud0_size; + } + *zsize = bud_size - sizeof(struct tmem_handle); + *zpages = 1; + zbud_unuse_zbud(zbudpage, budnum, eph); + if (other_bud_size == 0) { /* was unbuddied: unlist and free */ + chunks = zbud_size_to_chunks(bud_size) ; + if (zbudpage_is_zombie(zbudpage)) { + if (eph) + zbud_pers_zombie_count = + atomic_dec_return(&zbud_eph_zombie_atomic); + else + zbud_pers_zombie_count = + atomic_dec_return(&zbud_pers_zombie_atomic); + zbudpage_clear_zombie(zbudpage); + } else { + BUG_ON(list_empty(&unbud[chunks].list)); + list_del_init(&zbudpage->budlist); + unbud[chunks].count--; + } + list_del_init(&zbudpage->lru); + spin_unlock(lists_lock); + if (eph) + zbud_eph_unbuddied_count--; + else + zbud_pers_unbuddied_count--; + page = zbud_unuse_zbudpage(zbudpage, eph); + } else { /* was buddied: move remaining buddy to unbuddied list */ + chunks = zbud_size_to_chunks(other_bud_size) ; + if (!zbudpage_is_zombie(zbudpage)) { + list_del_init(&zbudpage->budlist); + list_add_tail(&zbudpage->budlist, &unbud[chunks].list); + unbud[chunks].count++; + } + if (eph) { + zbud_eph_buddied_count--; + zbud_eph_unbuddied_count++; + } else { + zbud_pers_unbuddied_count++; + zbud_pers_buddied_count--; + } + /* don't mess with lru, no need to move it */ + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); + } +out: + return page; +} + +/* + * Given a tmem handle, and a kmapped pointer to compressed data of + * the given size, try to find an unbuddied zbudpage in which to + * create a zbud. If found, put it there, mark the zbudpage unevictable, + * and return a zbudref to it. Else return NULL. + */ +struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph, + void *cdata, unsigned size) +{ + struct zbudpage *zbudpage = NULL, *zbudpage2; + unsigned long budnum = 0UL; + unsigned nchunks; + int i, found_good_buddy = 0; + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + struct zbud_unbuddied *unbud = + eph ? zbud_eph_unbuddied : zbud_pers_unbuddied; + + size += sizeof(struct tmem_handle); + nchunks = zbud_size_to_chunks(size); + for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { + spin_lock(lists_lock); + if (!list_empty(&unbud[i].list)) { + list_for_each_entry_safe(zbudpage, zbudpage2, + &unbud[i].list, budlist) { + if (zbudpage_spin_trylock(zbudpage)) { + found_good_buddy = i; + goto found_unbuddied; + } + } + } + spin_unlock(lists_lock); + } + zbudpage = NULL; + goto out; + +found_unbuddied: + BUG_ON(!zbudpage_is_locked(zbudpage)); + BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0))); + if (zbudpage->zbud0_size == 0) + budnum = 0UL; + else if (zbudpage->zbud1_size == 0) + budnum = 1UL; + list_del_init(&zbudpage->budlist); + if (eph) { + list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list); + unbud[found_good_buddy].count--; + zbud_eph_unbuddied_count--; + zbud_eph_buddied_count++; + /* "promote" raw zbudpage to most-recently-used */ + list_del_init(&zbudpage->lru); + list_add_tail(&zbudpage->lru, &zbud_eph_lru_list); + } else { + list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list); + unbud[found_good_buddy].count--; + zbud_pers_unbuddied_count--; + zbud_pers_buddied_count++; + /* "promote" raw zbudpage to most-recently-used */ + list_del_init(&zbudpage->lru); + list_add_tail(&zbudpage->lru, &zbud_pers_lru_list); + } + zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size); + zbudpage->unevictable++; + BUG_ON(zbudpage->unevictable == 3); + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); +out: + return zbudpage_to_zbudref(zbudpage, budnum); + +} + +/* + * Given a tmem handle, and a kmapped pointer to compressed data of + * the given size, and a newly allocated struct page, create an unevictable + * zbud in that new page and return a zbudref to it. + */ +struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph, + void *cdata, unsigned size, + struct page *newpage) +{ + struct zbudpage *zbudpage; + unsigned long budnum = 0; + unsigned nchunks; + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + struct zbud_unbuddied *unbud = + eph ? zbud_eph_unbuddied : zbud_pers_unbuddied; + +#if 0 + /* this may be worth it later to support decompress-in-place? */ + static unsigned long counter; + budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */ +#endif + + if (size > zbud_max_buddy_size()) + return NULL; + if (newpage == NULL) + return NULL; + + size += sizeof(struct tmem_handle); + nchunks = zbud_size_to_chunks(size) ; + spin_lock(lists_lock); + zbudpage = zbud_init_zbudpage(newpage, eph); + zbudpage_spin_lock(zbudpage); + list_add_tail(&zbudpage->budlist, &unbud[nchunks].list); + if (eph) { + list_add_tail(&zbudpage->lru, &zbud_eph_lru_list); + zbud_eph_unbuddied_count++; + } else { + list_add_tail(&zbudpage->lru, &zbud_pers_lru_list); + zbud_pers_unbuddied_count++; + } + unbud[nchunks].count++; + zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size); + zbudpage->unevictable++; + BUG_ON(zbudpage->unevictable == 3); + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); + return zbudpage_to_zbudref(zbudpage, budnum); +} + +/* + * Finish creation of a zbud by, assuming another zbud isn't being created + * in parallel, marking it evictable. + */ +void zbud_create_finish(struct zbudref *zref, bool eph) +{ + struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + + spin_lock(lists_lock); + zbudpage_spin_lock(zbudpage); + BUG_ON(zbudpage_is_dying(zbudpage)); + zbudpage->unevictable--; + BUG_ON((int)zbudpage->unevictable < 0); + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); +} + +/* + * Given a zbudref and a struct page, decompress the data from + * the zbud into the physical page represented by the struct page + * by upcalling to zcache_decompress + */ +int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph, + void (*decompress)(char *, unsigned int, char *)) +{ + struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); + unsigned long budnum = zbudref_budnum(zref); + void *zbpg; + char *to_va, *from_va; + unsigned size; + int ret = -1; + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + + spin_lock(lists_lock); + zbudpage_spin_lock(zbudpage); + if (zbudpage_is_dying(zbudpage)) { + /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ + goto out; + } + zbpg = kmap_zbudpage_atomic(zbudpage); + to_va = kmap_atomic(data_page); + if (budnum == 0) + size = zbudpage->zbud0_size; + else + size = zbudpage->zbud1_size; + BUG_ON(size == 0 || size > zbud_max_size()); + from_va = zbud_data(zbpg, budnum, size); + from_va += sizeof(struct tmem_handle); + size -= sizeof(struct tmem_handle); + decompress(from_va, size, to_va); + kunmap_atomic(to_va); + kunmap_zbudpage_atomic(zbpg); + ret = 0; +out: + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); + return ret; +} + +/* + * Given a zbudref and a kernel pointer, copy the data from + * the zbud to the kernel pointer. + */ +int zbud_copy_from_zbud(char *to_va, struct zbudref *zref, + size_t *sizep, bool eph) +{ + struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); + unsigned long budnum = zbudref_budnum(zref); + void *zbpg; + char *from_va; + unsigned size; + int ret = -1; + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + + spin_lock(lists_lock); + zbudpage_spin_lock(zbudpage); + if (zbudpage_is_dying(zbudpage)) { + /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ + goto out; + } + zbpg = kmap_zbudpage_atomic(zbudpage); + if (budnum == 0) + size = zbudpage->zbud0_size; + else + size = zbudpage->zbud1_size; + BUG_ON(size == 0 || size > zbud_max_size()); + from_va = zbud_data(zbpg, budnum, size); + from_va += sizeof(struct tmem_handle); + size -= sizeof(struct tmem_handle); + *sizep = size; + memcpy(to_va, from_va, size); + + kunmap_zbudpage_atomic(zbpg); + ret = 0; +out: + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); + return ret; +} + +/* + * Given a zbudref and a kernel pointer, copy the data from + * the kernel pointer to the zbud. + */ +int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph) +{ + struct zbudpage *zbudpage = zbudref_to_zbudpage(zref); + unsigned long budnum = zbudref_budnum(zref); + void *zbpg; + char *to_va; + unsigned size; + int ret = -1; + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + + spin_lock(lists_lock); + zbudpage_spin_lock(zbudpage); + if (zbudpage_is_dying(zbudpage)) { + /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */ + goto out; + } + zbpg = kmap_zbudpage_atomic(zbudpage); + if (budnum == 0) + size = zbudpage->zbud0_size; + else + size = zbudpage->zbud1_size; + BUG_ON(size == 0 || size > zbud_max_size()); + to_va = zbud_data(zbpg, budnum, size); + to_va += sizeof(struct tmem_handle); + size -= sizeof(struct tmem_handle); + memcpy(to_va, from_va, size); + + kunmap_zbudpage_atomic(zbpg); + ret = 0; +out: + zbudpage_spin_unlock(zbudpage); + spin_unlock(lists_lock); + return ret; +} + +/* + * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure + * there are no references to it remaining, and return the now unused + * (and re-init'ed) struct page and the total amount of compressed + * data that was evicted. + */ +struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages) +{ + struct zbudpage *zbudpage = NULL, *zbudpage2; + struct zbud_unbuddied *unbud = zbud_eph_unbuddied; + struct page *page = NULL; + bool irqs_disabled = irqs_disabled(); + + /* + * Since this can be called indirectly from cleancache_put, which + * has interrupts disabled, as well as frontswap_put, which does not, + * we need to be able to handle both cases, even though it is ugly. + */ + if (irqs_disabled) + spin_lock(&zbud_eph_lists_lock); + else + spin_lock_bh(&zbud_eph_lists_lock); + *zsize = 0; + if (list_empty(&zbud_eph_lru_list)) + goto unlock_out; + list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) { + /* skip a locked zbudpage */ + if (unlikely(!zbudpage_spin_trylock(zbudpage))) + continue; + /* skip an unevictable zbudpage */ + if (unlikely(zbudpage->unevictable != 0)) { + zbudpage_spin_unlock(zbudpage); + continue; + } + /* got a locked evictable page */ + goto evict_page; + + } +unlock_out: + /* no unlocked evictable pages, give up */ + if (irqs_disabled) + spin_unlock(&zbud_eph_lists_lock); + else + spin_unlock_bh(&zbud_eph_lists_lock); + goto out; + +evict_page: + list_del_init(&zbudpage->budlist); + list_del_init(&zbudpage->lru); + zbudpage_set_dying(zbudpage); + /* + * the zbudpage is now "dying" and attempts to read, write, + * or delete data from it will be ignored + */ + if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) { + *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size - + (2 * sizeof(struct tmem_handle)); + *zpages = 2; + } else if (zbudpage->zbud0_size != 0) { + unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--; + *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle); + *zpages = 1; + } else if (zbudpage->zbud1_size != 0) { + unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--; + *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle); + *zpages = 1; + } else { + BUG(); + } + spin_unlock(&zbud_eph_lists_lock); + zbud_eph_evicted_pageframes++; + if (*zpages == 1) + zbud_eph_unbuddied_count--; + else + zbud_eph_buddied_count--; + zbud_evict_tmem(zbudpage); + zbudpage_spin_lock(zbudpage); + zbudpage_clear_dying(zbudpage); + page = zbud_unuse_zbudpage(zbudpage, true); + if (!irqs_disabled) + local_bh_enable(); +out: + return page; +} + +/* + * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it, + * read the tmem_handle(s) out of it into the passed array, and return the + * number of zbuds. Caller must perform necessary tmem functions and, + * indirectly, zbud functions to fetch any valid data and cause the + * now-zombified zbudpage to eventually be freed. We track the zombified + * zbudpage count so it is possible to observe if there is a leak. + FIXME: describe (ramster) case where data pointers are passed in for memcpy + */ +unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data, + unsigned int *zsize, bool eph) +{ + struct zbudpage *zbudpage = NULL, *zbudpag2; + struct tmem_handle *thfrom; + char *from_va; + void *zbpg; + unsigned size; + int ret = 0, i; + spinlock_t *lists_lock = + eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock; + struct list_head *lru_list = + eph ? &zbud_eph_lru_list : &zbud_pers_lru_list; + + spin_lock_bh(lists_lock); + if (list_empty(lru_list)) + goto out; + list_for_each_entry_safe(zbudpage, zbudpag2, lru_list, lru) { + /* skip a locked zbudpage */ + if (unlikely(!zbudpage_spin_trylock(zbudpage))) + continue; + /* skip an unevictable zbudpage */ + if (unlikely(zbudpage->unevictable != 0)) { + zbudpage_spin_unlock(zbudpage); + continue; + } + /* got a locked evictable page */ + goto zombify_page; + } + /* no unlocked evictable pages, give up */ + goto out; + +zombify_page: + /* got an unlocked evictable page, zombify it */ + list_del_init(&zbudpage->budlist); + zbudpage_set_zombie(zbudpage); + /* FIXME what accounting do I need to do here? */ + list_del_init(&zbudpage->lru); + if (eph) { + list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list); + zbud_eph_zombie_count = + atomic_inc_return(&zbud_eph_zombie_atomic); + } else { + list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list); + zbud_pers_zombie_count = + atomic_inc_return(&zbud_pers_zombie_atomic); + } + /* FIXME what accounting do I need to do here? */ + zbpg = kmap_zbudpage_atomic(zbudpage); + for (i = 0; i < 2; i++) { + size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size; + if (size) { + from_va = zbud_data(zbpg, i, size); + thfrom = (struct tmem_handle *)from_va; + from_va += sizeof(struct tmem_handle); + size -= sizeof(struct tmem_handle); + if (th != NULL) + th[ret] = *thfrom; + if (data != NULL) + memcpy(data[ret], from_va, size); + if (zsize != NULL) + *zsize++ = size; + ret++; + } + } + kunmap_zbudpage_atomic(zbpg); + zbudpage_spin_unlock(zbudpage); +out: + spin_unlock_bh(lists_lock); + return ret; +} + +void __init zbud_init(void) +{ + int i; + +#ifdef CONFIG_DEBUG_FS + zbud_debugfs_init(); +#endif + BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE)); + BUG_ON(sizeof(struct zbudpage) > sizeof(struct page)); + for (i = 0; i < NCHUNKS; i++) { + INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list); + INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list); + } +} |