diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-03-19 09:43:06 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-03-19 09:43:06 -0700 |
commit | fc7f99cf36ebae853639dabb43bc2f0098c59aef (patch) | |
tree | 3ca7050397f515f91ef98f8b6293f9f7fd84ef02 /fs/ceph/msgpool.c | |
parent | 0a492fdef8aa241f6139e6455e852cc710ae8ed1 (diff) | |
parent | f1a3d57213fe264b4cf584e78bac36aaf9998729 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (205 commits)
ceph: update for write_inode API change
ceph: reset osd after relevant messages timed out
ceph: fix flush_dirty_caps race with caps migration
ceph: include migrating caps in issued set
ceph: fix osdmap decoding when pools include (removed) snaps
ceph: return EBADF if waiting for caps on closed file
ceph: set osd request message front length correctly
ceph: reset front len on return to msgpool; BUG on mismatched front iov
ceph: fix snaptrace decoding on cap migration between mds
ceph: use single osd op reply msg
ceph: reset bits on connection close
ceph: remove bogus mds forward warning
ceph: remove fragile __map_osds optimization
ceph: fix connection fault STANDBY check
ceph: invalidate_authorizer without con->mutex held
ceph: don't clobber write return value when using O_SYNC
ceph: fix client_request_forward decoding
ceph: drop messages on unregistered mds sessions; cleanup
ceph: fix comments, locking in destroy_inode
ceph: move dereference after NULL test
...
Fix trivial conflicts in Documentation/ioctl/ioctl-number.txt
Diffstat (limited to 'fs/ceph/msgpool.c')
-rw-r--r-- | fs/ceph/msgpool.c | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c new file mode 100644 index 00000000000..ca3b44a89f2 --- /dev/null +++ b/fs/ceph/msgpool.c @@ -0,0 +1,186 @@ +#include "ceph_debug.h" + +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/vmalloc.h> + +#include "msgpool.h" + +/* + * We use msg pools to preallocate memory for messages we expect to + * receive over the wire, to avoid getting ourselves into OOM + * conditions at unexpected times. We take use a few different + * strategies: + * + * - for request/response type interactions, we preallocate the + * memory needed for the response when we generate the request. + * + * - for messages we can receive at any time from the MDS, we preallocate + * a pool of messages we can re-use. + * + * - for writeback, we preallocate some number of messages to use for + * requests and their replies, so that we always make forward + * progress. + * + * The msgpool behaves like a mempool_t, but keeps preallocated + * ceph_msgs strung together on a list_head instead of using a pointer + * vector. This avoids vector reallocation when we adjust the number + * of preallocated items (which happens frequently). + */ + + +/* + * Allocate or release as necessary to meet our target pool size. + */ +static int __fill_msgpool(struct ceph_msgpool *pool) +{ + struct ceph_msg *msg; + + while (pool->num < pool->min) { + dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num, + pool->min); + spin_unlock(&pool->lock); + msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL); + spin_lock(&pool->lock); + if (IS_ERR(msg)) + return PTR_ERR(msg); + msg->pool = pool; + list_add(&msg->list_head, &pool->msgs); + pool->num++; + } + while (pool->num > pool->min) { + msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head); + dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num, + pool->min, msg); + list_del_init(&msg->list_head); + pool->num--; + ceph_msg_kfree(msg); + } + return 0; +} + +int ceph_msgpool_init(struct ceph_msgpool *pool, + int front_len, int min, bool blocking) +{ + int ret; + + dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min); + spin_lock_init(&pool->lock); + pool->front_len = front_len; + INIT_LIST_HEAD(&pool->msgs); + pool->num = 0; + pool->min = min; + pool->blocking = blocking; + init_waitqueue_head(&pool->wait); + + spin_lock(&pool->lock); + ret = __fill_msgpool(pool); + spin_unlock(&pool->lock); + return ret; +} + +void ceph_msgpool_destroy(struct ceph_msgpool *pool) +{ + dout("msgpool_destroy %p\n", pool); + spin_lock(&pool->lock); + pool->min = 0; + __fill_msgpool(pool); + spin_unlock(&pool->lock); +} + +int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) +{ + int ret; + + spin_lock(&pool->lock); + dout("msgpool_resv %p delta %d\n", pool, delta); + pool->min += delta; + ret = __fill_msgpool(pool); + spin_unlock(&pool->lock); + return ret; +} + +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len) +{ + wait_queue_t wait; + struct ceph_msg *msg; + + if (front_len && front_len > pool->front_len) { + pr_err("msgpool_get pool %p need front %d, pool size is %d\n", + pool, front_len, pool->front_len); + WARN_ON(1); + + /* try to alloc a fresh message */ + msg = ceph_msg_new(0, front_len, 0, 0, NULL); + if (!IS_ERR(msg)) + return msg; + } + + if (!front_len) + front_len = pool->front_len; + + if (pool->blocking) { + /* mempool_t behavior; first try to alloc */ + msg = ceph_msg_new(0, front_len, 0, 0, NULL); + if (!IS_ERR(msg)) + return msg; + } + + while (1) { + spin_lock(&pool->lock); + if (likely(pool->num)) { + msg = list_entry(pool->msgs.next, struct ceph_msg, + list_head); + list_del_init(&msg->list_head); + pool->num--; + dout("msgpool_get %p got %p, now %d/%d\n", pool, msg, + pool->num, pool->min); + spin_unlock(&pool->lock); + return msg; + } + pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num, + pool->min, pool->blocking ? "waiting" : "may fail"); + spin_unlock(&pool->lock); + + if (!pool->blocking) { + WARN_ON(1); + + /* maybe we can allocate it now? */ + msg = ceph_msg_new(0, front_len, 0, 0, NULL); + if (!IS_ERR(msg)) + return msg; + + pr_err("msgpool_get %p empty + alloc failed\n", pool); + return ERR_PTR(-ENOMEM); + } + + init_wait(&wait); + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + schedule(); + finish_wait(&pool->wait, &wait); + } +} + +void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) +{ + spin_lock(&pool->lock); + if (pool->num < pool->min) { + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); + + kref_set(&msg->kref, 1); /* retake a single ref */ + list_add(&msg->list_head, &pool->msgs); + pool->num++; + dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg, + pool->num, pool->min); + spin_unlock(&pool->lock); + wake_up(&pool->wait); + } else { + dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg, + pool->num, pool->min); + spin_unlock(&pool->lock); + ceph_msg_kfree(msg); + } +} |