summaryrefslogtreecommitdiffstats
path: root/fs/ceph/msgpool.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-03-19 09:43:06 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-03-19 09:43:06 -0700
commitfc7f99cf36ebae853639dabb43bc2f0098c59aef (patch)
tree3ca7050397f515f91ef98f8b6293f9f7fd84ef02 /fs/ceph/msgpool.c
parent0a492fdef8aa241f6139e6455e852cc710ae8ed1 (diff)
parentf1a3d57213fe264b4cf584e78bac36aaf9998729 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (205 commits) ceph: update for write_inode API change ceph: reset osd after relevant messages timed out ceph: fix flush_dirty_caps race with caps migration ceph: include migrating caps in issued set ceph: fix osdmap decoding when pools include (removed) snaps ceph: return EBADF if waiting for caps on closed file ceph: set osd request message front length correctly ceph: reset front len on return to msgpool; BUG on mismatched front iov ceph: fix snaptrace decoding on cap migration between mds ceph: use single osd op reply msg ceph: reset bits on connection close ceph: remove bogus mds forward warning ceph: remove fragile __map_osds optimization ceph: fix connection fault STANDBY check ceph: invalidate_authorizer without con->mutex held ceph: don't clobber write return value when using O_SYNC ceph: fix client_request_forward decoding ceph: drop messages on unregistered mds sessions; cleanup ceph: fix comments, locking in destroy_inode ceph: move dereference after NULL test ... Fix trivial conflicts in Documentation/ioctl/ioctl-number.txt
Diffstat (limited to 'fs/ceph/msgpool.c')
-rw-r--r--fs/ceph/msgpool.c186
1 files changed, 186 insertions, 0 deletions
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 00000000000..ca3b44a89f2
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "msgpool.h"
+
+/*
+ * We use msg pools to preallocate memory for messages we expect to
+ * receive over the wire, to avoid getting ourselves into OOM
+ * conditions at unexpected times. We take use a few different
+ * strategies:
+ *
+ * - for request/response type interactions, we preallocate the
+ * memory needed for the response when we generate the request.
+ *
+ * - for messages we can receive at any time from the MDS, we preallocate
+ * a pool of messages we can re-use.
+ *
+ * - for writeback, we preallocate some number of messages to use for
+ * requests and their replies, so that we always make forward
+ * progress.
+ *
+ * The msgpool behaves like a mempool_t, but keeps preallocated
+ * ceph_msgs strung together on a list_head instead of using a pointer
+ * vector. This avoids vector reallocation when we adjust the number
+ * of preallocated items (which happens frequently).
+ */
+
+
+/*
+ * Allocate or release as necessary to meet our target pool size.
+ */
+static int __fill_msgpool(struct ceph_msgpool *pool)
+{
+ struct ceph_msg *msg;
+
+ while (pool->num < pool->min) {
+ dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
+ pool->min);
+ spin_unlock(&pool->lock);
+ msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+ spin_lock(&pool->lock);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ msg->pool = pool;
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ }
+ while (pool->num > pool->min) {
+ msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
+ dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
+ pool->min, msg);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ ceph_msg_kfree(msg);
+ }
+ return 0;
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+ int front_len, int min, bool blocking)
+{
+ int ret;
+
+ dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
+ spin_lock_init(&pool->lock);
+ pool->front_len = front_len;
+ INIT_LIST_HEAD(&pool->msgs);
+ pool->num = 0;
+ pool->min = min;
+ pool->blocking = blocking;
+ init_waitqueue_head(&pool->wait);
+
+ spin_lock(&pool->lock);
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+ return ret;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+ dout("msgpool_destroy %p\n", pool);
+ spin_lock(&pool->lock);
+ pool->min = 0;
+ __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+}
+
+int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+{
+ int ret;
+
+ spin_lock(&pool->lock);
+ dout("msgpool_resv %p delta %d\n", pool, delta);
+ pool->min += delta;
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+ return ret;
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
+{
+ wait_queue_t wait;
+ struct ceph_msg *msg;
+
+ if (front_len && front_len > pool->front_len) {
+ pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
+ pool, front_len, pool->front_len);
+ WARN_ON(1);
+
+ /* try to alloc a fresh message */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+ }
+
+ if (!front_len)
+ front_len = pool->front_len;
+
+ if (pool->blocking) {
+ /* mempool_t behavior; first try to alloc */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+ }
+
+ while (1) {
+ spin_lock(&pool->lock);
+ if (likely(pool->num)) {
+ msg = list_entry(pool->msgs.next, struct ceph_msg,
+ list_head);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ return msg;
+ }
+ pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
+ pool->min, pool->blocking ? "waiting" : "may fail");
+ spin_unlock(&pool->lock);
+
+ if (!pool->blocking) {
+ WARN_ON(1);
+
+ /* maybe we can allocate it now? */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+
+ pr_err("msgpool_get %p empty + alloc failed\n", pool);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ init_wait(&wait);
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&pool->wait, &wait);
+ }
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+ spin_lock(&pool->lock);
+ if (pool->num < pool->min) {
+ /* reset msg front_len; user may have changed it */
+ msg->front.iov_len = pool->front_len;
+ msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+ kref_set(&msg->kref, 1); /* retake a single ref */
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ wake_up(&pool->wait);
+ } else {
+ dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ ceph_msg_kfree(msg);
+ }
+}