diff options
Diffstat (limited to 'net/ceph/messenger.c')
-rw-r--r-- | net/ceph/messenger.c | 179 |
1 files changed, 87 insertions, 92 deletions
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b6ff4a1519a..05f357828a2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -96,7 +96,7 @@ struct workqueue_struct *ceph_msgr_wq; int ceph_msgr_init(void) { - ceph_msgr_wq = create_workqueue("ceph-msgr"); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (!ceph_msgr_wq) { pr_err("msgr_init failed to create workqueue\n"); return -ENOMEM; @@ -252,8 +252,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) { struct kvec iov = {buf, len}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; - return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; } /* @@ -264,13 +268,17 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, size_t kvlen, size_t len, int more) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; if (more) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - return kernel_sendmsg(sock, &msg, iov, kvlen, len); + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; } @@ -328,7 +336,6 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } - con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -847,6 +854,8 @@ static int write_partial_msg_pages(struct ceph_connection *con) (msg->pages || msg->pagelist || msg->bio || in_trail)) kunmap(page); + if (ret == -EAGAIN) + ret = 0; if (ret <= 0) goto out; @@ -1238,8 +1247,6 @@ static int process_connect(struct ceph_connection *con) con->auth_retry); if (con->auth_retry == 2) { con->error_msg = "connect authorization failure"; - reset_connection(con); - set_bit(CLOSED, &con->state); return -1; } con->auth_retry = 1; @@ -1705,14 +1712,6 @@ more: /* open the socket first? */ if (con->sock == NULL) { - /* - * if we were STANDBY and are reconnecting _this_ - * connection, bump connect_seq now. Always bump - * global_seq. - */ - if (test_and_clear_bit(STANDBY, &con->state)) - con->connect_seq++; - prepare_write_banner(msgr, con); prepare_write_connect(msgr, con, 1); prepare_read_banner(con); @@ -1737,16 +1736,12 @@ more_kvec: if (con->out_skip) { ret = write_partial_skip(con); if (ret <= 0) - goto done; - if (ret < 0) { - dout("try_write write_partial_skip err %d\n", ret); - goto done; - } + goto out; } if (con->out_kvec_left) { ret = write_partial_kvec(con); if (ret <= 0) - goto done; + goto out; } /* msg pages? */ @@ -1761,11 +1756,11 @@ more_kvec: if (ret == 1) goto more_kvec; /* we need to send the footer, too! */ if (ret == 0) - goto done; + goto out; if (ret < 0) { dout("try_write write_partial_msg_pages err %d\n", ret); - goto done; + goto out; } } @@ -1789,10 +1784,9 @@ do_next: /* Nothing to do! */ clear_bit(WRITE_PENDING, &con->state); dout("try_write nothing else to write.\n"); -done: ret = 0; out: - dout("try_write done on %p\n", con); + dout("try_write done on %p ret %d\n", con, ret); return ret; } @@ -1821,19 +1815,17 @@ more: dout("try_read connecting\n"); ret = read_partial_banner(con); if (ret <= 0) - goto done; - if (process_banner(con) < 0) { - ret = -1; goto out; - } + ret = process_banner(con); + if (ret < 0) + goto out; } ret = read_partial_connect(con); if (ret <= 0) - goto done; - if (process_connect(con) < 0) { - ret = -1; goto out; - } + ret = process_connect(con); + if (ret < 0) + goto out; goto more; } @@ -1848,7 +1840,7 @@ more: dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); ret = ceph_tcp_recvmsg(con->sock, buf, skip); if (ret <= 0) - goto done; + goto out; con->in_base_pos += ret; if (con->in_base_pos) goto more; @@ -1859,7 +1851,7 @@ more: */ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); if (ret <= 0) - goto done; + goto out; dout("try_read got tag %d\n", (int)con->in_tag); switch (con->in_tag) { case CEPH_MSGR_TAG_MSG: @@ -1870,7 +1862,7 @@ more: break; case CEPH_MSGR_TAG_CLOSE: set_bit(CLOSED, &con->state); /* fixme */ - goto done; + goto out; default: goto bad_tag; } @@ -1882,13 +1874,12 @@ more: case -EBADMSG: con->error_msg = "bad crc"; ret = -EIO; - goto out; + break; case -EIO: con->error_msg = "io error"; - goto out; - default: - goto done; + break; } + goto out; } if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; @@ -1898,15 +1889,13 @@ more: if (con->in_tag == CEPH_MSGR_TAG_ACK) { ret = read_partial_ack(con); if (ret <= 0) - goto done; + goto out; process_ack(con); goto more; } -done: - ret = 0; out: - dout("try_read done on %p\n", con); + dout("try_read done on %p ret %d\n", con, ret); return ret; bad_tag: @@ -1920,20 +1909,6 @@ bad_tag: /* * Atomically queue work on a connection. Bump @con reference to * avoid races with connection teardown. - * - * There is some trickery going on with QUEUED and BUSY because we - * only want a _single_ thread operating on each connection at any - * point in time, but we want to use all available CPUs. - * - * The worker thread only proceeds if it can atomically set BUSY. It - * clears QUEUED and does it's thing. When it thinks it's done, it - * clears BUSY, then rechecks QUEUED.. if it's set again, it loops - * (tries again to set BUSY). - * - * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we - * try to queue work. If that fails (work is already queued, or BUSY) - * we give up (work also already being done or is queued) but leave QUEUED - * set so that the worker thread will loop if necessary. */ static void queue_con(struct ceph_connection *con) { @@ -1948,11 +1923,7 @@ static void queue_con(struct ceph_connection *con) return; } - set_bit(QUEUED, &con->state); - if (test_bit(BUSY, &con->state)) { - dout("queue_con %p - already BUSY\n", con); - con->ops->put(con); - } else if (!queue_work(ceph_msgr_wq, &con->work.work)) { + if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) { dout("queue_con %p - already queued\n", con); con->ops->put(con); } else { @@ -1967,18 +1938,26 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); - int backoff = 0; - -more: - if (test_and_set_bit(BUSY, &con->state) != 0) { - dout("con_work %p BUSY already set\n", con); - goto out; - } - dout("con_work %p start, clearing QUEUED\n", con); - clear_bit(QUEUED, &con->state); mutex_lock(&con->mutex); + if (test_and_clear_bit(BACKOFF, &con->state)) { + dout("con_work %p backing off\n", con); + if (queue_delayed_work(ceph_msgr_wq, &con->work, + round_jiffies_relative(con->delay))) { + dout("con_work %p backoff %lu\n", con, con->delay); + mutex_unlock(&con->mutex); + return; + } else { + con->ops->put(con); + dout("con_work %p FAILED to back off %lu\n", con, + con->delay); + } + } + if (test_bit(STANDBY, &con->state)) { + dout("con_work %p STANDBY\n", con); + goto done; + } if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -1994,28 +1973,13 @@ more: try_read(con) < 0 || try_write(con) < 0) { mutex_unlock(&con->mutex); - backoff = 1; ceph_fault(con); /* error/fault path */ goto done_unlocked; } done: mutex_unlock(&con->mutex); - done_unlocked: - clear_bit(BUSY, &con->state); - dout("con->state=%lu\n", con->state); - if (test_bit(QUEUED, &con->state)) { - if (!backoff || test_bit(OPENING, &con->state)) { - dout("con_work %p QUEUED reset, looping\n", con); - goto more; - } - dout("con_work %p QUEUED reset, but just faulted\n", con); - clear_bit(QUEUED, &con->state); - } - dout("con_work %p done\n", con); - -out: con->ops->put(con); } @@ -2050,10 +2014,12 @@ static void ceph_fault(struct ceph_connection *con) /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); - /* If there are no messages in the queue, place the connection - * in a STANDBY state (i.e., don't try to reconnect just yet). */ - if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { - dout("fault setting STANDBY\n"); + /* If there are no messages queued or keepalive pending, place + * the connection in a STANDBY state */ + if (list_empty(&con->out_queue) && + !test_bit(KEEPALIVE_PENDING, &con->state)) { + dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); + clear_bit(WRITE_PENDING, &con->state); set_bit(STANDBY, &con->state); } else { /* retry after a delay. */ @@ -2061,11 +2027,24 @@ static void ceph_fault(struct ceph_connection *con) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; - dout("fault queueing %p delay %lu\n", con, con->delay); con->ops->get(con); if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay)) == 0) + round_jiffies_relative(con->delay))) { + dout("fault queued %p delay %lu\n", con, con->delay); + } else { con->ops->put(con); + dout("fault failed to queue %p delay %lu, backoff\n", + con, con->delay); + /* + * In many cases we see a socket state change + * while con_work is running and end up + * queuing (non-delayed) work, such that we + * can't backoff with a delay. Set a flag so + * that when con_work restarts we schedule the + * delay then. + */ + set_bit(BACKOFF, &con->state); + } } out_unlock: @@ -2136,6 +2115,19 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr) } EXPORT_SYMBOL(ceph_messenger_destroy); +static void clear_standby(struct ceph_connection *con) +{ + /* come back from STANDBY? */ + if (test_and_clear_bit(STANDBY, &con->state)) { + mutex_lock(&con->mutex); + dout("clear_standby %p and ++connect_seq\n", con); + con->connect_seq++; + WARN_ON(test_bit(WRITE_PENDING, &con->state)); + WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); + mutex_unlock(&con->mutex); + } +} + /* * Queue up an outgoing message on the given connection. */ @@ -2168,6 +2160,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ + clear_standby(con); if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); } @@ -2233,6 +2226,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) */ void ceph_con_keepalive(struct ceph_connection *con) { + dout("con_keepalive %p\n", con); + clear_standby(con); if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && test_and_set_bit(WRITE_PENDING, &con->state) == 0) queue_con(con); |