summaryrefslogtreecommitdiff
path: root/net/ceph/messenger_v1.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ceph/messenger_v1.c')
-rw-r--r--net/ceph/messenger_v1.c190
1 files changed, 131 insertions, 59 deletions
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index d1787d7d33ef..c9e002d96319 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -40,15 +40,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
int page_offset, size_t length)
{
- struct bio_vec bvec = {
- .bv_page = page,
- .bv_offset = page_offset,
- .bv_len = length
- };
+ struct bio_vec bvec;
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
int r;
BUG_ON(page_offset + length > PAGE_SIZE);
+ bvec_set_page(&bvec, page, length, page_offset);
iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length);
r = sock_recvmsg(sock, &msg, msg.msg_flags);
if (r == -EAGAIN)
@@ -78,18 +75,19 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
}
/*
- * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
+ * @more: MSG_MORE or 0.
*/
static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
int offset, size_t size, int more)
{
- ssize_t (*sendpage)(struct socket *sock, struct page *page,
- int offset, size_t size, int flags);
- int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | more,
+ };
+ struct bio_vec bvec;
int ret;
/*
- * sendpage cannot properly handle pages with page_count == 0,
+ * MSG_SPLICE_PAGES cannot properly handle pages with page_count == 0,
* we need to fall back to sendmsg if that's the case.
*
* Same goes for slab pages: skb_can_coalesce() allows
@@ -97,11 +95,12 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
* triggers one of hardened usercopy checks.
*/
if (sendpage_ok(page))
- sendpage = sock->ops->sendpage;
- else
- sendpage = sock_no_sendpage;
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+
+ bvec_set_page(&bvec, page, size, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
- ret = sendpage(sock, page, offset, size, flags);
+ ret = sock_sendmsg(sock, &msg);
if (ret == -EAGAIN)
ret = 0;
@@ -160,19 +159,19 @@ static size_t sizeof_footer(struct ceph_connection *con)
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
- /* Initialize data cursor */
+ /* Initialize data cursor if it's not a sparse read */
+ u64 len = msg->sparse_read_total ? : data_len;
- ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
+ ceph_msg_data_cursor_init(&msg->cursor, msg, len);
}
/*
* Prepare footer for currently outgoing message, and finish things
* off. Assumes out_kvec* are already valid.. we just add on to the end.
*/
-static void prepare_write_message_footer(struct ceph_connection *con)
+static void prepare_write_message_footer(struct ceph_connection *con,
+ struct ceph_msg *m)
{
- struct ceph_msg *m = con->out_msg;
-
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con);
@@ -192,9 +191,9 @@ static void prepare_write_message_footer(struct ceph_connection *con)
/*
* Prepare headers for the next outgoing message.
*/
-static void prepare_write_message(struct ceph_connection *con)
+static void prepare_write_message(struct ceph_connection *con,
+ struct ceph_msg *m)
{
- struct ceph_msg *m;
u32 crc;
con_out_kvec_reset(con);
@@ -210,9 +209,6 @@ static void prepare_write_message(struct ceph_connection *con)
&con->v1.out_temp_ack);
}
- ceph_con_get_out_msg(con);
- m = con->out_msg;
-
dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
m, con->out_seq, le16_to_cpu(m->hdr.type),
le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
@@ -231,31 +227,31 @@ static void prepare_write_message(struct ceph_connection *con)
/* fill in hdr crc and finalize hdr */
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
- con->out_msg->hdr.crc = cpu_to_le32(crc);
- memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr));
+ m->hdr.crc = cpu_to_le32(crc);
+ memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr));
/* fill in front and middle crc, footer */
crc = crc32c(0, m->front.iov_base, m->front.iov_len);
- con->out_msg->footer.front_crc = cpu_to_le32(crc);
+ m->footer.front_crc = cpu_to_le32(crc);
if (m->middle) {
crc = crc32c(0, m->middle->vec.iov_base,
m->middle->vec.iov_len);
- con->out_msg->footer.middle_crc = cpu_to_le32(crc);
+ m->footer.middle_crc = cpu_to_le32(crc);
} else
- con->out_msg->footer.middle_crc = 0;
+ m->footer.middle_crc = 0;
dout("%s front_crc %u middle_crc %u\n", __func__,
- le32_to_cpu(con->out_msg->footer.front_crc),
- le32_to_cpu(con->out_msg->footer.middle_crc));
- con->out_msg->footer.flags = 0;
+ le32_to_cpu(m->footer.front_crc),
+ le32_to_cpu(m->footer.middle_crc));
+ m->footer.flags = 0;
/* is there a data payload? */
- con->out_msg->footer.data_crc = 0;
+ m->footer.data_crc = 0;
if (m->data_length) {
- prepare_message_data(con->out_msg, m->data_length);
+ prepare_message_data(m, m->data_length);
con->v1.out_more = 1; /* data + footer will follow */
} else {
/* no, queue up footer too and be done */
- prepare_write_message_footer(con);
+ prepare_write_message_footer(con, m);
}
ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
@@ -462,12 +458,11 @@ out:
* 0 -> socket full, but more to do
* <0 -> error
*/
-static int write_partial_message_data(struct ceph_connection *con)
+static int write_partial_message_data(struct ceph_connection *con,
+ struct ceph_msg *msg)
{
- struct ceph_msg *msg = con->out_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
u32 crc;
dout("%s %p msg %p\n", __func__, con, msg);
@@ -496,10 +491,8 @@ static int write_partial_message_data(struct ceph_connection *con)
}
page = ceph_msg_data_next(cursor, &page_offset, &length);
- if (length == cursor->total_resid)
- more = MSG_MORE;
ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
- more);
+ MSG_MORE);
if (ret <= 0) {
if (do_datacrc)
msg->footer.data_crc = cpu_to_le32(crc);
@@ -519,7 +512,7 @@ static int write_partial_message_data(struct ceph_connection *con)
else
msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
con_out_kvec_reset(con);
- prepare_write_message_footer(con);
+ prepare_write_message_footer(con, msg);
return 1; /* must return > 0 to indicate success */
}
@@ -529,17 +522,14 @@ static int write_partial_message_data(struct ceph_connection *con)
*/
static int write_partial_skip(struct ceph_connection *con)
{
- int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
int ret;
dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
while (con->v1.out_skip > 0) {
size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
- if (size == con->v1.out_skip)
- more = MSG_MORE;
ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
- more);
+ MSG_MORE);
if (ret <= 0)
goto out;
con->v1.out_skip -= ret;
@@ -967,9 +957,9 @@ static void process_ack(struct ceph_connection *con)
prepare_read_tag(con);
}
-static int read_partial_message_section(struct ceph_connection *con,
- struct kvec *section,
- unsigned int sec_len, u32 *crc)
+static int read_partial_message_chunk(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
{
int ret, left;
@@ -985,11 +975,91 @@ static int read_partial_message_section(struct ceph_connection *con,
section->iov_len += ret;
}
if (section->iov_len == sec_len)
- *crc = crc32c(0, section->iov_base, section->iov_len);
+ *crc = crc32c(*crc, section->iov_base, section->iov_len);
return 1;
}
+static inline int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section,
+ unsigned int sec_len, u32 *crc)
+{
+ *crc = 0;
+ return read_partial_message_chunk(con, section, sec_len, crc);
+}
+
+static int read_partial_sparse_msg_extent(struct ceph_connection *con, u32 *crc)
+{
+ struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+ bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE);
+
+ if (do_bounce && unlikely(!con->bounce_page)) {
+ con->bounce_page = alloc_page(GFP_NOIO);
+ if (!con->bounce_page) {
+ pr_err("failed to allocate bounce page\n");
+ return -ENOMEM;
+ }
+ }
+
+ while (cursor->sr_resid > 0) {
+ struct page *page, *rpage;
+ size_t off, len;
+ int ret;
+
+ page = ceph_msg_data_next(cursor, &off, &len);
+ rpage = do_bounce ? con->bounce_page : page;
+
+ /* clamp to what remains in extent */
+ len = min_t(int, len, cursor->sr_resid);
+ ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
+ if (ret <= 0)
+ return ret;
+ *crc = ceph_crc32c_page(*crc, rpage, off, ret);
+ ceph_msg_data_advance(cursor, (size_t)ret);
+ cursor->sr_resid -= ret;
+ if (do_bounce)
+ memcpy_page(page, off, rpage, off, ret);
+ }
+ return 1;
+}
+
+static int read_partial_sparse_msg_data(struct ceph_connection *con)
+{
+ struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+ u32 crc = 0;
+ int ret = 1;
+
+ if (do_datacrc)
+ crc = con->in_data_crc;
+
+ while (cursor->total_resid) {
+ if (con->v1.in_sr_kvec.iov_base)
+ ret = read_partial_message_chunk(con,
+ &con->v1.in_sr_kvec,
+ con->v1.in_sr_len,
+ &crc);
+ else if (cursor->sr_resid > 0)
+ ret = read_partial_sparse_msg_extent(con, &crc);
+ if (ret <= 0)
+ break;
+
+ memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
+ ret = con->ops->sparse_read(con, cursor,
+ (char **)&con->v1.in_sr_kvec.iov_base);
+ if (ret <= 0) {
+ ret = ret ? ret : 1; /* must return > 0 to indicate success */
+ break;
+ }
+ con->v1.in_sr_len = ret;
+ }
+
+ if (do_datacrc)
+ con->in_data_crc = crc;
+
+ return ret;
+}
+
static int read_partial_msg_data(struct ceph_connection *con)
{
struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
@@ -1180,7 +1250,9 @@ static int read_partial_message(struct ceph_connection *con)
if (!m->num_data_items)
return -EIO;
- if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
+ if (m->sparse_read_total)
+ ret = read_partial_sparse_msg_data(con);
+ else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
ret = read_partial_msg_data_bounce(con);
else
ret = read_partial_msg_data(con);
@@ -1396,6 +1468,7 @@ bad_tag:
*/
int ceph_con_v1_try_write(struct ceph_connection *con)
{
+ struct ceph_msg *msg;
int ret = 1;
dout("try_write start %p state %d\n", con, con->state);
@@ -1442,14 +1515,15 @@ more:
}
/* msg pages? */
- if (con->out_msg) {
+ msg = con->out_msg;
+ if (msg) {
if (con->v1.out_msg_done) {
- ceph_msg_put(con->out_msg);
+ ceph_msg_put(msg);
con->out_msg = NULL; /* we're done with this one */
goto do_next;
}
- ret = write_partial_message_data(con);
+ ret = write_partial_message_data(con, msg);
if (ret == 1)
goto more; /* we need to send the footer, too! */
if (ret == 0)
@@ -1469,8 +1543,8 @@ do_next:
goto more;
}
/* is anything else pending? */
- if (!list_empty(&con->out_queue)) {
- prepare_write_message(con);
+ if ((msg = ceph_con_get_out_msg(con)) != NULL) {
+ prepare_write_message(con, msg);
goto more;
}
if (con->in_seq > con->in_seq_acked) {
@@ -1488,10 +1562,8 @@ out:
return ret;
}
-void ceph_con_v1_revoke(struct ceph_connection *con)
+void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg)
{
- struct ceph_msg *msg = con->out_msg;
-
WARN_ON(con->v1.out_skip);
/* footer */
if (con->v1.out_msg_done) {