diff options
Diffstat (limited to 'net/rds/message.c')
| -rw-r--r-- | net/rds/message.c | 281 |
1 files changed, 199 insertions, 82 deletions
diff --git a/net/rds/message.c b/net/rds/message.c index aba232f9f308..199a899a43e9 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,6 +33,9 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/errqueue.h> #include "rds.h" @@ -41,49 +44,137 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +[RDS_EXTHDR_NPATHS] = sizeof(__be16), +[RDS_EXTHDR_GEN_NUM] = sizeof(__be32), }; - void rds_message_addref(struct rds_message *rm) { - rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); - atomic_inc(&rm->m_refcount); + rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); + refcount_inc(&rm->m_refcount); } EXPORT_SYMBOL_GPL(rds_message_addref); +static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie) +{ + struct rds_zcopy_cookies *ck = &info->zcookies; + int ncookies = ck->num; + + if (ncookies == RDS_MAX_ZCOOKIES) + return false; + ck->cookies[ncookies] = cookie; + ck->num = ++ncookies; + return true; +} + +static struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif) +{ + return container_of(znotif, struct rds_msg_zcopy_info, znotif); +} + +void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q) +{ + unsigned long flags; + LIST_HEAD(copy); + struct rds_msg_zcopy_info *info, *tmp; + + spin_lock_irqsave(&q->lock, flags); + list_splice(&q->zcookie_head, ©); + INIT_LIST_HEAD(&q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); + + list_for_each_entry_safe(info, tmp, ©, rs_zcookie_next) { + list_del(&info->rs_zcookie_next); + kfree(info); + } +} + +static void rds_rm_zerocopy_callback(struct rds_sock *rs, + struct rds_znotifier *znotif) +{ + struct rds_msg_zcopy_info *info; + struct rds_msg_zcopy_queue *q; + u32 cookie = znotif->z_cookie; + struct rds_zcopy_cookies *ck; + struct list_head *head; + unsigned long flags; + + mm_unaccount_pinned_pages(&znotif->z_mmp); + q = &rs->rs_zcookie_queue; + spin_lock_irqsave(&q->lock, flags); + head = &q->zcookie_head; + if (!list_empty(head)) { + info = list_first_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (rds_zcookie_add(info, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + kfree(rds_info_from_znotifier(znotif)); + /* caller invokes rds_wake_sk_sleep() */ + return; + } + } + + info = rds_info_from_znotifier(znotif); + ck = &info->zcookies; + memset(ck, 0, sizeof(*ck)); + WARN_ON(!rds_zcookie_add(info, cookie)); + list_add_tail(&info->rs_zcookie_next, &q->zcookie_head); + + spin_unlock_irqrestore(&q->lock, flags); + /* caller invokes rds_wake_sk_sleep() */ +} + /* * This relies on dma_map_sg() not touching sg[].page during merging. */ static void rds_message_purge(struct rds_message *rm) { - unsigned long i; + unsigned long i, flags; + bool zcopy = false; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (rm->m_rs) { + struct rds_sock *rs = rm->m_rs; + + if (rm->data.op_mmp_znotifier) { + zcopy = true; + rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rds_wake_sk_sleep(rs); + rm->data.op_mmp_znotifier = NULL; + } + sock_put(rds_rs_to_sk(rs)); + rm->m_rs = NULL; + } + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + for (i = 0; i < rm->data.op_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->data.op_sg[i])); + if (!zcopy) + __free_page(sg_page(&rm->data.op_sg[i])); + else + put_page(sg_page(&rm->data.op_sg[i])); } rm->data.op_nents = 0; if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); if (rm->rdma.op_rdma_mr) - rds_mr_put(rm->rdma.op_rdma_mr); + kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final); if (rm->atomic.op_active) rds_atomic_free_op(&rm->atomic); if (rm->atomic.op_rdma_mr) - rds_mr_put(rm->atomic.op_rdma_mr); + kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final); } void rds_message_put(struct rds_message *rm) { - rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); - WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); - if (atomic_dec_and_test(&rm->m_refcount)) { + rdsdebug("put rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); + WARN(!refcount_read(&rm->m_refcount), "danger refcount zero on %p\n", rm); + if (refcount_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_conn_item)); rds_message_purge(rm); @@ -204,7 +295,7 @@ struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) rm->m_used_sgs = 0; rm->m_total_sgs = extra_len / sizeof(struct scatterlist); - atomic_set(&rm->m_refcount, 1); + refcount_set(&rm->m_refcount, 1); INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); spin_lock_init(&rm->m_rs_lock); @@ -222,11 +313,16 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; struct scatterlist *sg_ret; - WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); - WARN_ON(!nents); + if (nents <= 0) { + pr_warn("rds: alloc sgs failed! nents <= 0\n"); + return ERR_PTR(-EINVAL); + } - if (rm->m_used_sgs + nents > rm->m_total_sgs) - return NULL; + if (rm->m_used_sgs + nents > rm->m_total_sgs) { + pr_warn("rds: alloc sgs failed! total %d used %d nents %d\n", + rm->m_total_sgs, rm->m_used_sgs, nents); + return ERR_PTR(-ENOMEM); + } sg_ret = &sg_first[rm->m_used_sgs]; sg_init_table(sg_ret, nents); @@ -239,7 +335,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in { struct rds_message *rm; unsigned int i; - int num_sgs = ceil(total_len, PAGE_SIZE); + int num_sgs = DIV_ROUND_UP(total_len, PAGE_SIZE); int extra_bytes = num_sgs * sizeof(struct scatterlist); rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); @@ -248,91 +344,128 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_nents = DIV_ROUND_UP(total_len, PAGE_SIZE); rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); - if (!rm->data.op_sg) { + if (IS_ERR(rm->data.op_sg)) { + void *err = ERR_CAST(rm->data.op_sg); rds_message_put(rm); - return ERR_PTR(-ENOMEM); + return err; } for (i = 0; i < rm->data.op_nents; ++i) { sg_set_page(&rm->data.op_sg[i], - virt_to_page(page_addrs[i]), + virt_to_page((void *)page_addrs[i]), PAGE_SIZE, 0); } return rm; } -int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, - size_t total_len) +static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) { - unsigned long to_copy; - unsigned long iov_off; - unsigned long sg_off; - struct iovec *iov; struct scatterlist *sg; int ret = 0; + int length = iov_iter_count(from); + struct rds_msg_zcopy_info *info; - rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); + rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); /* * now allocate and copy in the data payload. */ sg = rm->data.op_sg; - iov = first_iov; - iov_off = 0; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + INIT_LIST_HEAD(&info->rs_zcookie_next); + rm->data.op_mmp_znotifier = &info->znotif; + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages2(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; + goto err; + } + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; +err: + kfree(info); + rm->data.op_mmp_znotifier = NULL; + return ret; +} + +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) +{ + unsigned long to_copy, nbytes; + unsigned long sg_off; + struct scatterlist *sg; + int ret = 0; + + rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); + + /* now allocate and copy in the data payload. */ + sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ - while (total_len) { + if (zcopy) + return rds_message_zcopy_from_user(rm, from); + + while (iov_iter_count(from)) { if (!sg_page(sg)) { - ret = rds_page_remainder_alloc(sg, total_len, + ret = rds_page_remainder_alloc(sg, iov_iter_count(from), GFP_HIGHUSER); if (ret) - goto out; + return ret; rm->data.op_nents++; sg_off = 0; } - while (iov_off == iov->iov_len) { - iov_off = 0; - iov++; - } - - to_copy = min(iov->iov_len - iov_off, sg->length - sg_off); - to_copy = min_t(size_t, to_copy, total_len); + to_copy = min_t(unsigned long, iov_iter_count(from), + sg->length - sg_off); - rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to " - "sg [%p, %u, %u] + %lu\n", - to_copy, iov->iov_base, iov->iov_len, iov_off, - (void *)sg_page(sg), sg->offset, sg->length, sg_off); + rds_stats_add(s_copy_from_user, to_copy); + nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off, + to_copy, from); + if (nbytes != to_copy) + return -EFAULT; - ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off, - iov->iov_base + iov_off, - to_copy); - if (ret) - goto out; - - iov_off += to_copy; - total_len -= to_copy; sg_off += to_copy; if (sg_off == sg->length) sg++; } -out: return ret; } -int rds_message_inc_copy_to_user(struct rds_incoming *inc, - struct iovec *first_iov, size_t size) +int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) { struct rds_message *rm; - struct iovec *iov; struct scatterlist *sg; unsigned long to_copy; - unsigned long iov_off; unsigned long vec_off; int copied; int ret; @@ -341,36 +474,21 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, rm = container_of(inc, struct rds_message, m_inc); len = be32_to_cpu(rm->m_inc.i_hdr.h_len); - iov = first_iov; - iov_off = 0; sg = rm->data.op_sg; vec_off = 0; copied = 0; - while (copied < size && copied < len) { - while (iov_off == iov->iov_len) { - iov_off = 0; - iov++; - } - - to_copy = min(iov->iov_len - iov_off, sg->length - vec_off); - to_copy = min_t(size_t, to_copy, size - copied); + while (iov_iter_count(to) && copied < len) { + to_copy = min_t(unsigned long, iov_iter_count(to), + sg->length - vec_off); to_copy = min_t(unsigned long, to_copy, len - copied); - rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to " - "sg [%p, %u, %u] + %lu\n", - to_copy, iov->iov_base, iov->iov_len, iov_off, - sg_page(sg), sg->offset, sg->length, vec_off); - - ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off, - iov->iov_base + iov_off, - to_copy); - if (ret) { - copied = ret; - break; - } + rds_stats_add(s_copy_to_user, to_copy); + ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off, + to_copy, to); + if (ret != to_copy) + return -EFAULT; - iov_off += to_copy; vec_off += to_copy; copied += to_copy; @@ -399,4 +517,3 @@ void rds_message_unmapped(struct rds_message *rm) wake_up_interruptible(&rm->m_flush_wait); } EXPORT_SYMBOL_GPL(rds_message_unmapped); - |
