From 6d0c48aede85e38316d0251564cab39cbc2422f6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Nov 2018 08:47:03 -0700 Subject: block: implement bio helper to add iter bvec pages to bio For an ITER_BVEC, we can just iterate the iov and add the pages to the bio directly. For now, we grab a reference to those pages, and release them normally on IO completion. This isn't really needed for the normal case of O_DIRECT from/to a file, but some of the more esoteric use cases (like splice(2)) will unconditionally put the pipe buffer pages when the buffers are released. Until we can manage that case properly, ITER_BVEC pages are treated like normal pages in terms of reference counting. Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 83a2dfa417ca..71a78d9fb8b7 100644 --- a/block/bio.c +++ b/block/bio.c @@ -836,6 +836,40 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); +static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +{ + const struct bio_vec *bv = iter->bvec; + unsigned int len; + size_t size; + + if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) + return -EINVAL; + + len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); + size = bio_add_page(bio, bv->bv_page, len, + bv->bv_offset + iter->iov_offset); + if (size == len) { + struct page *page; + int i; + + /* + * For the normal O_DIRECT case, we could skip grabbing this + * reference and then not have to put them again when IO + * completes. But this breaks some in-kernel users, like + * splicing to/from a loop device, where we release the pipe + * pages unconditionally. If we can fix that case, we can + * get rid of the get here and the need to call + * bio_release_pages() at IO completion time. + */ + mp_bvec_for_each_page(page, bv, i) + get_page(page); + iov_iter_advance(iter, size); + return 0; + } + + return -EINVAL; +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -884,23 +918,35 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) } /** - * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to - * @iter: iov iterator describing the region to be mapped + * @iter: iov iterator describing the region to be added + * + * This takes either an iterator pointing to user memory, or one pointing to + * kernel pages (BVEC iterator). If we're adding user pages, we pin them and + * map them into the kernel. On IO completion, the caller should put those + * pages. For now, when adding kernel pages, we still grab a reference to the + * page. This isn't strictly needed for the common case, but some call paths + * end up releasing pages from eg a pipe and we can't easily control these. + * See comment in __bio_iov_bvec_add_pages(). * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. * The function tries, but does not guarantee, to pin as many pages as - * fit into the bio, or are requested in *iter, whatever is smaller. - * If MM encounters an error pinning the requested pages, it stops. - * Error is returned only if 0 pages could be pinned. + * fit into the bio, or are requested in *iter, whatever is smaller. If + * MM encounters an error pinning the requested pages, it stops. Error + * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { + const bool is_bvec = iov_iter_is_bvec(iter); unsigned short orig_vcnt = bio->bi_vcnt; do { - int ret = __bio_iov_iter_get_pages(bio, iter); + int ret; + + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) return bio->bi_vcnt > orig_vcnt ? 0 : ret; -- cgit