Diffstat (limited to 'drivers/block/drbd/drbd_receiver.c')
| -rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 1028 |
1 file changed, 507 insertions(+), 521 deletions(-)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7e95e6380fb..3de919b6f0e1 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* drbd_receiver.c @@ -7,19 +8,6 @@ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. - drbd is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - drbd is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with drbd; see the file COPYING. If not, write to - the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ @@ -39,18 +27,19 @@ #include <uapi/linux/sched/types.h> #include <linux/sched/signal.h> #include <linux/pkt_sched.h> -#define __KERNEL_SYSCALLS__ #include <linux/unistd.h> #include <linux/vmalloc.h> #include <linux/random.h> #include <linux/string.h> #include <linux/scatterlist.h> +#include <linux/part_stat.h> +#include <linux/mempool.h> #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" #include "drbd_vli.h" -#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) +#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES) struct packet_info { enum drbd_packet cmd; @@ -75,185 +64,36 @@ static int e_end_block(struct drbd_work *, int); #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) -/* - * some helper functions to deal with single linked page lists, - * page->private being our "next" pointer. - */ - -/* If at least n pages are linked at head, get n pages off. - * Otherwise, don't modify head, and return NULL. - * Locking is the responsibility of the caller. - */ -static struct page *page_chain_del(struct page **head, int n) -{ - struct page *page; - struct page *tmp; - - BUG_ON(!n); - BUG_ON(!head); - - page = *head; - - if (!page) - return NULL; - - while (page) { - tmp = page_chain_next(page); - if (--n == 0) - break; /* found sufficient pages */ - if (tmp == NULL) - /* insufficient pages, don't use any of them. */ - return NULL; - page = tmp; - } - - /* add end of list marker for the returned list */ - set_page_private(page, 0); - /* actual return value, and adjustment of head */ - page = *head; - *head = tmp; - return page; -} - -/* may be used outside of locks to find the tail of a (usually short) - * "private" page chain, before adding it back to a global chain head - * with page_chain_add() under a spinlock. 
*/ -static struct page *page_chain_tail(struct page *page, int *len) -{ - struct page *tmp; - int i = 1; - while ((tmp = page_chain_next(page))) - ++i, page = tmp; - if (len) - *len = i; - return page; -} - -static int page_chain_free(struct page *page) -{ - struct page *tmp; - int i = 0; - page_chain_for_each_safe(page, tmp) { - put_page(page); - ++i; - } - return i; -} - -static void page_chain_add(struct page **head, - struct page *chain_first, struct page *chain_last) -{ -#if 1 - struct page *tmp; - tmp = page_chain_tail(chain_first, NULL); - BUG_ON(tmp != chain_last); -#endif - - /* add chain to head */ - set_page_private(chain_last, (unsigned long)*head); - *head = chain_first; -} - -static struct page *__drbd_alloc_pages(struct drbd_device *device, - unsigned int number) +static struct page *__drbd_alloc_pages(unsigned int number) { struct page *page = NULL; struct page *tmp = NULL; unsigned int i = 0; - /* Yes, testing drbd_pp_vacant outside the lock is racy. - * So what. It saves a spin_lock. */ - if (drbd_pp_vacant >= number) { - spin_lock(&drbd_pp_lock); - page = page_chain_del(&drbd_pp_pool, number); - if (page) - drbd_pp_vacant -= number; - spin_unlock(&drbd_pp_lock); - if (page) - return page; - } - /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ for (i = 0; i < number; i++) { - tmp = alloc_page(GFP_TRY); + tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY); if (!tmp) - break; + goto fail; set_page_private(tmp, (unsigned long)page); page = tmp; } - - if (i == number) - return page; - - /* Not enough pages immediately available this time. - * No need to jump around here, drbd_alloc_pages will retry this - * function "soon". */ - if (page) { - tmp = page_chain_tail(page, NULL); - spin_lock(&drbd_pp_lock); - page_chain_add(&drbd_pp_pool, page, tmp); - drbd_pp_vacant += i; - spin_unlock(&drbd_pp_lock); + return page; +fail: + page_chain_for_each_safe(page, tmp) { + set_page_private(page, 0); + mempool_free(page, &drbd_buffer_page_pool); } return NULL; } -static void reclaim_finished_net_peer_reqs(struct drbd_device *device, - struct list_head *to_be_freed) -{ - struct drbd_peer_request *peer_req, *tmp; - - /* The EEs are always appended to the end of the list. Since - they are sent in order over the wire, they have to finish - in order. As soon as we see the first not finished we can - stop to examine the list... 
*/ - - list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { - if (drbd_peer_req_has_active_page(peer_req)) - break; - list_move(&peer_req->w.list, to_be_freed); - } -} - -static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) -{ - LIST_HEAD(reclaimed); - struct drbd_peer_request *peer_req, *t; - - spin_lock_irq(&device->resource->req_lock); - reclaim_finished_net_peer_reqs(device, &reclaimed); - spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(device, peer_req); -} - -static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) -{ - struct drbd_peer_device *peer_device; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - if (!atomic_read(&device->pp_in_use_by_net)) - continue; - - kref_get(&device->kref); - rcu_read_unlock(); - drbd_reclaim_net_peer_reqs(device); - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - rcu_read_unlock(); -} - /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) - * @device: DRBD device. - * @number: number of pages requested - * @retry: whether to retry, if not enough pages are available right now + * @peer_device: DRBD device. + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now * * Tries to allocate number pages, first from our own page pool, then from * the kernel. @@ -273,9 +113,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int bool retry) { struct drbd_device *device = peer_device->device; - struct page *page = NULL; + struct page *page; struct net_conf *nc; - DEFINE_WAIT(wait); unsigned int mxb; rcu_read_lock(); @@ -283,37 +122,9 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int mxb = nc ? nc->max_buffers : 1000000; rcu_read_unlock(); - if (atomic_read(&device->pp_in_use) < mxb) - page = __drbd_alloc_pages(device, number); - - /* Try to keep the fast path fast, but occasionally we need - * to reclaim the pages we lended to the network stack. */ - if (page && atomic_read(&device->pp_in_use_by_net) > 512) - drbd_reclaim_net_peer_reqs(device); - - while (page == NULL) { - prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - - drbd_reclaim_net_peer_reqs(device); - - if (atomic_read(&device->pp_in_use) < mxb) { - page = __drbd_alloc_pages(device, number); - if (page) - break; - } - - if (!retry) - break; - - if (signal_pending(current)) { - drbd_warn(device, "drbd_alloc_pages interrupted!\n"); - break; - } - - if (schedule_timeout(HZ/10) == 0) - mxb = UINT_MAX; - } - finish_wait(&drbd_pp_wait, &wait); + if (atomic_read(&device->pp_in_use) >= mxb) + schedule_timeout_interruptible(HZ / 10); + page = __drbd_alloc_pages(number); if (page) atomic_add(number, &device->pp_in_use); @@ -324,29 +135,25 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int * Is also used from inside an other spin_lock_irq(&resource->req_lock); * Either links the page chain back to the global pool, * or returns all pages to the system. */ -static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) +static void drbd_free_pages(struct drbd_device *device, struct page *page) { - atomic_t *a = is_net ? 
&device->pp_in_use_by_net : &device->pp_in_use; - int i; + struct page *tmp; + int i = 0; if (page == NULL) return; - if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) - i = page_chain_free(page); - else { - struct page *tmp; - tmp = page_chain_tail(page, &i); - spin_lock(&drbd_pp_lock); - page_chain_add(&drbd_pp_pool, page, tmp); - drbd_pp_vacant += i; - spin_unlock(&drbd_pp_lock); - } - i = atomic_sub_return(i, a); + page_chain_for_each_safe(page, tmp) { + set_page_private(page, 0); + if (page_count(page) == 1) + mempool_free(page, &drbd_buffer_page_pool); + else + put_page(page); + i++; + } + i = atomic_sub_return(i, &device->pp_in_use); if (i < 0) - drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", - is_net ? "pp_in_use_by_net" : "pp_in_use", i); - wake_up(&drbd_pp_wait); + drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); } /* @@ -373,12 +180,12 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; struct page *page = NULL; - unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned int nr_pages = PFN_UP(payload_size); if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; - peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); if (!peer_req) { if (!(gfp_mask & __GFP_NOWARN)) drbd_err(device, "%s: allocation failed\n", __func__); @@ -390,6 +197,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; + if (!mempool_is_saturated(&drbd_buffer_page_pool)) + peer_req->flags |= EE_RELEASE_TO_MEMPOOL; } memset(peer_req, 0, sizeof(*peer_req)); @@ -409,24 +218,23 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return peer_req; fail: - mempool_free(peer_req, drbd_ee_mempool); + mempool_free(peer_req, &drbd_ee_mempool); return NULL; } -void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, - int is_net) +void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req) { might_sleep(); if (peer_req->flags & EE_HAS_DIGEST) kfree(peer_req->digest); - drbd_free_pages(device, peer_req->pages, is_net); + drbd_free_pages(device, peer_req->pages); D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); D_ASSERT(device, drbd_interval_empty(&peer_req->i)); - if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { + if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; drbd_al_complete_io(device, &peer_req->i); } - mempool_free(peer_req, drbd_ee_mempool); + mempool_free(peer_req, &drbd_ee_mempool); } int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) @@ -434,14 +242,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) LIST_HEAD(work_list); struct drbd_peer_request *peer_req, *t; int count = 0; - int is_net = list == &device->net_ee; spin_lock_irq(&device->resource->req_lock); list_splice_init(list, &work_list); spin_unlock_irq(&device->resource->req_lock); list_for_each_entry_safe(peer_req, t, &work_list, w.list) { - __drbd_free_peer_req(device, peer_req, is_net); + drbd_free_peer_req(device, peer_req); count++; } return count; @@ -453,18 +260,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) static int 
drbd_finish_peer_reqs(struct drbd_device *device) { LIST_HEAD(work_list); - LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; int err = 0; spin_lock_irq(&device->resource->req_lock); - reclaim_finished_net_peer_reqs(device, &reclaimed); list_splice_init(&device->done_ee, &work_list); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(device, peer_req); - /* possible callbacks here: * e_end_block, and e_end_resync_block, e_send_superseded. * all ignore the last argument. @@ -516,7 +318,8 @@ static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flag struct msghdr msg = { .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) }; - return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size); + return sock_recvmsg(sock, &msg, msg.msg_flags); } static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size) @@ -647,7 +450,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection) * a free one dynamically. */ what = "bind before connect"; - err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); + err = sock->ops->bind(sock, (struct sockaddr_unsized *) &src_in6, my_addr_len); if (err < 0) goto out; @@ -655,7 +458,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection) * stay C_WF_CONNECTION, don't go Disconnecting! */ disconnect_on_error = 0; what = "connect"; - err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); + err = sock->ops->connect(sock, (struct sockaddr_unsized *) &peer_in6, peer_addr_len, 0); out: if (err < 0) { @@ -734,7 +537,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); what = "bind before listen"; - err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); + err = s_listen->ops->bind(s_listen, (struct sockaddr_unsized *)&my_addr, my_addr_len); if (err < 0) goto out; @@ -789,7 +592,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, timeo = connect_int * HZ; /* 28.5% random jitter */ - timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; + timeo += get_random_u32_below(2) ? timeo / 7 : -timeo / 7; err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); if (err <= 0) @@ -1012,7 +815,7 @@ retry: drbd_warn(connection, "Error receiving initial packet\n"); sock_release(s); randomize: - if (prandom_u32() & 1) + if (get_random_u32_below(2)) goto retry; } } @@ -1038,6 +841,9 @@ randomize: sock.socket->sk->sk_allocation = GFP_NOIO; msock.socket->sk->sk_allocation = GFP_NOIO; + sock.socket->sk->sk_use_task_frag = false; + msock.socket->sk->sk_use_task_frag = false; + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; @@ -1061,8 +867,8 @@ randomize: /* we don't want delays. 
* we use TCP_CORK where appropriate, though */ - drbd_tcp_nodelay(sock.socket); - drbd_tcp_nodelay(msock.socket); + tcp_sock_set_nodelay(sock.socket->sk); + tcp_sock_set_nodelay(msock.socket->sk); connection->data.socket = sock.socket; connection->meta.socket = msock.socket; @@ -1100,7 +906,10 @@ randomize: idr_for_each_entry(&connection->peer_devices, peer_device, vnr) mutex_lock(peer_device->device->state_mutex); + /* avoid a race with conn_request_state( C_DISCONNECTING ) */ + spin_lock_irq(&connection->resource->req_lock); set_bit(STATE_SENT, &connection->flags); + spin_unlock_irq(&connection->resource->req_lock); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) mutex_unlock(peer_device->device->state_mutex); @@ -1194,6 +1003,14 @@ static int decode_header(struct drbd_connection *connection, void *header, struc return 0; } +static void drbd_unplug_all_devices(struct drbd_connection *connection) +{ + if (current->plug == &connection->receiver_plug) { + blk_finish_plug(&connection->receiver_plug); + blk_start_plug(&connection->receiver_plug); + } /* else: maybe just schedule() ?? */ +} + static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi) { void *buffer = connection->data.rbuf; @@ -1209,6 +1026,36 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in return err; } +static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + unsigned int size = drbd_header_size(connection); + int err; + + err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT); + if (err != size) { + /* If we have nothing in the receive buffer now, to reduce + * application latency, try to drain the backend queues as + * quickly as possible, and let remote TCP know what we have + * received so far. */ + if (err == -EAGAIN) { + tcp_sock_set_quickack(connection->data.socket->sk, 2); + drbd_unplug_all_devices(connection); + } + if (err > 0) { + buffer += err; + size -= err; + } + err = drbd_recv_all_warn(connection, buffer, size); + if (err) + return err; + } + + err = decode_header(connection, connection->data.rbuf, pi); + connection->last_received = jiffies; + + return err; +} /* This is blkdev_issue_flush, but asynchronous. * We want to submit to all component volumes in parallel, * then wait for all completions. @@ -1223,7 +1070,7 @@ struct one_flush_context { struct issue_flush_context *ctx; }; -void one_flush_endio(struct bio *bio) +static void one_flush_endio(struct bio *bio) { struct one_flush_context *octx = bio->bi_private; struct drbd_device *device = octx->device; @@ -1246,16 +1093,16 @@ void one_flush_endio(struct bio *bio) static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) { - struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0, + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO); struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); - if (!bio || !octx) { - drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n"); + + if (!octx) { + drbd_warn(device, "Could not allocate a octx, CANNOT ISSUE FLUSH\n"); /* FIXME: what else can I do now? disconnecting or detaching * really does not help to improve the state of the world, either. 
*/ - kfree(octx); - if (bio) - bio_put(bio); + bio_put(bio); ctx->error = -ENOMEM; put_ldev(device); @@ -1265,10 +1112,8 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont octx->device = device; octx->ctx = ctx; - bio->bi_bdev = device->ldev->backing_bdev; bio->bi_private = octx; bio->bi_end_io = one_flush_endio; - bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; device->flush_jif = jiffies; set_bit(FLUSH_PENDING, &device->flags); @@ -1319,7 +1164,7 @@ static void drbd_flush(struct drbd_connection *connection) /** * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. - * @device: DRBD device. + * @connection: DRBD connection. * @epoch: Epoch object. * @ev: Epoch event. */ @@ -1408,9 +1253,8 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) return wo; } -/** +/* * drbd_bump_write_ordering() - Fall back to an other write ordering method - * @connection: DRBD connection. * @wo: Write ordering method to try. */ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, @@ -1448,34 +1292,145 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } -static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) +/* + * Mapping "discard" to ZEROOUT with UNMAP does not work for us: + * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it + * will directly go to fallback mode, submitting normal writes, and + * never even try to UNMAP. + * + * And dm-thin does not do this (yet), mostly because in general it has + * to assume that "skip_block_zeroing" is set. See also: + * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html + * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html + * + * We *may* ignore the discard-zeroes-data setting, if so configured. + * + * Assumption is that this "discard_zeroes_data=0" is only because the backend + * may ignore partial unaligned discards. + * + * LVM/DM thin as of at least + * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) + * Library version: 1.02.93-RHEL7 (2015-01-28) + * Driver version: 4.29.0 + * still behaves this way. + * + * For unaligned (wrt. alignment and granularity) or too small discards, + * we zero-out the initial (and/or) trailing unaligned partial chunks, + * but discard all the aligned full chunks. + * + * At least for LVM/DM thin, with skip_block_zeroing=false, + * the result is effectively "discard_zeroes_data=1". + */ +/* flags: EE_TRIM|EE_ZEROOUT */ +int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) { struct block_device *bdev = device->ldev->backing_bdev; + sector_t tmp, nr; + unsigned int max_discard_sectors, granularity; + int alignment; + int err = 0; - if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9, - GFP_NOIO, 0)) - peer_req->flags |= EE_WAS_ERROR; + if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM)) + goto zero_out; + + /* Zero-sector (unknown) and one-sector granularities are the same. 
*/ + granularity = max(bdev_discard_granularity(bdev) >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + max_discard_sectors = min(bdev_max_discard_sectors(bdev), (1U << 22)); + max_discard_sectors -= max_discard_sectors % granularity; + if (unlikely(!max_discard_sectors)) + goto zero_out; + + if (nr_sectors < granularity) + goto zero_out; + + tmp = start; + if (sector_div(tmp, granularity) != alignment) { + if (nr_sectors < 2*granularity) + goto zero_out; + /* start + gran - (start + gran - align) % gran */ + tmp = start + granularity - alignment; + tmp = start + granularity - sector_div(tmp, granularity); + + nr = tmp - start; + /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many + * layers are below us, some may have smaller granularity */ + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start = tmp; + } + while (nr_sectors >= max_discard_sectors) { + err |= blkdev_issue_discard(bdev, start, max_discard_sectors, + GFP_NOIO); + nr_sectors -= max_discard_sectors; + start += max_discard_sectors; + } + if (nr_sectors) { + /* max_discard_sectors is unsigned int (and a multiple of + * granularity, we made sure of that above already); + * nr is < max_discard_sectors; + * I don't need sector_div here, even though nr is sector_t */ + nr = nr_sectors; + nr -= (unsigned int)nr % granularity; + if (nr) { + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO); + nr_sectors -= nr; + start += nr; + } + } + zero_out: + if (nr_sectors) { + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, + (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP); + } + return err != 0; +} - drbd_endio_write_sec_final(peer_req); +static bool can_do_reliable_discards(struct drbd_device *device) +{ + struct disk_conf *dc; + bool can_do; + + if (!bdev_max_discard_sectors(device->ldev->backing_bdev)) + return false; + + rcu_read_lock(); + dc = rcu_dereference(device->ldev->disk_conf); + can_do = dc->discard_zeroes_if_aligned; + rcu_read_unlock(); + return can_do; } -static void drbd_issue_peer_wsame(struct drbd_device *device, - struct drbd_peer_request *peer_req) +static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req) { - struct block_device *bdev = device->ldev->backing_bdev; - sector_t s = peer_req->i.sector; - sector_t nr = peer_req->i.size >> 9; - if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages)) + /* If the backend cannot discard, or does not guarantee + * read-back zeroes in discarded ranges, we fall back to + * zero-out. Unless configuration specifically requested + * otherwise. */ + if (!can_do_reliable_discards(device)) + peer_req->flags |= EE_ZEROOUT; + + if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, + peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM))) peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); } +static int peer_request_fault_type(struct drbd_peer_request *peer_req) +{ + if (peer_req_op(peer_req) == REQ_OP_READ) { + return peer_req->flags & EE_APPLICATION ? + DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD; + } else { + return peer_req->flags & EE_APPLICATION ? + DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR; + } +} /** * drbd_submit_peer_request() - * @device: DRBD device. * @peer_req: peer request - * @rw: flag field, see bio->bi_opf * * May spread the pages to multiple bios, * depending on bio_add_page restrictions. 
@@ -1488,19 +1443,16 @@ static void drbd_issue_peer_wsame(struct drbd_device *device, * on certain Xen deployments. */ /* TODO allocate from our own bio_set. */ -int drbd_submit_peer_request(struct drbd_device *device, - struct drbd_peer_request *peer_req, - const unsigned op, const unsigned op_flags, - const int fault_type) +int drbd_submit_peer_request(struct drbd_peer_request *peer_req) { + struct drbd_device *device = peer_req->peer_device->device; struct bio *bios = NULL; struct bio *bio; struct page *page = peer_req->pages; sector_t sector = peer_req->i.sector; - unsigned data_size = peer_req->i.size; - unsigned n_bios = 0; - unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; - int err = -ENOMEM; + unsigned int data_size = peer_req->i.size; + unsigned int n_bios = 0; + unsigned int nr_pages = PFN_UP(data_size); /* TRIM/DISCARD: for now, always use the helper function * blkdev_issue_zeroout(..., discard=true). @@ -1508,7 +1460,7 @@ int drbd_submit_peer_request(struct drbd_device *device, * Correctness first, performance later. Next step is to code an * asynchronous variant of the same. */ - if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { + if (peer_req->flags & (EE_TRIM | EE_ZEROOUT)) { /* wait for all pending IO completions, before we start * zeroing things out. */ conn_wait_active_ee_empty(peer_req->peer_device->connection); @@ -1525,10 +1477,7 @@ int drbd_submit_peer_request(struct drbd_device *device, spin_unlock_irq(&device->resource->req_lock); } - if (peer_req->flags & EE_IS_TRIM) - drbd_issue_peer_discard(device, peer_req); - else /* EE_WRITE_SAME */ - drbd_issue_peer_wsame(device, peer_req); + drbd_issue_peer_discard_or_zero_out(device, peer_req); return 0; } @@ -1541,15 +1490,20 @@ int drbd_submit_peer_request(struct drbd_device *device, * generated bio, but a bio allocated on behalf of the peer. */ next_bio: - bio = bio_alloc(GFP_NOIO, nr_pages); - if (!bio) { - drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); - goto fail; + /* _DISCARD, _WRITE_ZEROES handled above. + * REQ_OP_FLUSH (empty flush) not expected, + * should have been mapped to a "drbd protocol barrier". + * REQ_OP_SECURE_ERASE: I don't see how we could ever support that. 
+ */ + if (!(peer_req_op(peer_req) == REQ_OP_WRITE || + peer_req_op(peer_req) == REQ_OP_READ)) { + drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf); + return -EINVAL; } + + bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO); /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; - bio->bi_bdev = device->ldev->backing_bdev; - bio_set_op_attrs(bio, op, op_flags); bio->bi_private = peer_req; bio->bi_end_io = drbd_peer_request_endio; @@ -1577,17 +1531,9 @@ next_bio: bios = bios->bi_next; bio->bi_next = NULL; - drbd_generic_make_request(device, fault_type, bio); + drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio); } while (bios); return 0; - -fail: - while (bios) { - bio = bios; - bios = bios->bi_next; - bio_put(bio); - } - return err; } static void drbd_remove_epoch_entry_interval(struct drbd_device *device, @@ -1651,7 +1597,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf break; else drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); - /* Fall through */ + fallthrough; case WO_BDEV_FLUSH: case WO_DRAIN_IO: @@ -1690,7 +1636,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf } /* quick wrapper in case payload size != request_size (write same) */ -static void drbd_csum_ee_size(struct crypto_ahash *h, +static void drbd_csum_ee_size(struct crypto_shash *h, struct drbd_peer_request *r, void *d, unsigned int payload_size) { @@ -1714,7 +1660,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, struct packet_info *pi) __must_hold(local) { struct drbd_device *device = peer_device->device; - const sector_t capacity = drbd_get_capacity(device->this_bdev); + const sector_t capacity = get_capacity(device->vdisk); struct drbd_peer_request *peer_req; struct page *page; int digest_size, err; @@ -1723,11 +1669,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; - struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; + struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL; digest_size = 0; if (!trim && peer_device->connection->peer_integrity_tfm) { - digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm); /* * FIXME: Receive the incoming digest into the receive buffer * here, together with its struct p_data? @@ -1738,32 +1684,24 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= digest_size; } - /* assume request_size == data_size, but special case trim and wsame. */ + /* assume request_size == data_size, but special case trim. 
*/ ds = data_size; if (trim) { - if (!expect(data_size == 0)) + if (!expect(peer_device, data_size == 0)) return NULL; ds = be32_to_cpu(trim->size); - } else if (wsame) { - if (data_size != queue_logical_block_size(device->rq_queue)) { - drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", - data_size, queue_logical_block_size(device->rq_queue)); - return NULL; - } - if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) { - drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n", - data_size, bdev_logical_block_size(device->ldev->backing_bdev)); + } else if (zeroes) { + if (!expect(peer_device, data_size == 0)) return NULL; - } - ds = be32_to_cpu(wsame->size); + ds = be32_to_cpu(zeroes->size); } - if (!expect(IS_ALIGNED(ds, 512))) + if (!expect(peer_device, IS_ALIGNED(ds, 512))) return NULL; - if (trim || wsame) { - if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) + if (trim || zeroes) { + if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9))) return NULL; - } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) + } else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, @@ -1785,24 +1723,26 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, peer_req->flags |= EE_WRITE; if (trim) { - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; + return peer_req; + } + if (zeroes) { + peer_req->flags |= EE_ZEROOUT; return peer_req; } - if (wsame) - peer_req->flags |= EE_WRITE_SAME; /* receive payload size bytes into page chain */ ds = data_size; page = peer_req->pages; page_chain_for_each(page) { unsigned len = min_t(int, ds, PAGE_SIZE); - data = kmap(page); + data = kmap_local_page(page); err = drbd_recv_all_warn(peer_device->connection, data, len); if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) { drbd_err(device, "Fault injection: Corrupting data on receive\n"); data[0] = data[0] ^ (unsigned long)-1; } - kunmap(page); + kunmap_local(data); if (err) { drbd_free_peer_req(device, peer_req); return NULL; @@ -1837,7 +1777,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) page = drbd_alloc_pages(peer_device, 1, 1); - data = kmap(page); + data = kmap_local_page(page); while (data_size) { unsigned int len = min_t(int, data_size, PAGE_SIZE); @@ -1846,8 +1786,8 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) break; data_size -= len; } - kunmap(page); - drbd_free_pages(peer_device->device, page, 0); + kunmap_local(data); + drbd_free_pages(peer_device->device, page); return err; } @@ -1863,7 +1803,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req digest_size = 0; if (peer_device->connection->peer_integrity_tfm) { - digest_size = crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm); + digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm); err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); if (err) return err; @@ -1878,10 +1818,10 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); bio_for_each_segment(bvec, bio, iter) { - void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + void *mapped = bvec_kmap_local(&bvec); expect = min_t(int, data_size, bvec.bv_len); err = drbd_recv_all_warn(peer_device->connection, mapped, expect); - kunmap(bvec.bv_page); + kunmap_local(mapped); if (err) return err; data_size -= 
expect; @@ -1915,11 +1855,11 @@ static int e_end_resync_block(struct drbd_work *w, int unused) D_ASSERT(device, drbd_interval_empty(&peer_req->i)); if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { - drbd_set_in_sync(device, sector, peer_req->i.size); + drbd_set_in_sync(peer_device, sector, peer_req->i.size); err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req); } else { /* Record failure to sync */ - drbd_rs_failed_io(device, sector, peer_req->i.size); + drbd_rs_failed_io(peer_device, sector, peer_req->i.size); err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); } @@ -1938,13 +1878,14 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto if (!peer_req) goto fail; - dec_rs_pending(device); + dec_rs_pending(peer_device); inc_unacked(device); /* corresponding dec_unacked() in e_end_resync_block() * respective _drbd_clear_done_ee */ peer_req->w.cb = e_end_resync_block; + peer_req->opf = REQ_OP_WRITE; peer_req->submit_jif = jiffies; spin_lock_irq(&device->resource->req_lock); @@ -1952,8 +1893,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0, - DRBD_FAULT_RS_WR) == 0) + if (drbd_submit_peer_request(peer_req) == 0) return 0; /* don't care for the reason here */ @@ -2007,12 +1947,9 @@ static int receive_DataReply(struct drbd_connection *connection, struct packet_i if (unlikely(!req)) return -EIO; - /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid - * special casing it there for the various failure cases. - * still no race with drbd_fail_pending_reads */ err = recv_dless_read(peer_device, req, sector, pi->size); if (!err) - req_mod(req, DATA_RECEIVED); + req_mod(req, DATA_RECEIVED, peer_device); /* else: nothing. handled from drbd_disconnect... * I don't think we may complete this just yet * in case we are "on-disconnect: freeze" */ @@ -2042,7 +1979,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet * or in drbd_peer_request_endio. */ err = recv_resync_read(peer_device, sector, pi); } else { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Can not write resync data to local disk.\n"); err = drbd_drain_block(peer_device, pi->size); @@ -2070,7 +2007,7 @@ static void restart_conflicting_writes(struct drbd_device *device, continue; /* as it is RQ_POSTPONED, this will cause it to * be queued on the retry workqueue. */ - __req_mod(req, CONFLICT_RESOLVED, NULL); + __req_mod(req, CONFLICT_RESOLVED, NULL, NULL); } } @@ -2094,7 +2031,7 @@ static int e_end_block(struct drbd_work *w, int cancel) P_RS_WRITE_ACK : P_WRITE_ACK; err = drbd_send_ack(peer_device, pcmd, peer_req); if (pcmd == P_RS_WRITE_ACK) - drbd_set_in_sync(device, sector, peer_req->i.size); + drbd_set_in_sync(peer_device, sector, peer_req->i.size); } else { err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); /* we expect it to be marked out of sync anyways... @@ -2272,27 +2209,29 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co return ret; } -/* see also bio_flags_to_wire() - * DRBD_REQ_*, because we need to semantically map the flags to data packet - * flags and back. We may replicate to other kernel versions. */ -static unsigned long wire_flags_to_bio_flags(u32 dpf) +static enum req_op wire_flags_to_bio_op(u32 dpf) { - return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | - (dpf & DP_FUA ? 
REQ_FUA : 0) | - (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); -} - -static unsigned long wire_flags_to_bio_op(u32 dpf) -{ - if (dpf & DP_DISCARD) + if (dpf & DP_ZEROES) return REQ_OP_WRITE_ZEROES; + if (dpf & DP_DISCARD) + return REQ_OP_DISCARD; else return REQ_OP_WRITE; } +/* see also bio_flags_to_wire() */ +static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf) +{ + return wire_flags_to_bio_op(dpf) | + (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); +} + static void fail_postponed_requests(struct drbd_device *device, sector_t sector, unsigned int size) { + struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_interval *i; repeat: @@ -2306,7 +2245,7 @@ static void fail_postponed_requests(struct drbd_device *device, sector_t sector, if (!(req->rq_state & RQ_POSTPONED)) continue; req->rq_state &= ~RQ_POSTPONED; - __req_mod(req, NEG_ACKED, &m); + __req_mod(req, NEG_ACKED, peer_device, &m); spin_unlock_irq(&device->resource->req_lock); if (m.bio) complete_master_bio(device, &m); @@ -2373,7 +2312,11 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); + /* put is in drbd_send_acks_wf() */ + kref_get(&device->kref); + if (!queue_work(connection->ack_sender, + &peer_req->peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); err = -ENOENT; goto out; @@ -2433,7 +2376,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * struct drbd_peer_request *peer_req; struct p_data *p = pi->data; u32 peer_seq = be32_to_cpu(p->seq_num); - int op, op_flags; u32 dp_flags; int err, tp; @@ -2472,12 +2414,22 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_APPLICATION; dp_flags = be32_to_cpu(p->dp_flags); - op = wire_flags_to_bio_op(dp_flags); - op_flags = wire_flags_to_bio_flags(dp_flags); + peer_req->opf = wire_flags_to_bio(connection, dp_flags); if (pi->cmd == P_TRIM) { D_ASSERT(peer_device, peer_req->i.size > 0); - D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); + D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + /* need to play safe: an older DRBD sender + * may mean zero-out while sending P_TRIM. */ + if (0 == (connection->agreed_features & DRBD_FF_WZEROES)) + peer_req->flags |= EE_ZEROOUT; + } else if (pi->cmd == P_ZEROES) { + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES); D_ASSERT(peer_device, peer_req->pages == NULL); + /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */ + if (dp_flags & DP_DISCARD) + peer_req->flags |= EE_TRIM; } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); @@ -2541,11 +2493,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - /* TRIM and WRITE_SAME are processed synchronously, + /* TRIM and is processed synchronously, * we wait for all pending requests, respectively wait for * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. 
*/ - if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) + if ((peer_req->flags & (EE_TRIM | EE_ZEROOUT)) == 0) list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); @@ -2554,14 +2506,13 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * if (device->state.pdsk < D_INCONSISTENT) { /* In case we have the only disk of the cluster, */ - drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); + drbd_set_out_of_sync(peer_device, peer_req->i.sector, peer_req->i.size); peer_req->flags &= ~EE_MAY_SET_IN_SYNC; drbd_al_begin_io(device, &peer_req->i); peer_req->flags |= EE_CALL_AL_COMPLETE_IO; } - err = drbd_submit_peer_request(device, peer_req, op, op_flags, - DRBD_FAULT_DT_WR); + err = drbd_submit_peer_request(peer_req); if (!err) return 0; @@ -2594,9 +2545,10 @@ out_interrupted: * The current sync rate used here uses only the most recent two step marks, * to have a short time average so we can react faster. */ -bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, +bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector, bool throttle_if_app_is_waiting) { + struct drbd_device *device = peer_device->device; struct lc_element *tmp; bool throttle = drbd_rs_c_min_rate_throttle(device); @@ -2619,7 +2571,7 @@ bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) { - struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + struct gendisk *disk = device->ldev->backing_bdev->bd_disk; unsigned long db, dt, dbdt; unsigned int c_min_rate; int curr_events; @@ -2632,8 +2584,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + - (int)part_stat_read(&disk->part0, sectors[1]) - + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) @@ -2673,14 +2624,13 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet struct drbd_peer_request *peer_req; struct digest_info *di = NULL; int size, verb; - unsigned int fault_type; struct p_block_req *p = pi->data; peer_device = conn_peer_device(connection, pi->vnr); if (!peer_device) return -EIO; device = peer_device->device; - capacity = drbd_get_capacity(device->this_bdev); + capacity = get_capacity(device->vdisk); sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); @@ -2710,13 +2660,13 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet break; case P_OV_REPLY: verb = 0; - dec_rs_pending(device); + dec_rs_pending(peer_device); drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC); break; default: BUG(); } - if (verb && __ratelimit(&drbd_ratelimit_state)) + if (verb && drbd_ratelimit()) drbd_err(device, "Can not satisfy peer's read request, " "no local data.\n"); @@ -2733,11 +2683,11 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet put_ldev(device); return -ENOMEM; } + peer_req->opf = REQ_OP_READ; switch (pi->cmd) { case P_DATA_REQUEST: peer_req->w.cb = w_e_end_data_req; - fault_type = DRBD_FAULT_DT_RD; /* application IO, don't drbd_rs_begin_io */ peer_req->flags |= EE_APPLICATION; goto submit; @@ -2748,16 +2698,15 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet then we would do something 
smarter here than reading the block... */ peer_req->flags |= EE_RS_THIN_REQ; + fallthrough; case P_RS_DATA_REQUEST: peer_req->w.cb = w_e_end_rsdata_req; - fault_type = DRBD_FAULT_RS_RD; /* used in the sector offset progress display */ device->bm_resync_fo = BM_SECT_TO_BIT(sector); break; case P_OV_REPLY: case P_CSUM_RS_REQUEST: - fault_type = DRBD_FAULT_RS_RD; di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); if (!di) goto out_free_e; @@ -2782,7 +2731,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet /* track progress, we may need to throttle */ atomic_add(size >> 9, &device->rs_sect_in); peer_req->w.cb = w_e_end_ov_reply; - dec_rs_pending(device); + dec_rs_pending(peer_device); /* drbd_rs_begin_io done when we sent this request, * but accounting still needs to be done. */ goto submit_for_resync; @@ -2806,7 +2755,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet (unsigned long long)sector); } peer_req->w.cb = w_e_end_ov_req; - fault_type = DRBD_FAULT_RS_RD; break; default: @@ -2846,7 +2794,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet update_receiver_timing_details(connection, drbd_rs_should_slow_down); if (device->state.peer != R_PRIMARY - && drbd_rs_should_slow_down(device, sector, false)) + && drbd_rs_should_slow_down(peer_device, sector, false)) schedule_timeout_uninterruptible(HZ/10); update_receiver_timing_details(connection, drbd_rs_begin_io); if (drbd_rs_begin_io(device, sector)) @@ -2858,8 +2806,7 @@ submit_for_resync: submit: update_receiver_timing_details(connection, drbd_submit_peer_request); inc_unacked(device); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, - fault_type) == 0) + if (drbd_submit_peer_request(peer_req) == 0) return 0; /* don't care for the reason here */ @@ -2876,7 +2823,7 @@ out_free_e: return -EIO; } -/** +/* * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries */ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local) @@ -2913,7 +2860,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold rv = 1; break; } - /* Else fall through to one of the other strategies... */ + fallthrough; /* to one of the other strategies */ case ASB_DISCARD_OLDER_PRI: if (self == 0 && peer == 1) { rv = 1; @@ -2926,6 +2873,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold /* Else fall through to one of the other strategies... 
*/ drbd_warn(device, "Discard younger/older primary did not find a decision\n" "Using discard-least-changes instead\n"); + fallthrough; case ASB_DISCARD_ZERO_CHG: if (ch_peer == 0 && ch_self == 0) { rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) @@ -2937,6 +2885,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold } if (after_sb_0p == ASB_DISCARD_ZERO_CHG) break; + fallthrough; case ASB_DISCARD_LEAST_CHG: if (ch_self < ch_peer) rv = -1; @@ -2957,7 +2906,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold return rv; } -/** +/* * drbd_asb_recover_1p - Recover after split-brain with one remaining primary */ static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local) @@ -3014,7 +2963,7 @@ static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold return rv; } -/** +/* * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries */ static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local) @@ -3094,10 +3043,11 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, -1096 requires proto 96 */ -static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local) +static int drbd_uuid_compare(struct drbd_peer_device *const peer_device, + enum drbd_role const peer_role, int *rule_nr) __must_hold(local) { - struct drbd_peer_device *const peer_device = first_peer_device(device); - struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + struct drbd_connection *const connection = peer_device->connection; + struct drbd_device *device = peer_device->device; u64 self, peer; int i, j; @@ -3320,7 +3270,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, enum drbd_conns rv = C_MASK; enum drbd_disk_state mydisk; struct net_conf *nc; - int hg, rule_nr, rr_conflict, tentative; + int hg, rule_nr, rr_conflict, tentative, always_asbp; mydisk = device->state.disk; if (mydisk == D_NEGOTIATING) @@ -3333,7 +3283,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); - hg = drbd_uuid_compare(device, peer_role, &rule_nr); + hg = drbd_uuid_compare(peer_device, peer_role, &rule_nr); spin_unlock_irq(&device->ldev->md.uuid_lock); drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); @@ -3371,8 +3321,12 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, rcu_read_lock(); nc = rcu_dereference(peer_device->connection->net_conf); + always_asbp = nc->always_asbp; + rr_conflict = nc->rr_conflict; + tentative = nc->tentative; + rcu_read_unlock(); - if (hg == 100 || (hg == -100 && nc->always_asbp)) { + if (hg == 100 || (hg == -100 && always_asbp)) { int pcount = (device->state.role == R_PRIMARY) + (peer_role == R_PRIMARY); int forced = (hg == -100); @@ -3411,9 +3365,6 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, "Sync from %s node\n", (hg < 0) ? 
"peer" : "this"); } - rr_conflict = nc->rr_conflict; - tentative = nc->tentative; - rcu_read_unlock(); if (hg == -100) { /* FIXME this log message is not correct if we end up here @@ -3435,7 +3386,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, switch (rr_conflict) { case ASB_CALL_HELPER: drbd_khelper(device, "pri-lost"); - /* fall through */ + fallthrough; case ASB_DISCONNECT: drbd_err(device, "I shall become SyncTarget, but I am primary!\n"); return C_MASK; @@ -3458,7 +3409,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, if (abs(hg) >= 2) { drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", - BM_LOCKED_SET_ALLOWED)) + BM_LOCKED_SET_ALLOWED, NULL)) return C_MASK; } @@ -3498,7 +3449,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in int p_proto, p_discard_my_data, p_two_primaries, cf; struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; char integrity_alg[SHARED_SECRET_MAX] = ""; - struct crypto_ahash *peer_integrity_tfm = NULL; + struct crypto_shash *peer_integrity_tfm = NULL; void *int_dig_in = NULL, *int_dig_vv = NULL; p_proto = be32_to_cpu(p->protocol); @@ -3579,7 +3530,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in * change. */ - peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0); if (IS_ERR(peer_integrity_tfm)) { peer_integrity_tfm = NULL; drbd_err(connection, "peer data-integrity-alg %s not supported\n", @@ -3587,7 +3538,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in goto disconnect; } - hash_size = crypto_ahash_digestsize(peer_integrity_tfm); + hash_size = crypto_shash_digestsize(peer_integrity_tfm); int_dig_in = kmalloc(hash_size, GFP_KERNEL); int_dig_vv = kmalloc(hash_size, GFP_KERNEL); if (!(int_dig_in && int_dig_vv)) { @@ -3597,10 +3548,8 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in } new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); - if (!new_net_conf) { - drbd_err(connection, "Allocation of new net_conf failed\n"); + if (!new_net_conf) goto disconnect; - } mutex_lock(&connection->data.mutex); mutex_lock(&connection->resource->conf_update); @@ -3617,7 +3566,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in mutex_unlock(&connection->resource->conf_update); mutex_unlock(&connection->data.mutex); - crypto_free_ahash(connection->peer_integrity_tfm); + crypto_free_shash(connection->peer_integrity_tfm); kfree(connection->int_dig_in); kfree(connection->int_dig_vv); connection->peer_integrity_tfm = peer_integrity_tfm; @@ -3628,14 +3577,13 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in drbd_info(connection, "peer data-integrity-alg: %s\n", integrity_alg[0] ? 
integrity_alg : "(none)"); - synchronize_rcu(); - kfree(old_net_conf); + kvfree_rcu_mightsleep(old_net_conf); return 0; disconnect_rcu_unlock: rcu_read_unlock(); disconnect: - crypto_free_ahash(peer_integrity_tfm); + crypto_free_shash(peer_integrity_tfm); kfree(int_dig_in); kfree(int_dig_vv); conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); @@ -3647,15 +3595,16 @@ disconnect: * return: NULL (alg name was "") * ERR_PTR(error) if something goes wrong * or the crypto hash ptr, if it worked out ok. */ -static struct crypto_ahash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, +static struct crypto_shash *drbd_crypto_alloc_digest_safe( + const struct drbd_device *device, const char *alg, const char *name) { - struct crypto_ahash *tfm; + struct crypto_shash *tfm; if (!alg[0]) return NULL; - tfm = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash(alg, 0, 0); if (IS_ERR(tfm)) { drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n", alg, name, PTR_ERR(tfm)); @@ -3708,13 +3657,13 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i struct drbd_device *device; struct p_rs_param_95 *p; unsigned int header_size, data_size, exp_max_sz; - struct crypto_ahash *verify_tfm = NULL; - struct crypto_ahash *csums_tfm = NULL; + struct crypto_shash *verify_tfm = NULL; + struct crypto_shash *csums_tfm = NULL; struct net_conf *old_net_conf, *new_net_conf = NULL; struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; const int apv = connection->agreed_pro_version; struct fifo_buffer *old_plan = NULL, *new_plan = NULL; - int fifo_size = 0; + unsigned int fifo_size = 0; int err; peer_device = conn_peer_device(connection, pi->vnr); @@ -3749,7 +3698,8 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i /* initialize verify_alg and csums_alg */ p = pi->data; - memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX); + memset(&p->algs, 0, sizeof(p->algs)); err = drbd_recv_all(peer_device->connection, p, header_size); if (err) @@ -3778,7 +3728,6 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i drbd_err(device, "verify-alg of wrong size, " "peer wants %u, accepting only up to %u byte\n", data_size, SHARED_SECRET_MAX); - err = -EIO; goto reconnect; } @@ -3846,24 +3795,22 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i if (verify_tfm || csums_tfm) { new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); - if (!new_net_conf) { - drbd_err(device, "Allocation of new net_conf failed\n"); + if (!new_net_conf) goto disconnect; - } *new_net_conf = *old_net_conf; if (verify_tfm) { strcpy(new_net_conf->verify_alg, p->verify_alg); new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; - crypto_free_ahash(peer_device->connection->verify_tfm); + crypto_free_shash(peer_device->connection->verify_tfm); peer_device->connection->verify_tfm = verify_tfm; drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); } if (csums_tfm) { strcpy(new_net_conf->csums_alg, p->csums_alg); new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; - crypto_free_ahash(peer_device->connection->csums_tfm); + crypto_free_shash(peer_device->connection->csums_tfm); peer_device->connection->csums_tfm = csums_tfm; drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg); } @@ -3907,9 +3854,9 @@ disconnect: mutex_unlock(&connection->resource->conf_update); /* just for completeness: actually 
not needed, * as this is not reached if csums_tfm was ok. */ - crypto_free_ahash(csums_tfm); + crypto_free_shash(csums_tfm); /* but free the verify_tfm again, if csums_tfm did not work out */ - crypto_free_ahash(verify_tfm); + crypto_free_shash(verify_tfm); conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); return -EIO; } @@ -3935,6 +3882,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL; enum determine_dev_size dd = DS_UNCHANGED; sector_t p_size, p_usize, p_csize, my_usize; + sector_t new_size, cur_size; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; @@ -3942,6 +3890,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info if (!peer_device) return config_unknown_volume(connection, pi); device = peer_device->device; + cur_size = get_capacity(device->vdisk); p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); @@ -3952,7 +3901,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info device->p_size = p_size; if (get_ldev(device)) { - sector_t new_size, cur_size; rcu_read_lock(); my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; rcu_read_unlock(); @@ -3967,13 +3915,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info if (device->state.conn == C_WF_REPORT_PARAMS) p_usize = min_not_zero(my_usize, p_usize); - /* Never shrink a device with usable data during connect. - But allow online shrinking if we are connected. */ + /* Never shrink a device with usable data during connect, + * or "attach" on the peer. + * But allow online shrinking if we are connected. */ new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0); - cur_size = drbd_get_capacity(device->this_bdev); if (new_size < cur_size && device->state.disk >= D_OUTDATED && - device->state.conn < C_CONNECTED) { + (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) { drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n", (unsigned long long)new_size, (unsigned long long)cur_size); conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); @@ -3986,7 +3934,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); if (!new_disk_conf) { - drbd_err(device, "Allocation of new disk_conf failed\n"); put_ldev(device); return -ENOMEM; } @@ -3998,11 +3945,10 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); mutex_unlock(&connection->resource->conf_update); - synchronize_rcu(); - kfree(old_disk_conf); + kvfree_rcu_mightsleep(old_disk_conf); - drbd_info(device, "Peer sets u_size to %lu sectors\n", - (unsigned long)my_usize); + drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", + (unsigned long)p_usize, (unsigned long)my_usize); } put_ldev(device); @@ -4035,9 +3981,36 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info * * However, if he sends a zero current size, * take his (user-capped or) backing disk size anyways. + * + * Unless of course he does not have a disk himself. + * In which case we ignore this completely. 
 	 */
+	sector_t new_size = p_csize ?: p_usize ?: p_size;
 	drbd_reconsider_queue_parameters(device, NULL, o);
-	drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
+	if (new_size == 0) {
+		/* Ignore, peer does not know anything. */
+	} else if (new_size == cur_size) {
+		/* nothing to do */
+	} else if (cur_size != 0 && p_size == 0) {
+		drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
+			  (unsigned long long)new_size, (unsigned long long)cur_size);
+	} else if (new_size < cur_size && device->state.role == R_PRIMARY) {
+		drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
+			  (unsigned long long)new_size, (unsigned long long)cur_size);
+		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		return -EIO;
+	} else {
+		/* I believe the peer, if
+		 * - I don't have a current size myself
+		 * - we agree on the size anyways
+		 * - I do have a current size, am Secondary,
+		 *   and he has the only disk
+		 * - I do have a current size, am Primary,
+		 *   and he has the only disk,
+		 *   which is larger than my current size
+		 */
+		drbd_set_my_capacity(device, new_size);
+	}
 	}
 
 	if (get_ldev(device)) {
@@ -4050,8 +4023,8 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 	}
 
 	if (device->state.conn > C_WF_REPORT_PARAMS) {
-		if (be64_to_cpu(p->c_size) !=
-		    drbd_get_capacity(device->this_bdev) || ldsc) {
+		if (be64_to_cpu(p->c_size) != get_capacity(device->vdisk) ||
+		    ldsc) {
 			/* we have different sizes, probably peer
 			 * needs to know my new size... */
 			drbd_send_sizes(peer_device, 0, ddsf);
@@ -4085,11 +4058,9 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info
 		return config_unknown_volume(connection, pi);
 	device = peer_device->device;
 
-	p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
-	if (!p_uuid) {
-		drbd_err(device, "kmalloc of p_uuid failed\n");
+	p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO);
+	if (!p_uuid)
 		return false;
-	}
 
 	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
 		p_uuid[i] = be64_to_cpu(p->uuid[i]);
@@ -4097,7 +4068,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info
 	kfree(device->p_uuid);
 	device->p_uuid = p_uuid;
 
-	if (device->state.conn < C_CONNECTED &&
+	if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
 	    device->state.disk < D_INCONSISTENT &&
 	    device->state.role == R_PRIMARY &&
 	    (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
@@ -4117,7 +4088,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info
 		drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
 		drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
 			"clear_n_write from receive_uuids",
-			BM_LOCKED_TEST_ALLOWED);
+			BM_LOCKED_TEST_ALLOWED, NULL);
 		_drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
 		_drbd_uuid_set(device, UI_BITMAP, 0);
 		_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
@@ -4295,7 +4266,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
 	else if (os.conn >= C_SYNC_SOURCE &&
 		 peer_state.conn == C_CONNECTED) {
 		if (drbd_bm_total_weight(device) <= device->rs_failed)
-			drbd_resync_finished(device);
+			drbd_resync_finished(peer_device);
 		return 0;
 	}
 	}
 
@@ -4303,8 +4274,8 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
 	/* explicit verify finished notification, stop sector reached. */
 	if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
 	    peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
-		ov_out_of_sync_print(device);
-		drbd_resync_finished(device);
+		ov_out_of_sync_print(peer_device);
+		drbd_resync_finished(peer_device);
 		return 0;
 	}
@@ -4323,6 +4294,25 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
 	if (peer_state.conn == C_AHEAD)
 		ns.conn = C_BEHIND;
 
+	/* TODO:
+	 * if (primary and diskless and peer uuid != effective uuid)
+	 *     abort attach on peer;
+	 *
+	 * If this node does not have good data, was already connected, but
+	 * the peer did a late attach only now, trying to "negotiate" with me,
+	 * AND I am currently Primary, possibly frozen, with some specific
+	 * "effective" uuid, this should never be reached, really, because
+	 * we first send the uuids, then the current state.
+	 *
+	 * In this scenario, we already dropped the connection hard
+	 * when we received the unsuitable uuids (receive_uuids()).
+	 *
+	 * Should we want to change this, that is: not drop the connection in
+	 * receive_uuids() already, then we would need to add a branch here
+	 * that aborts the attach of "unsuitable uuids" on the peer in case
+	 * this node is currently Diskless Primary.
+	 */
+
 	if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
 	    get_ldev_if_state(device, D_NEGOTIATING)) {
 		int cr; /* consider resync */
@@ -4335,7 +4325,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
 		    (peer_state.disk == D_NEGOTIATING ||
 		     os.disk == D_NEGOTIATING));
 		/* if we have both been inconsistent, and the peer has been
-		 * forced to be UpToDate with --overwrite-data */
+		 * forced to be UpToDate with --force */
 		cr |= test_bit(CONSIDER_RESYNC, &device->flags);
 		/* if we had been plain connected, and the admin requested to
 		 * start a sync by "invalidate" or "invalidate-remote" */
@@ -4449,7 +4439,7 @@ static int receive_sync_uuid(struct drbd_connection *connection, struct packet_i
 	return 0;
 }
 
-/**
+/*
 * receive_bitmap_plain
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
@@ -4501,7 +4491,7 @@ static int dcbp_get_pad_bits(struct p_compressed_bm *p)
 	return (p->encoding >> 4) & 0x7;
 }
 
-/**
+/*
 * recv_bm_rle_bits
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
@@ -4570,7 +4560,7 @@ recv_bm_rle_bits(struct drbd_peer_device *peer_device,
 	return (s != c->bm_bits);
 }
 
-/**
+/*
 * decode_bitmap_c
 *
 * Return 0 when done, 1 when another iteration is needed, and a negative error
@@ -4594,11 +4584,11 @@ decode_bitmap_c(struct drbd_peer_device *peer_device,
 	return -EIO;
 }
 
-void INFO_bm_xfer_stats(struct drbd_device *device,
+void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device,
 		const char *direction, struct bm_xfer_ctx *c)
 {
 	/* what would it take to transfer it "plaintext" */
-	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+	unsigned int header_size = drbd_header_size(peer_device->connection);
 	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
 	unsigned int plain =
 		header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
@@ -4622,7 +4612,7 @@ void INFO_bm_xfer_stats(struct drbd_device *device,
 		r = 1000;
 
 	r = 1000 - r;
-	drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	drbd_info(peer_device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
 	     "total %u; compression: %u.%u%%\n",
 			direction,
 			c->bytes[1], c->packets[1],
@@ -4700,12 +4690,12 @@ static int receive_bitmap(struct
drbd_connection *connection, struct packet_info goto out; } - INFO_bm_xfer_stats(device, "receive", &c); + INFO_bm_xfer_stats(peer_device, "receive", &c); if (device->state.conn == C_WF_BITMAP_T) { enum drbd_state_rv rv; - err = drbd_send_bitmap(device); + err = drbd_send_bitmap(device, peer_device); if (err) goto out; /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ @@ -4738,8 +4728,7 @@ static int receive_UnplugRemote(struct drbd_connection *connection, struct packe { /* Make sure we've acked all the TCP data associated * with the data requests being unplugged */ - drbd_tcp_quickack(connection->data.socket); - + tcp_sock_set_quickack(connection->data.socket->sk, 2); return 0; } @@ -4764,7 +4753,7 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet drbd_conn_str(device->state.conn)); } - drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); + drbd_set_out_of_sync(peer_device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); return 0; } @@ -4785,11 +4774,10 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac sector = be64_to_cpu(p->sector); size = be32_to_cpu(p->blksize); - dec_rs_pending(device); + dec_rs_pending(peer_device); if (get_ldev(device)) { struct drbd_peer_request *peer_req; - const int op = REQ_OP_WRITE_ZEROES; peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, size, 0, GFP_NOIO); @@ -4799,15 +4787,16 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac } peer_req->w.cb = e_end_resync_block; + peer_req->opf = REQ_OP_DISCARD; peer_req->submit_jif = jiffies; - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; spin_lock_irq(&device->resource->req_lock); list_add_tail(&peer_req->w.list, &device->sync_ee); spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR); + err = drbd_submit_peer_request(peer_req); if (err) { spin_lock_irq(&device->resource->req_lock); @@ -4868,8 +4857,8 @@ static struct data_cmd drbd_cmd_handler[] = { [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, + [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data }, [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, - [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, }; static void drbdd(struct drbd_connection *connection) @@ -4882,8 +4871,8 @@ static void drbdd(struct drbd_connection *connection) struct data_cmd const *cmd; drbd_thread_current_set_cpu(&connection->receiver); - update_receiver_timing_details(connection, drbd_recv_header); - if (drbd_recv_header(connection, &pi)) + update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug); + if (drbd_recv_header_maybe_unplug(connection, &pi)) goto err_out; cmd = &drbd_cmd_handler[pi.cmd]; @@ -5014,8 +5003,8 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) atomic_set(&device->rs_pending_cnt, 0); wake_up(&device->misc_wait); - del_timer_sync(&device->resync_timer); - resync_timer_fn((unsigned long)device); + timer_delete_sync(&device->resync_timer); + resync_timer_fn(&device->resync_timer); /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, * w_make_resync_request etc. 
which may still be on the worker queue @@ -5043,20 +5032,10 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) if (get_ldev(device)) { drbd_bitmap_io(device, &drbd_bm_write_copy_pages, - "write from disconnected", BM_LOCKED_CHANGE_ALLOWED); + "write from disconnected", BM_LOCKED_CHANGE_ALLOWED, NULL); put_ldev(device); } - /* tcp_close and release of sendpage pages can be deferred. I don't - * want to use SO_LINGER, because apparently it can be deferred for - * more than 20 seconds (longest time I checked). - * - * Actually we don't care for exactly when the network stack does its - * put_page(), but release our reference on these pages right here. - */ - i = drbd_free_peer_reqs(device, &device->net_ee); - if (i) - drbd_info(device, "net_ee not empty, killed %u entries\n", i); i = atomic_read(&device->pp_in_use_by_net); if (i) drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); @@ -5152,11 +5131,12 @@ static int drbd_do_features(struct drbd_connection *connection) drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); - drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", + drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n", connection->agreed_features, connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", - connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : + connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "", + connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" : connection->agreed_features ? "" : " none"); return 1; @@ -5195,7 +5175,7 @@ static int drbd_do_auth(struct drbd_connection *connection) unsigned int key_len; char secret[SHARED_SECRET_MAX]; /* 64 byte */ unsigned int resp_size; - SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm); + struct shash_desc *desc; struct packet_info pi; struct net_conf *nc; int err, rv; @@ -5208,8 +5188,14 @@ static int drbd_do_auth(struct drbd_connection *connection) memcpy(secret, nc->shared_secret, key_len); rcu_read_unlock(); + desc = kmalloc(sizeof(struct shash_desc) + + crypto_shash_descsize(connection->cram_hmac_tfm), + GFP_KERNEL); + if (!desc) { + rv = -1; + goto fail; + } desc->tfm = connection->cram_hmac_tfm; - desc->flags = 0; rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); if (rv) { @@ -5239,7 +5225,7 @@ static int drbd_do_auth(struct drbd_connection *connection) if (pi.cmd != P_AUTH_CHALLENGE) { drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); - rv = 0; + rv = -1; goto fail; } @@ -5256,8 +5242,7 @@ static int drbd_do_auth(struct drbd_connection *connection) } peers_ch = kmalloc(pi.size, GFP_NOIO); - if (peers_ch == NULL) { - drbd_err(connection, "kmalloc of peers_ch failed\n"); + if (!peers_ch) { rv = -1; goto fail; } @@ -5276,8 +5261,7 @@ static int drbd_do_auth(struct drbd_connection *connection) resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm); response = kmalloc(resp_size, GFP_NOIO); - if (response == NULL) { - drbd_err(connection, "kmalloc of response failed\n"); + if (!response) { rv = -1; goto fail; } @@ -5324,8 +5308,7 @@ static int drbd_do_auth(struct drbd_connection *connection) } right_response = kmalloc(resp_size, GFP_NOIO); - if (right_response == NULL) { - drbd_err(connection, "kmalloc of right_response failed\n"); + if (!right_response) { rv = -1; goto fail; } @@ 
-5350,7 +5333,10 @@ static int drbd_do_auth(struct drbd_connection *connection) kfree(peers_ch); kfree(response); kfree(right_response); - shash_desc_zero(desc); + if (desc) { + shash_desc_zero(desc); + kfree(desc); + } return rv; } @@ -5375,8 +5361,11 @@ int drbd_receiver(struct drbd_thread *thi) } } while (h == 0); - if (h > 0) + if (h > 0) { + blk_start_plug(&connection->receiver_plug); drbdd(connection); + blk_finish_plug(&connection->receiver_plug); + } conn_disconnect(connection); @@ -5467,22 +5456,23 @@ static int got_IsInSync(struct drbd_connection *connection, struct packet_info * if (get_ldev(device)) { drbd_rs_complete_io(device, sector); - drbd_set_in_sync(device, sector, blksize); + drbd_set_in_sync(peer_device, sector, blksize); /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); put_ldev(device); } - dec_rs_pending(device); + dec_rs_pending(peer_device); atomic_add(blksize >> 9, &device->rs_sect_in); return 0; } static int -validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector, +validate_req_change_req_state(struct drbd_peer_device *peer_device, u64 id, sector_t sector, struct rb_root *root, const char *func, enum drbd_req_event what, bool missing_ok) { + struct drbd_device *device = peer_device->device; struct drbd_request *req; struct bio_and_error m; @@ -5492,7 +5482,7 @@ validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t secto spin_unlock_irq(&device->resource->req_lock); return -EIO; } - __req_mod(req, what, &m); + __req_mod(req, what, peer_device, &m); spin_unlock_irq(&device->resource->req_lock); if (m.bio) @@ -5517,8 +5507,8 @@ static int got_BlockAck(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); if (p->block_id == ID_SYNCER) { - drbd_set_in_sync(device, sector, blksize); - dec_rs_pending(device); + drbd_set_in_sync(peer_device, sector, blksize); + dec_rs_pending(peer_device); return 0; } switch (pi->cmd) { @@ -5541,7 +5531,7 @@ static int got_BlockAck(struct drbd_connection *connection, struct packet_info * BUG(); } - return validate_req_change_req_state(device, p->block_id, sector, + return validate_req_change_req_state(peer_device, p->block_id, sector, &device->write_requests, __func__, what, false); } @@ -5563,12 +5553,12 @@ static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); if (p->block_id == ID_SYNCER) { - dec_rs_pending(device); - drbd_rs_failed_io(device, sector, size); + dec_rs_pending(peer_device); + drbd_rs_failed_io(peer_device, sector, size); return 0; } - err = validate_req_change_req_state(device, p->block_id, sector, + err = validate_req_change_req_state(peer_device, p->block_id, sector, &device->write_requests, __func__, NEG_ACKED, true); if (err) { @@ -5577,7 +5567,7 @@ static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi request is no longer in the collision hash. */ /* In Protocol B we might already have got a P_RECV_ACK but then get a P_NEG_ACK afterwards. 
*/ - drbd_set_out_of_sync(device, sector, size); + drbd_set_out_of_sync(peer_device, sector, size); } return 0; } @@ -5599,7 +5589,7 @@ static int got_NegDReply(struct drbd_connection *connection, struct packet_info drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n", (unsigned long long)sector, be32_to_cpu(p->blksize)); - return validate_req_change_req_state(device, p->block_id, sector, + return validate_req_change_req_state(peer_device, p->block_id, sector, &device->read_requests, __func__, NEG_ACKED, false); } @@ -5622,13 +5612,14 @@ static int got_NegRSDReply(struct drbd_connection *connection, struct packet_inf update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); - dec_rs_pending(device); + dec_rs_pending(peer_device); if (get_ldev_if_state(device, D_FAILED)) { drbd_rs_complete_io(device, sector); switch (pi->cmd) { case P_NEG_RS_DREPLY: - drbd_rs_failed_io(device, sector, size); + drbd_rs_failed_io(peer_device, sector, size); + break; case P_RS_CANCEL: break; default: @@ -5684,21 +5675,21 @@ static int got_OVResult(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) - drbd_ov_out_of_sync_found(device, sector, size); + drbd_ov_out_of_sync_found(peer_device, sector, size); else - ov_out_of_sync_print(device); + ov_out_of_sync_print(peer_device); if (!get_ldev(device)) return 0; drbd_rs_complete_io(device, sector); - dec_rs_pending(device); + dec_rs_pending(peer_device); --device->ov_left; /* let's advance progress step marks only for every other megabyte */ if ((device->ov_left & 0x200) == 0x200) - drbd_advance_rs_marks(device, device->ov_left); + drbd_advance_rs_marks(peer_device, device->ov_left); if (device->ov_left == 0) { dw = kmalloc(sizeof(*dw), GFP_NOIO); @@ -5708,8 +5699,8 @@ static int got_OVResult(struct drbd_connection *connection, struct packet_info * drbd_queue_work(&peer_device->connection->sender_work, &dw->w); } else { drbd_err(device, "kmalloc(dw) failed."); - ov_out_of_sync_print(device); - drbd_resync_finished(device); + ov_out_of_sync_print(peer_device); + drbd_resync_finished(peer_device); } } put_ldev(device); @@ -5785,17 +5776,12 @@ int drbd_ack_receiver(struct drbd_thread *thi) unsigned int header_size = drbd_header_size(connection); int expect = header_size; bool ping_timeout_active = false; - struct sched_param param = { .sched_priority = 2 }; - rv = sched_setscheduler(current, SCHED_RR, ¶m); - if (rv < 0) - drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); + sched_set_fifo_low(current); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - conn_reclaim_net_peer_reqs(connection); - if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); @@ -5879,7 +5865,7 @@ int drbd_ack_receiver(struct drbd_thread *thi) err = cmd->fn(connection, &pi); if (err) { - drbd_err(connection, "%pf failed\n", cmd->fn); + drbd_err(connection, "%ps failed\n", cmd->fn); goto reconnect; } @@ -5927,7 +5913,7 @@ void drbd_send_acks_wf(struct work_struct *ws) rcu_read_unlock(); if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, true); err = drbd_finish_peer_reqs(device); kref_put(&device->kref, drbd_destroy_device); @@ -5940,7 +5926,7 @@ void drbd_send_acks_wf(struct work_struct *ws) } if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); + tcp_sock_set_cork(connection->meta.socket->sk, 
false);
 	return;
}
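
The crypto_ahash to crypto_shash conversions above (peer_integrity_tfm, verify_tfm, csums_tfm, and the cram_hmac descriptor in drbd_do_auth()) all follow one pattern: these paths only ever compute short, synchronous one-shot digests, and the old ahash allocations already masked out asynchronous implementations by passing CRYPTO_ALG_ASYNC as the mask, so the simpler shash API is a direct replacement. Digest buffers are sized with crypto_shash_digestsize(), and the descriptor is kmalloc'ed from sizeof(struct shash_desc) plus crypto_shash_descsize(), as drbd_do_auth() now does, instead of living on the stack. A minimal sketch of that pattern, assuming an illustrative helper name (example_keyed_digest is not a DRBD function):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>

/* One-shot keyed digest via the synchronous shash API; mirrors the
 * allocate/digest/teardown sequence this patch adopts.  Illustrative
 * only -- not a function in drbd_receiver.c. */
static int example_keyed_digest(const char *alg,
				const u8 *key, unsigned int key_len,
				const u8 *data, unsigned int data_len,
				u8 *out, unsigned int out_len)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	/* was: crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC) */
	tfm = crypto_alloc_shash(alg, 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = -EINVAL;
	if (crypto_shash_digestsize(tfm) > out_len)
		goto out_free_tfm;

	err = crypto_shash_setkey(tfm, key, key_len);
	if (err)
		goto out_free_tfm;

	/* The descriptor size depends on the algorithm, so it is
	 * allocated dynamically rather than with SHASH_DESC_ON_STACK,
	 * matching the drbd_do_auth() change above. */
	err = -ENOMEM;
	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc)
		goto out_free_tfm;
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, data, data_len, out);

	shash_desc_zero(desc);
	kfree(desc);
out_free_tfm:
	crypto_free_shash(tfm);
	return err;
}

crypto_shash_digest() performs init/update/final in one call, which is all these receiver paths need; there is no asynchronous completion to wait for, so errors reduce to plain return codes.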

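Likewise, the hunks that replace the synchronize_rcu(); kfree(old_net_conf); sequence (and its disk_conf counterpart) with kvfree_rcu_mightsleep() change only how the old configuration is reclaimed, not how the new one is published: the updater still swaps the pointer with rcu_assign_pointer() under the conf_update mutex, but no longer stalls for a full grace period before returning. A sketch of the pattern under illustrative names (example_conf, example_conf_ptr, example_conf_mutex and example_update_conf are not DRBD identifiers; DRBD applies this to net_conf and disk_conf):

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_conf {
	int setting;
};

static struct example_conf __rcu *example_conf_ptr;	/* illustrative */
static DEFINE_MUTEX(example_conf_mutex);		/* plays the role of conf_update */

/* Publish a new config object, then reclaim the old one after a
 * grace period without blocking the updater on synchronize_rcu(). */
static int example_update_conf(int new_setting)
{
	struct example_conf *new_conf, *old_conf;

	new_conf = kmalloc(sizeof(*new_conf), GFP_KERNEL);
	if (!new_conf)
		return -ENOMEM;
	new_conf->setting = new_setting;

	mutex_lock(&example_conf_mutex);
	old_conf = rcu_dereference_protected(example_conf_ptr,
					     lockdep_is_held(&example_conf_mutex));
	rcu_assign_pointer(example_conf_ptr, new_conf);
	mutex_unlock(&example_conf_mutex);

	/* was:
	 *	synchronize_rcu();
	 *	kfree(old_conf);
	 * The single-argument kvfree_rcu variant queues the free to
	 * happen after a grace period instead of waiting here. */
	kvfree_rcu_mightsleep(old_conf);
	return 0;
}

Readers are untouched by this change: they still bracket rcu_dereference() with rcu_read_lock()/rcu_read_unlock(). The _mightsleep suffix signals that the call may still block (it falls back to synchronize_rcu() when it cannot allocate), which is fine in these process-context receiver paths.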