summaryrefslogtreecommitdiff
path: root/drivers/block/nbd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/nbd.c')
-rw-r--r--drivers/block/nbd.c1551
1 files changed, 1010 insertions, 541 deletions
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7c9a949e876b..f6c33b21f69e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Network block device - make block devices work over TCP
*
@@ -7,11 +8,11 @@
* Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
* Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
*
- * This file is released under GPLv2 or later.
- *
* (part of code stolen from loop.c)
*/
+#define pr_fmt(fmt) "nbd: " fmt
+
#include <linux/major.h>
#include <linux/blkdev.h>
@@ -27,6 +28,7 @@
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
+#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -44,8 +46,12 @@
#include <linux/nbd-netlink.h>
#include <net/genetlink.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/nbd.h>
+
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
+static struct workqueue_struct *nbd_del_wq;
static int nbd_total_devices = 0;
struct nbd_sock {
@@ -56,11 +62,13 @@ struct nbd_sock {
bool dead;
int fallback_index;
int cookie;
+ struct work_struct work;
};
struct recv_thread_args {
struct work_struct work;
struct nbd_device *nbd;
+ struct nbd_sock *nsock;
int index;
};
@@ -69,14 +77,17 @@ struct link_dead_args {
int index;
};
-#define NBD_TIMEDOUT 0
+#define NBD_RT_TIMEDOUT 0
+#define NBD_RT_DISCONNECT_REQUESTED 1
+#define NBD_RT_DISCONNECTED 2
+#define NBD_RT_HAS_PID_FILE 3
+#define NBD_RT_HAS_CONFIG_REF 4
+#define NBD_RT_BOUND 5
+#define NBD_RT_DISCONNECT_ON_CLOSE 6
+#define NBD_RT_HAS_BACKEND_FILE 7
+
+#define NBD_DESTROY_ON_DISCONNECT 0
#define NBD_DISCONNECT_REQUESTED 1
-#define NBD_DISCONNECTED 2
-#define NBD_HAS_PID_FILE 3
-#define NBD_HAS_CONFIG_REF 4
-#define NBD_BOUND 5
-#define NBD_DESTROY_ON_DISCONNECT 6
-#define NBD_DISCONNECT_ON_CLOSE 7
struct nbd_config {
u32 flags;
@@ -90,13 +101,18 @@ struct nbd_config {
atomic_t recv_threads;
wait_queue_head_t recv_wq;
- loff_t blksize;
+ unsigned int blksize_bits;
loff_t bytesize;
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *dbg_dir;
#endif
};
+static inline unsigned int nbd_blksize(struct nbd_config *config)
+{
+ return 1u << config->blksize_bits;
+}
+
struct nbd_device {
struct blk_mq_tag_set tag_set;
@@ -106,19 +122,35 @@ struct nbd_device {
struct nbd_config *config;
struct mutex config_lock;
struct gendisk *disk;
+ struct workqueue_struct *recv_workq;
+ struct work_struct remove_work;
struct list_head list;
- struct task_struct *task_recv;
struct task_struct *task_setup;
+
+ unsigned long flags;
+ pid_t pid; /* pid of nbd-client, if attached */
+
+ char *backend;
};
#define NBD_CMD_REQUEUED 1
+/*
+ * This flag will be set if nbd_queue_rq() succeed, and will be checked and
+ * cleared in completion. Both setting and clearing of the flag are protected
+ * by cmd->lock.
+ */
+#define NBD_CMD_INFLIGHT 2
+
+/* Just part of request header or data payload is sent successfully */
+#define NBD_CMD_PARTIAL_SEND 3
struct nbd_cmd {
struct nbd_device *nbd;
struct mutex lock;
int index;
int cookie;
+ int retries;
blk_status_t status;
unsigned long flags;
u32 cmd_cookie;
@@ -130,11 +162,10 @@ static struct dentry *nbd_dbg_dir;
#define nbd_name(nbd) ((nbd)->disk->disk_name)
-#define NBD_MAGIC 0x68797548
+#define NBD_DEF_BLKSIZE_BITS 10
static unsigned int nbds_max = 16;
static int max_part = 16;
-static struct workqueue_struct *recv_workqueue;
static int part_shift;
static int nbd_dev_dbg_init(struct nbd_device *nbd);
@@ -154,6 +185,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
+ lockdep_assert_held(&cmd->lock);
+
+ /*
+ * Clear INFLIGHT flag so that this cmd won't be completed in
+ * normal completion path
+ *
+ * INFLIGHT flag will be set when the cmd is queued to nbd next
+ * time.
+ */
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+
if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
blk_mq_requeue_request(req, true);
}
@@ -195,9 +237,9 @@ static ssize_t pid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
- struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
+ struct nbd_device *nbd = disk->private_data;
- return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
+ return sprintf(buf, "%d\n", nbd->pid);
}
static const struct device_attribute pid_attr = {
@@ -205,36 +247,59 @@ static const struct device_attribute pid_attr = {
.show = pid_show,
};
+static ssize_t backend_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nbd_device *nbd = disk->private_data;
+
+ return sprintf(buf, "%s\n", nbd->backend ?: "");
+}
+
+static const struct device_attribute backend_attr = {
+ .attr = { .name = "backend", .mode = 0444},
+ .show = backend_show,
+};
+
static void nbd_dev_remove(struct nbd_device *nbd)
{
struct gendisk *disk = nbd->disk;
- struct request_queue *q;
- if (disk) {
- q = disk->queue;
- del_gendisk(disk);
- blk_cleanup_queue(q);
- blk_mq_free_tag_set(&nbd->tag_set);
- disk->private_data = NULL;
- put_disk(disk);
- }
- kfree(nbd);
+ del_gendisk(disk);
+ blk_mq_free_tag_set(&nbd->tag_set);
+
+ /*
+ * Remove from idr after del_gendisk() completes, so if the same ID is
+ * reused, the following add_disk() will succeed.
+ */
+ mutex_lock(&nbd_index_mutex);
+ idr_remove(&nbd_index_idr, nbd->index);
+ mutex_unlock(&nbd_index_mutex);
+ destroy_workqueue(nbd->recv_workq);
+ put_disk(disk);
+}
+
+static void nbd_dev_remove_work(struct work_struct *work)
+{
+ nbd_dev_remove(container_of(work, struct nbd_device, remove_work));
}
static void nbd_put(struct nbd_device *nbd)
{
- if (refcount_dec_and_mutex_lock(&nbd->refs,
- &nbd_index_mutex)) {
- idr_remove(&nbd_index_idr, nbd->index);
- mutex_unlock(&nbd_index_mutex);
+ if (!refcount_dec_and_test(&nbd->refs))
+ return;
+
+ /* Call del_gendisk() asynchrounously to prevent deadlock */
+ if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
+ queue_work(nbd_del_wq, &nbd->remove_work);
+ else
nbd_dev_remove(nbd);
- }
}
static int nbd_disconnected(struct nbd_config *config)
{
- return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
- test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+ return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
+ test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
}
static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
@@ -246,15 +311,15 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
if (args) {
INIT_WORK(&args->work, nbd_dead_link_work);
args->index = nbd->index;
- queue_work(system_wq, &args->work);
+ queue_work(system_percpu_wq, &args->work);
}
}
if (!nsock->dead) {
kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
if (atomic_dec_return(&nbd->config->live_connections) == 0) {
- if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
+ if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
&nbd->config->runtime_flags)) {
- set_bit(NBD_DISCONNECTED,
+ set_bit(NBD_RT_DISCONNECTED,
&nbd->config->runtime_flags);
dev_info(nbd_to_dev(nbd),
"Disconnected due to user request.\n");
@@ -266,46 +331,55 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0;
}
-static void nbd_size_clear(struct nbd_device *nbd)
+static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize)
{
- if (nbd->config->bytesize) {
- set_capacity(nbd->disk, 0);
- kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
- }
-}
+ struct queue_limits lim;
+ int error;
-static void nbd_size_update(struct nbd_device *nbd)
-{
- struct nbd_config *config = nbd->config;
- struct block_device *bdev = bdget_disk(nbd->disk, 0);
+ if (!blksize)
+ blksize = 1u << NBD_DEF_BLKSIZE_BITS;
- if (config->flags & NBD_FLAG_SEND_TRIM) {
- nbd->disk->queue->limits.discard_granularity = config->blksize;
- nbd->disk->queue->limits.discard_alignment = config->blksize;
- blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
- }
- blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
- blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
- set_capacity(nbd->disk, config->bytesize >> 9);
- if (bdev) {
- if (bdev->bd_disk) {
- bd_set_size(bdev, config->bytesize);
- set_blocksize(bdev, config->blksize);
- } else
- bdev->bd_invalidated = 1;
- bdput(bdev);
- }
- kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
-}
+ if (blk_validate_block_size(blksize))
+ return -EINVAL;
-static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
- loff_t nr_blocks)
-{
- struct nbd_config *config = nbd->config;
- config->blksize = blocksize;
- config->bytesize = blocksize * nr_blocks;
- if (nbd->task_recv != NULL)
- nbd_size_update(nbd);
+ if (bytesize < 0)
+ return -EINVAL;
+
+ nbd->config->bytesize = bytesize;
+ nbd->config->blksize_bits = __ffs(blksize);
+
+ if (!nbd->pid)
+ return 0;
+
+ lim = queue_limits_start_update(nbd->disk->queue);
+ if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
+ lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT;
+ else
+ lim.max_hw_discard_sectors = 0;
+ if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) {
+ lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
+ } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) {
+ lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
+ } else {
+ lim.features |= BLK_FEAT_WRITE_CACHE;
+ lim.features &= ~BLK_FEAT_FUA;
+ }
+ if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
+ lim.features |= BLK_FEAT_ROTATIONAL;
+ if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT;
+
+ lim.logical_block_size = blksize;
+ lim.physical_block_size = blksize;
+ error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim);
+ if (error)
+ return error;
+
+ if (max_part)
+ set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
+ if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
+ kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+ return 0;
}
static void nbd_complete_rq(struct request *req)
@@ -328,7 +402,7 @@ static void sock_shutdown(struct nbd_device *nbd)
if (config->num_connections == 0)
return;
- if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
+ if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
return;
for (i = 0; i < config->num_connections; i++) {
@@ -340,32 +414,81 @@ static void sock_shutdown(struct nbd_device *nbd)
dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
}
-static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
- bool reserved)
+static u32 req_to_nbd_cmd_type(struct request *req)
+{
+ switch (req_op(req)) {
+ case REQ_OP_DISCARD:
+ return NBD_CMD_TRIM;
+ case REQ_OP_FLUSH:
+ return NBD_CMD_FLUSH;
+ case REQ_OP_WRITE:
+ return NBD_CMD_WRITE;
+ case REQ_OP_READ:
+ return NBD_CMD_READ;
+ case REQ_OP_WRITE_ZEROES:
+ return NBD_CMD_WRITE_ZEROES;
+ default:
+ return U32_MAX;
+ }
+}
+
+static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd)
+{
+ if (refcount_inc_not_zero(&nbd->config_refs)) {
+ /*
+ * Add smp_mb__after_atomic to ensure that reading nbd->config_refs
+ * and reading nbd->config is ordered. The pair is the barrier in
+ * nbd_alloc_and_init_config(), avoid nbd->config_refs is set
+ * before nbd->config.
+ */
+ smp_mb__after_atomic();
+ return nbd->config;
+ }
+
+ return NULL;
+}
+
+static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
struct nbd_device *nbd = cmd->nbd;
struct nbd_config *config;
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ if (!mutex_trylock(&cmd->lock))
+ return BLK_EH_RESET_TIMER;
+
+ /* partial send is handled in nbd_sock's work function */
+ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) {
+ mutex_unlock(&cmd->lock);
+ return BLK_EH_RESET_TIMER;
+ }
+
+ if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+ mutex_unlock(&cmd->lock);
+ return BLK_EH_DONE;
+ }
+
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
cmd->status = BLK_STS_TIMEOUT;
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+ mutex_unlock(&cmd->lock);
goto done;
}
- config = nbd->config;
-
- if (!mutex_trylock(&cmd->lock))
- return BLK_EH_RESET_TIMER;
- if (config->num_connections > 1) {
+ if (config->num_connections > 1 ||
+ (config->num_connections == 1 && nbd->tag_set.timeout)) {
dev_err_ratelimited(nbd_to_dev(nbd),
"Connection timed out, retrying (%d/%d alive)\n",
atomic_read(&config->live_connections),
config->num_connections);
/*
* Hooray we have more connections, requeue this IO, the submit
- * path will put it on a real connection.
+ * path will put it on a real connection. Or if only one
+ * connection is configured, the submit path will wait util
+ * a new connection is reconfigured or util dead timeout.
*/
- if (config->socks && config->num_connections > 1) {
+ if (config->socks) {
if (cmd->index < config->num_connections) {
struct nbd_sock *nsock =
config->socks[cmd->index];
@@ -380,17 +503,43 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
}
+ nbd_requeue_cmd(cmd);
mutex_unlock(&cmd->lock);
+ nbd_config_put(nbd);
+ return BLK_EH_DONE;
+ }
+ }
+
+ if (!nbd->tag_set.timeout) {
+ /*
+ * Userspace sets timeout=0 to disable socket disconnection,
+ * so just warn and reset the timer.
+ */
+ struct nbd_sock *nsock = config->socks[cmd->index];
+ cmd->retries++;
+ dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
+ req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
+ (unsigned long long)blk_rq_pos(req) << 9,
+ blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
+
+ mutex_lock(&nsock->tx_lock);
+ if (cmd->cookie != nsock->cookie) {
nbd_requeue_cmd(cmd);
+ mutex_unlock(&nsock->tx_lock);
+ mutex_unlock(&cmd->lock);
nbd_config_put(nbd);
return BLK_EH_DONE;
}
- } else {
- dev_err_ratelimited(nbd_to_dev(nbd),
- "Connection timed out\n");
+ mutex_unlock(&nsock->tx_lock);
+ mutex_unlock(&cmd->lock);
+ nbd_config_put(nbd);
+ return BLK_EH_RESET_TIMER;
}
- set_bit(NBD_TIMEDOUT, &config->runtime_flags);
+
+ dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
+ set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
cmd->status = BLK_STS_IOERR;
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
mutex_unlock(&cmd->lock);
sock_shutdown(nbd);
nbd_config_put(nbd);
@@ -399,16 +548,11 @@ done:
return BLK_EH_DONE;
}
-/*
- * Send or receive packet.
- */
-static int sock_xmit(struct nbd_device *nbd, int index, int send,
- struct iov_iter *iter, int msg_flags, int *sent)
+static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
+ struct iov_iter *iter, int msg_flags, int *sent)
{
- struct nbd_config *config = nbd->config;
- struct socket *sock = config->socks[index]->sock;
int result;
- struct msghdr msg;
+ struct msghdr msg = {} ;
unsigned int noreclaim_flag;
if (unlikely(!sock)) {
@@ -421,27 +565,27 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
msg.msg_iter = *iter;
noreclaim_flag = memalloc_noreclaim_save();
- do {
- sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = msg_flags | MSG_NOSIGNAL;
-
- if (send)
- result = sock_sendmsg(sock, &msg);
- else
- result = sock_recvmsg(sock, &msg, msg.msg_flags);
-
- if (result <= 0) {
- if (result == 0)
- result = -EPIPE; /* short read */
- break;
- }
- if (sent)
- *sent += result;
- } while (msg_data_left(&msg));
+
+ scoped_with_kernel_creds() {
+ do {
+ sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+ sock->sk->sk_use_task_frag = false;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (send)
+ result = sock_sendmsg(sock, &msg);
+ else
+ result = sock_recvmsg(sock, &msg, msg.msg_flags);
+
+ if (result <= 0) {
+ if (result == 0)
+ result = -EPIPE; /* short read */
+ break;
+ }
+ if (sent)
+ *sent += result;
+ } while (msg_data_left(&msg));
+ }
memalloc_noreclaim_restore(noreclaim_flag);
@@ -449,6 +593,19 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
}
/*
+ * Send or receive packet. Return a positive value on success and
+ * negtive value on failure, and never return 0.
+ */
+static int sock_xmit(struct nbd_device *nbd, int index, int send,
+ struct iov_iter *iter, int msg_flags, int *sent)
+{
+ struct nbd_config *config = nbd->config;
+ struct socket *sock = config->socks[index]->sock;
+
+ return __sock_xmit(nbd, sock, send, iter, msg_flags, sent);
+}
+
+/*
* Different settings for sk->sk_sndtimeo can result in different return values
* if there is a signal pending when we enter sendmsg, because reasons?
*/
@@ -457,8 +614,36 @@ static inline int was_interrupted(int result)
return result == -ERESTARTSYS || result == -EINTR;
}
-/* always call with the tx_lock held */
-static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
+/*
+ * We've already sent header or part of data payload, have no choice but
+ * to set pending and schedule it in work.
+ *
+ * And we have to return BLK_STS_OK to block core, otherwise this same
+ * request may be re-dispatched with different tag, but our header has
+ * been sent out with old tag, and this way does confuse reply handling.
+ */
+static void nbd_sched_pending_work(struct nbd_device *nbd,
+ struct nbd_sock *nsock,
+ struct nbd_cmd *cmd, int sent)
+{
+ struct request *req = blk_mq_rq_from_pdu(cmd);
+
+ /* pending work should be scheduled only once */
+ WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags));
+
+ nsock->pending = req;
+ nsock->sent = sent;
+ set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+ refcount_inc(&nbd->config_refs);
+ schedule_work(&nsock->work);
+}
+
+/*
+ * Returns BLK_STS_RESOURCE if the caller should retry after a delay.
+ * Returns BLK_STS_IOERR if sending failed.
+ */
+static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
+ int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_config *config = nbd->config;
@@ -467,41 +652,32 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
struct iov_iter from;
- unsigned long size = blk_rq_bytes(req);
struct bio *bio;
u64 handle;
u32 type;
u32 nbd_cmd_flags = 0;
int sent = nsock->sent, skip = 0;
- iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
+ lockdep_assert_held(&cmd->lock);
+ lockdep_assert_held(&nsock->tx_lock);
- switch (req_op(req)) {
- case REQ_OP_DISCARD:
- type = NBD_CMD_TRIM;
- break;
- case REQ_OP_FLUSH:
- type = NBD_CMD_FLUSH;
- break;
- case REQ_OP_WRITE:
- type = NBD_CMD_WRITE;
- break;
- case REQ_OP_READ:
- type = NBD_CMD_READ;
- break;
- default:
- return -EIO;
- }
+ iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
+
+ type = req_to_nbd_cmd_type(req);
+ if (type == U32_MAX)
+ return BLK_STS_IOERR;
if (rq_data_dir(req) == WRITE &&
(config->flags & NBD_FLAG_READ_ONLY)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Write on read-only\n");
- return -EIO;
+ return BLK_STS_IOERR;
}
if (req->cmd_flags & REQ_FUA)
nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
+ if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES))
+ nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;
/* We did a partial send previously, and we at least sent the whole
* request struct, so just go and send the rest of the pages in the
@@ -510,6 +686,10 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
if (sent) {
if (sent >= sizeof(request)) {
skip = sent - sizeof(request);
+
+ /* initialize handle for tracing purposes */
+ handle = nbd_cmd_handle(cmd);
+
goto send_pages;
}
iov_iter_advance(&from, sent);
@@ -518,36 +698,40 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
cmd->index = index;
cmd->cookie = nsock->cookie;
+ cmd->retries = 0;
request.type = htonl(type | nbd_cmd_flags);
if (type != NBD_CMD_FLUSH) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
- request.len = htonl(size);
+ request.len = htonl(blk_rq_bytes(req));
}
handle = nbd_cmd_handle(cmd);
- memcpy(request.handle, &handle, sizeof(handle));
+ request.cookie = cpu_to_be64(handle);
+
+ trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
req, nbdcmd_to_ascii(type),
(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
result = sock_xmit(nbd, index, 1, &from,
(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
- if (result <= 0) {
+ trace_nbd_header_sent(req, handle);
+ if (result < 0) {
if (was_interrupted(result)) {
- /* If we havne't sent anything we can just return BUSY,
+ /* If we haven't sent anything we can just return BUSY,
* however if we have sent something we need to make
* sure we only allow this req to be sent until we are
* completely done.
*/
if (sent) {
- nsock->pending = req;
- nsock->sent = sent;
+ nbd_sched_pending_work(nbd, nsock, cmd, sent);
+ return BLK_STS_OK;
}
set_bit(NBD_CMD_REQUEUED, &cmd->flags);
return BLK_STS_RESOURCE;
}
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Send control failed (result %d)\n", result);
- return -EAGAIN;
+ goto requeue;
}
send_pages:
if (type != NBD_CMD_WRITE)
@@ -565,7 +749,7 @@ send_pages:
dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
req, bvec.bv_len);
- iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
+ iov_iter_bvec(&from, ITER_SOURCE, &bvec, 1, bvec.bv_len);
if (skip) {
if (skip >= iov_iter_count(&from)) {
skip -= iov_iter_count(&from);
@@ -575,21 +759,15 @@ send_pages:
skip = 0;
}
result = sock_xmit(nbd, index, 1, &from, flags, &sent);
- if (result <= 0) {
+ if (result < 0) {
if (was_interrupted(result)) {
- /* We've already sent the header, we
- * have no choice but to set pending and
- * return BUSY.
- */
- nsock->pending = req;
- nsock->sent = sent;
- set_bit(NBD_CMD_REQUEUED, &cmd->flags);
- return BLK_STS_RESOURCE;
+ nbd_sched_pending_work(nbd, nsock, cmd, sent);
+ return BLK_STS_OK;
}
dev_err(disk_to_dev(nbd->disk),
"Send data failed (result %d)\n",
result);
- return -EAGAIN;
+ goto requeue;
}
/*
* The completion might already have come in,
@@ -603,43 +781,106 @@ send_pages:
bio = next;
}
out:
+ trace_nbd_payload_sent(req, handle);
nsock->pending = NULL;
nsock->sent = 0;
+ __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+ return BLK_STS_OK;
+
+requeue:
+ /*
+ * Can't requeue in case we are dealing with partial send
+ *
+ * We must run from pending work function.
+ * */
+ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))
+ return BLK_STS_OK;
+
+ /* retry on a different socket */
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Request send failed, requeueing\n");
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ nbd_requeue_cmd(cmd);
+ return BLK_STS_OK;
+}
+
+/* handle partial sending */
+static void nbd_pending_cmd_work(struct work_struct *work)
+{
+ struct nbd_sock *nsock = container_of(work, struct nbd_sock, work);
+ struct request *req = nsock->pending;
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+ struct nbd_device *nbd = cmd->nbd;
+ unsigned long deadline = READ_ONCE(req->deadline);
+ unsigned int wait_ms = 2;
+
+ mutex_lock(&cmd->lock);
+
+ WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags));
+ if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)))
+ goto out;
+
+ mutex_lock(&nsock->tx_lock);
+ while (true) {
+ nbd_send_cmd(nbd, cmd, cmd->index);
+ if (!nsock->pending)
+ break;
+
+ /* don't bother timeout handler for partial sending */
+ if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) {
+ cmd->status = BLK_STS_IOERR;
+ blk_mq_complete_request(req);
+ break;
+ }
+ msleep(wait_ms);
+ wait_ms *= 2;
+ }
+ mutex_unlock(&nsock->tx_lock);
+ clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+out:
+ mutex_unlock(&cmd->lock);
+ nbd_config_put(nbd);
+}
+
+static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock,
+ struct nbd_reply *reply)
+{
+ struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
+ struct iov_iter to;
+ int result;
+
+ reply->magic = 0;
+ iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply));
+ result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL);
+ if (result < 0) {
+ if (!nbd_disconnected(nbd->config))
+ dev_err(disk_to_dev(nbd->disk),
+ "Receive control failed (result %d)\n", result);
+ return result;
+ }
+
+ if (ntohl(reply->magic) != NBD_REPLY_MAGIC) {
+ dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
+ (unsigned long)ntohl(reply->magic));
+ return -EPROTO;
+ }
+
return 0;
}
/* NULL returned = something went wrong, inform userspace */
-static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
+static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
+ struct nbd_reply *reply)
{
- struct nbd_config *config = nbd->config;
int result;
- struct nbd_reply reply;
struct nbd_cmd *cmd;
struct request *req = NULL;
u64 handle;
u16 hwq;
u32 tag;
- struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
- struct iov_iter to;
int ret = 0;
- reply.magic = 0;
- iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
- result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
- if (result <= 0) {
- if (!nbd_disconnected(config))
- dev_err(disk_to_dev(nbd->disk),
- "Receive control failed (result %d)\n", result);
- return ERR_PTR(result);
- }
-
- if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
- dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
- (unsigned long)ntohl(reply.magic));
- return ERR_PTR(-EPROTO);
- }
-
- memcpy(&handle, reply.handle, sizeof(handle));
+ handle = be64_to_cpu(reply->cookie);
tag = nbd_handle_to_tag(handle);
hwq = blk_mq_unique_tag_to_hwq(tag);
if (hwq < nbd->tag_set.nr_hw_queues)
@@ -650,24 +891,43 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
tag, req);
return ERR_PTR(-ENOENT);
}
+ trace_nbd_header_received(req, handle);
cmd = blk_mq_rq_to_pdu(req);
mutex_lock(&cmd->lock);
+ if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+ dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
+ tag, cmd->status, cmd->flags);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (cmd->index != index) {
+ dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)",
+ tag, index, cmd->index);
+ ret = -ENOENT;
+ goto out;
+ }
if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
ret = -ENOENT;
goto out;
}
+ if (cmd->status != BLK_STS_OK) {
+ dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
+ req);
+ ret = -ENOENT;
+ goto out;
+ }
if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
req);
ret = -ENOENT;
goto out;
}
- if (ntohl(reply.error)) {
+ if (ntohl(reply->error)) {
dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
- ntohl(reply.error));
+ ntohl(reply->error));
cmd->status = BLK_STS_IOERR;
goto out;
}
@@ -676,22 +936,21 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
if (rq_data_dir(req) != WRITE) {
struct req_iterator iter;
struct bio_vec bvec;
+ struct iov_iter to;
rq_for_each_segment(bvec, req, iter) {
- iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
+ iov_iter_bvec(&to, ITER_DEST, &bvec, 1, bvec.bv_len);
result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
- if (result <= 0) {
+ if (result < 0) {
dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
result);
/*
- * If we've disconnected or we only have 1
- * connection then we need to make sure we
+ * If we've disconnected, we need to make sure we
* complete this request, otherwise error out
* and let the timeout stuff handle resubmitting
* this request onto another connection.
*/
- if (nbd_disconnected(config) ||
- config->num_connections <= 1) {
+ if (nbd_disconnected(nbd->config)) {
cmd->status = BLK_STS_IOERR;
goto out;
}
@@ -703,6 +962,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
}
}
out:
+ trace_nbd_payload_received(req, handle);
mutex_unlock(&cmd->lock);
return ret ? ERR_PTR(ret) : cmd;
}
@@ -714,32 +974,75 @@ static void recv_work(struct work_struct *work)
work);
struct nbd_device *nbd = args->nbd;
struct nbd_config *config = nbd->config;
+ struct request_queue *q = nbd->disk->queue;
+ struct nbd_sock *nsock = args->nsock;
struct nbd_cmd *cmd;
+ struct request *rq;
while (1) {
- cmd = nbd_read_stat(nbd, args->index);
- if (IS_ERR(cmd)) {
- struct nbd_sock *nsock = config->socks[args->index];
+ struct nbd_reply reply;
- mutex_lock(&nsock->tx_lock);
- nbd_mark_nsock_dead(nbd, nsock, 1);
- mutex_unlock(&nsock->tx_lock);
+ if (nbd_read_reply(nbd, nsock->sock, &reply))
+ break;
+
+ /*
+ * Grab .q_usage_counter so request pool won't go away, then no
+ * request use-after-free is possible during nbd_handle_reply().
+ * If queue is frozen, there won't be any inflight requests, we
+ * needn't to handle the incoming garbage message.
+ */
+ if (!percpu_ref_tryget(&q->q_usage_counter)) {
+ dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n",
+ __func__);
+ break;
+ }
+
+ cmd = nbd_handle_reply(nbd, args->index, &reply);
+ if (IS_ERR(cmd)) {
+ percpu_ref_put(&q->q_usage_counter);
break;
}
- blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
+ rq = blk_mq_rq_from_pdu(cmd);
+ if (likely(!blk_should_fake_timeout(rq->q))) {
+ bool complete;
+
+ mutex_lock(&cmd->lock);
+ complete = __test_and_clear_bit(NBD_CMD_INFLIGHT,
+ &cmd->flags);
+ mutex_unlock(&cmd->lock);
+ if (complete)
+ blk_mq_complete_request(rq);
+ }
+ percpu_ref_put(&q->q_usage_counter);
}
+
+ mutex_lock(&nsock->tx_lock);
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
+
atomic_dec(&config->recv_threads);
wake_up(&config->recv_wq);
nbd_config_put(nbd);
kfree(args);
}
-static bool nbd_clear_req(struct request *req, void *data, bool reserved)
+static bool nbd_clear_req(struct request *req, void *data)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+ /* don't abort one completed request */
+ if (blk_mq_request_completed(req))
+ return true;
+
+ mutex_lock(&cmd->lock);
+ if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+ mutex_unlock(&cmd->lock);
+ return true;
+ }
cmd->status = BLK_STS_IOERR;
+ mutex_unlock(&cmd->lock);
+
blk_mq_complete_request(req);
return true;
}
@@ -759,12 +1062,12 @@ static int find_fallback(struct nbd_device *nbd, int index)
struct nbd_sock *nsock = config->socks[index];
int fallback = nsock->fallback_index;
- if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+ if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
return new_index;
if (config->num_connections <= 1) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on invalid socket\n");
+ "Dead connection, failed to find a fallback\n");
return new_index;
}
@@ -800,35 +1103,39 @@ static int wait_for_reconnect(struct nbd_device *nbd)
struct nbd_config *config = nbd->config;
if (!config->dead_conn_timeout)
return 0;
- if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+
+ if (!wait_event_timeout(config->conn_wait,
+ test_bit(NBD_RT_DISCONNECTED,
+ &config->runtime_flags) ||
+ atomic_read(&config->live_connections) > 0,
+ config->dead_conn_timeout))
return 0;
- return wait_event_timeout(config->conn_wait,
- atomic_read(&config->live_connections) > 0,
- config->dead_conn_timeout) > 0;
+
+ return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
}
-static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
+static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_device *nbd = cmd->nbd;
struct nbd_config *config;
struct nbd_sock *nsock;
- int ret;
+ blk_status_t ret;
+
+ lockdep_assert_held(&cmd->lock);
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Socks array is empty\n");
- blk_mq_start_request(req);
- return -EINVAL;
+ return BLK_STS_IOERR;
}
- config = nbd->config;
if (index >= config->num_connections) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Attempted send on invalid socket\n");
nbd_config_put(nbd);
- blk_mq_start_request(req);
- return -EINVAL;
+ return BLK_STS_IOERR;
}
cmd->status = BLK_STS_OK;
again:
@@ -851,8 +1158,7 @@ again:
*/
sock_shutdown(nbd);
nbd_config_put(nbd);
- blk_mq_start_request(req);
- return -EIO;
+ return BLK_STS_IOERR;
}
goto again;
}
@@ -865,21 +1171,10 @@ again:
blk_mq_start_request(req);
if (unlikely(nsock->pending && nsock->pending != req)) {
nbd_requeue_cmd(cmd);
- ret = 0;
+ ret = BLK_STS_OK;
goto out;
}
- /*
- * Some failures are related to the link going down, so anything that
- * returns EAGAIN can be retried on a different socket.
- */
ret = nbd_send_cmd(nbd, cmd, index);
- if (ret == -EAGAIN) {
- dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Request send failed, requeueing\n");
- nbd_mark_nsock_dead(nbd, nsock, 1);
- nbd_requeue_cmd(cmd);
- ret = 0;
- }
out:
mutex_unlock(&nsock->tx_lock);
nbd_config_put(nbd);
@@ -890,7 +1185,7 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
- int ret;
+ blk_status_t ret;
/*
* Since we look at the bio's to send the request over the network we
@@ -910,15 +1205,39 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
* appropriate.
*/
ret = nbd_handle_cmd(cmd, hctx->queue_num);
- if (ret < 0)
- ret = BLK_STS_IOERR;
- else if (!ret)
- ret = BLK_STS_OK;
mutex_unlock(&cmd->lock);
return ret;
}
+static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
+ int *err)
+{
+ struct socket *sock;
+
+ *err = 0;
+ sock = sockfd_lookup(fd, err);
+ if (!sock)
+ return NULL;
+
+ if (!sk_is_tcp(sock->sk) &&
+ !sk_is_stream_unix(sock->sk)) {
+ dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n");
+ *err = -EINVAL;
+ sockfd_put(sock);
+ return NULL;
+ }
+
+ if (sock->ops->shutdown == sock_no_shutdown) {
+ dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
+ *err = -EINVAL;
+ sockfd_put(sock);
+ return NULL;
+ }
+
+ return sock;
+}
+
static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
bool netlink)
{
@@ -926,35 +1245,47 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
struct socket *sock;
struct nbd_sock **socks;
struct nbd_sock *nsock;
+ unsigned int memflags;
int err;
- sock = sockfd_lookup(arg, &err);
+ /* Arg will be cast to int, check it to avoid overflow */
+ if (arg > INT_MAX)
+ return -EINVAL;
+ sock = nbd_get_socket(nbd, arg, &err);
if (!sock)
return err;
+ /*
+ * We need to make sure we don't get any errant requests while we're
+ * reallocating the ->socks array.
+ */
+ memflags = blk_mq_freeze_queue(nbd->disk->queue);
+
if (!netlink && !nbd->task_setup &&
- !test_bit(NBD_BOUND, &config->runtime_flags))
+ !test_bit(NBD_RT_BOUND, &config->runtime_flags))
nbd->task_setup = current;
if (!netlink &&
(nbd->task_setup != current ||
- test_bit(NBD_BOUND, &config->runtime_flags))) {
+ test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
dev_err(disk_to_dev(nbd->disk),
"Device being setup by another task");
- sockfd_put(sock);
- return -EBUSY;
+ err = -EBUSY;
+ goto put_socket;
+ }
+
+ nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
+ if (!nsock) {
+ err = -ENOMEM;
+ goto put_socket;
}
socks = krealloc(config->socks, (config->num_connections + 1) *
sizeof(struct nbd_sock *), GFP_KERNEL);
if (!socks) {
- sockfd_put(sock);
- return -ENOMEM;
- }
- nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
- if (!nsock) {
- sockfd_put(sock);
- return -ENOMEM;
+ kfree(nsock);
+ err = -ENOMEM;
+ goto put_socket;
}
config->socks = socks;
@@ -966,10 +1297,17 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
nsock->pending = NULL;
nsock->sent = 0;
nsock->cookie = 0;
+ INIT_WORK(&nsock->work, nbd_pending_cmd_work);
socks[config->num_connections++] = nsock;
atomic_inc(&config->live_connections);
+ blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
return 0;
+
+put_socket:
+ blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
+ sockfd_put(sock);
+ return err;
}
static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
@@ -980,7 +1318,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
int i;
int err;
- sock = sockfd_lookup(arg, &err);
+ sock = nbd_get_socket(nbd, arg, &err);
if (!sock)
return err;
@@ -1013,16 +1351,17 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
INIT_WORK(&args->work, recv_work);
args->index = i;
args->nbd = nbd;
+ args->nsock = nsock;
nsock->cookie++;
mutex_unlock(&nsock->tx_lock);
sockfd_put(old);
- clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
+ clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
/* We take the tx_mutex in an error path in the recv_work, so we
* need to queue_work outside of the tx_mutex.
*/
- queue_work(recv_workqueue, &args->work);
+ queue_work(nbd->recv_workq, &args->work);
atomic_inc(&config->live_connections);
wake_up(&config->conn_wait);
@@ -1033,30 +1372,19 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
return -ENOSPC;
}
-static void nbd_bdev_reset(struct block_device *bdev)
+static void nbd_bdev_reset(struct nbd_device *nbd)
{
- if (bdev->bd_openers > 1)
+ if (disk_openers(nbd->disk) > 1)
return;
- bd_set_size(bdev, 0);
+ set_capacity(nbd->disk, 0);
}
static void nbd_parse_flags(struct nbd_device *nbd)
{
- struct nbd_config *config = nbd->config;
- if (config->flags & NBD_FLAG_READ_ONLY)
+ if (nbd->config->flags & NBD_FLAG_READ_ONLY)
set_disk_ro(nbd->disk, true);
else
set_disk_ro(nbd->disk, false);
- if (config->flags & NBD_FLAG_SEND_TRIM)
- blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
- if (config->flags & NBD_FLAG_SEND_FLUSH) {
- if (config->flags & NBD_FLAG_SEND_FUA)
- blk_queue_write_cache(nbd->disk->queue, true, true);
- else
- blk_queue_write_cache(nbd->disk->queue, true, false);
- }
- else
- blk_queue_write_cache(nbd->disk->queue, false, false);
}
static void send_disconnects(struct nbd_device *nbd)
@@ -1073,10 +1401,10 @@ static void send_disconnects(struct nbd_device *nbd)
for (i = 0; i < config->num_connections; i++) {
struct nbd_sock *nsock = config->socks[i];
- iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
+ iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
mutex_lock(&nsock->tx_lock);
ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
- if (ret <= 0)
+ if (ret < 0)
dev_err(disk_to_dev(nbd->disk),
"Send disconnect failed %d\n", ret);
mutex_unlock(&nsock->tx_lock);
@@ -1088,7 +1416,8 @@ static int nbd_disconnect(struct nbd_device *nbd)
struct nbd_config *config = nbd->config;
dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
- set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+ set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
+ set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
send_disconnects(nbd);
return 0;
}
@@ -1106,11 +1435,19 @@ static void nbd_config_put(struct nbd_device *nbd)
&nbd->config_lock)) {
struct nbd_config *config = nbd->config;
nbd_dev_dbg_close(nbd);
- nbd_size_clear(nbd);
- if (test_and_clear_bit(NBD_HAS_PID_FILE,
+ invalidate_disk(nbd->disk);
+ if (nbd->config->bytesize)
+ kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+ if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
&config->runtime_flags))
device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
- nbd->task_recv = NULL;
+ nbd->pid = 0;
+ if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
+ &config->runtime_flags)) {
+ device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
+ kfree(nbd->backend);
+ nbd->backend = NULL;
+ }
nbd_clear_sock(nbd);
if (config->num_connections) {
int i;
@@ -1124,10 +1461,6 @@ static void nbd_config_put(struct nbd_device *nbd)
nbd->config = NULL;
nbd->tag_set.timeout = 0;
- nbd->disk->queue->limits.discard_granularity = 0;
- nbd->disk->queue->limits.discard_alignment = 0;
- blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
- blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
@@ -1141,7 +1474,7 @@ static int nbd_start_device(struct nbd_device *nbd)
int num_connections = config->num_connections;
int error = 0, i;
- if (nbd->task_recv)
+ if (nbd->pid)
return -EBUSY;
if (!config->socks)
return -EINVAL;
@@ -1151,17 +1484,27 @@ static int nbd_start_device(struct nbd_device *nbd)
return -EINVAL;
}
- blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
- nbd->task_recv = current;
+retry:
+ mutex_unlock(&nbd->config_lock);
+ blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections);
+ mutex_lock(&nbd->config_lock);
+
+ /* if another code path updated nr_hw_queues, retry until succeed */
+ if (num_connections != config->num_connections) {
+ num_connections = config->num_connections;
+ goto retry;
+ }
+
+ nbd->pid = task_pid_nr(current);
nbd_parse_flags(nbd);
error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (error) {
- dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+ dev_err(disk_to_dev(nbd->disk), "device_create_file failed for pid!\n");
return error;
}
- set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
+ set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
nbd_dev_dbg_init(nbd);
for (i = 0; i < num_connections; i++) {
@@ -1170,6 +1513,16 @@ static int nbd_start_device(struct nbd_device *nbd)
args = kzalloc(sizeof(*args), GFP_KERNEL);
if (!args) {
sock_shutdown(nbd);
+ /*
+ * If num_connections is m (2 < m),
+ * and NO.1 ~ NO.n(1 < n < m) kzallocs are successful.
+ * But NO.(n + 1) failed. We still have n recv threads.
+ * So, add flush_workqueue here to prevent recv threads
+ * dropping the last config_refs and trying to destroy
+ * the workqueue from inside the workqueue.
+ */
+ if (i)
+ flush_workqueue(nbd->recv_workq);
return -ENOMEM;
}
sk_set_memalloc(config->socks[i]->sock->sk);
@@ -1180,14 +1533,14 @@ static int nbd_start_device(struct nbd_device *nbd)
refcount_inc(&nbd->config_refs);
INIT_WORK(&args->work, recv_work);
args->nbd = nbd;
+ args->nsock = config->socks[i];
args->index = i;
- queue_work(recv_workqueue, &args->work);
+ queue_work(nbd->recv_workq, &args->work);
}
- nbd_size_update(nbd);
- return error;
+ return nbd_set_size(nbd, config->bytesize, nbd_blksize(config));
}
-static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_start_device_ioctl(struct nbd_device *nbd)
{
struct nbd_config *config = nbd->config;
int ret;
@@ -1197,73 +1550,77 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
return ret;
if (max_part)
- bdev->bd_invalidated = 1;
+ set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
mutex_unlock(&nbd->config_lock);
ret = wait_event_interruptible(config->recv_wq,
atomic_read(&config->recv_threads) == 0);
- if (ret)
+ if (ret) {
sock_shutdown(nbd);
+ nbd_clear_que(nbd);
+ }
+
+ flush_workqueue(nbd->recv_workq);
mutex_lock(&nbd->config_lock);
- nbd_bdev_reset(bdev);
+ nbd_bdev_reset(nbd);
/* user requested, ignore socket errors */
- if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
+ if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
ret = 0;
- if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
+ if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
ret = -ETIMEDOUT;
return ret;
}
-static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
- struct block_device *bdev)
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd)
{
- sock_shutdown(nbd);
- kill_bdev(bdev);
- nbd_bdev_reset(bdev);
- if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+ nbd_clear_sock(nbd);
+ disk_force_media_change(nbd->disk);
+ nbd_bdev_reset(nbd);
+ if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
nbd_config_put(nbd);
}
+static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
+{
+ nbd->tag_set.timeout = timeout * HZ;
+ if (timeout)
+ blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+ else
+ blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
+}
+
/* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
struct nbd_config *config = nbd->config;
+ loff_t bytesize;
switch (cmd) {
case NBD_DISCONNECT:
return nbd_disconnect(nbd);
case NBD_CLEAR_SOCK:
- nbd_clear_sock_ioctl(nbd, bdev);
+ nbd_clear_sock_ioctl(nbd);
return 0;
case NBD_SET_SOCK:
return nbd_add_socket(nbd, arg, false);
case NBD_SET_BLKSIZE:
- if (!arg || !is_power_of_2(arg) || arg < 512 ||
- arg > PAGE_SIZE)
- return -EINVAL;
- nbd_size_set(nbd, arg,
- div_s64(config->bytesize, arg));
- return 0;
+ return nbd_set_size(nbd, config->bytesize, arg);
case NBD_SET_SIZE:
- nbd_size_set(nbd, config->blksize,
- div_s64(arg, config->blksize));
- return 0;
+ return nbd_set_size(nbd, arg, nbd_blksize(config));
case NBD_SET_SIZE_BLOCKS:
- nbd_size_set(nbd, config->blksize, arg);
- return 0;
+ if (check_shl_overflow(arg, config->blksize_bits, &bytesize))
+ return -EINVAL;
+ return nbd_set_size(nbd, bytesize, nbd_blksize(config));
case NBD_SET_TIMEOUT:
- if (arg) {
- nbd->tag_set.timeout = arg * HZ;
- blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
- }
+ nbd_set_cmd_timeout(nbd, arg);
return 0;
case NBD_SET_FLAGS:
config->flags = arg;
return 0;
case NBD_DO_IT:
- return nbd_start_device_ioctl(nbd, bdev);
+ return nbd_start_device_ioctl(nbd);
case NBD_CLEAR_QUE:
/*
* This is for compatibility only. The queue is always cleared
@@ -1280,7 +1637,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return -ENOTTY;
}
-static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
+static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nbd_device *nbd = bdev->bd_disk->private_data;
@@ -1301,7 +1658,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
/* Don't allow ioctl operations on a nbd device that was created with
* netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
*/
- if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+ if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
(cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
error = __nbd_ioctl(bdev, nbd, cmd, arg);
else
@@ -1310,29 +1667,49 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
return error;
}
-static struct nbd_config *nbd_alloc_config(void)
+static int nbd_alloc_and_init_config(struct nbd_device *nbd)
{
struct nbd_config *config;
+ if (WARN_ON(nbd->config))
+ return -EINVAL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
- if (!config)
- return NULL;
+ if (!config) {
+ module_put(THIS_MODULE);
+ return -ENOMEM;
+ }
+
atomic_set(&config->recv_threads, 0);
init_waitqueue_head(&config->recv_wq);
init_waitqueue_head(&config->conn_wait);
- config->blksize = 1024;
+ config->blksize_bits = NBD_DEF_BLKSIZE_BITS;
atomic_set(&config->live_connections, 0);
- try_module_get(THIS_MODULE);
- return config;
+
+ nbd->config = config;
+ /*
+ * Order refcount_set(&nbd->config_refs, 1) and nbd->config assignment,
+ * its pair is the barrier in nbd_get_config_unlocked().
+ * So nbd_get_config_unlocked() won't see nbd->config as null after
+ * refcount_inc_not_zero() succeed.
+ */
+ smp_mb__before_atomic();
+ refcount_set(&nbd->config_refs, 1);
+
+ return 0;
}
-static int nbd_open(struct block_device *bdev, fmode_t mode)
+static int nbd_open(struct gendisk *disk, blk_mode_t mode)
{
struct nbd_device *nbd;
+ struct nbd_config *config;
int ret = 0;
mutex_lock(&nbd_index_mutex);
- nbd = bdev->bd_disk->private_data;
+ nbd = disk->private_data;
if (!nbd) {
ret = -ENXIO;
goto out;
@@ -1341,45 +1718,52 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
ret = -ENXIO;
goto out;
}
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
- struct nbd_config *config;
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
mutex_lock(&nbd->config_lock);
if (refcount_inc_not_zero(&nbd->config_refs)) {
mutex_unlock(&nbd->config_lock);
goto out;
}
- config = nbd->config = nbd_alloc_config();
- if (!config) {
- ret = -ENOMEM;
+ ret = nbd_alloc_and_init_config(nbd);
+ if (ret) {
mutex_unlock(&nbd->config_lock);
goto out;
}
- refcount_set(&nbd->config_refs, 1);
+
refcount_inc(&nbd->refs);
mutex_unlock(&nbd->config_lock);
- bdev->bd_invalidated = 1;
- } else if (nbd_disconnected(nbd->config)) {
- bdev->bd_invalidated = 1;
+ if (max_part)
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ } else if (nbd_disconnected(config)) {
+ if (max_part)
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
}
out:
mutex_unlock(&nbd_index_mutex);
return ret;
}
-static void nbd_release(struct gendisk *disk, fmode_t mode)
+static void nbd_release(struct gendisk *disk)
{
struct nbd_device *nbd = disk->private_data;
- struct block_device *bdev = bdget_disk(disk, 0);
- if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
- bdev->bd_openers == 0)
+ if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
+ disk_openers(disk) == 0)
nbd_disconnect_and_put(nbd);
nbd_config_put(nbd);
nbd_put(nbd);
}
+static void nbd_free_disk(struct gendisk *disk)
+{
+ struct nbd_device *nbd = disk->private_data;
+
+ kfree(nbd);
+}
+
static const struct block_device_operations nbd_fops =
{
.owner = THIS_MODULE,
@@ -1387,6 +1771,7 @@ static const struct block_device_operations nbd_fops =
.release = nbd_release,
.ioctl = nbd_ioctl,
.compat_ioctl = nbd_ioctl,
+ .free_disk = nbd_free_disk,
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -1395,23 +1780,13 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
struct nbd_device *nbd = s->private;
- if (nbd->task_recv)
- seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
+ if (nbd->pid)
+ seq_printf(s, "recv: %d\n", nbd->pid);
return 0;
}
-static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nbd_dbg_tasks_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_tasks_ops = {
- .open = nbd_dbg_tasks_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
@@ -1432,21 +1807,15 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
seq_puts(s, "NBD_FLAG_SEND_FUA\n");
if (flags & NBD_FLAG_SEND_TRIM)
seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
+ if (flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n");
+ if (flags & NBD_FLAG_ROTATIONAL)
+ seq_puts(s, "NBD_FLAG_ROTATIONAL\n");
return 0;
}
-static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nbd_dbg_flags_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_flags_ops = {
- .open = nbd_dbg_flags_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
@@ -1457,18 +1826,18 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
return -EIO;
dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
- if (!dir) {
+ if (IS_ERR(dir)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
nbd_name(nbd));
return -EIO;
}
config->dbg_dir = dir;
- debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
+ debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
- debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
- debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
+ debugfs_create_u32("blocksize_bits", 0444, dir, &config->blksize_bits);
+ debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
return 0;
}
@@ -1483,7 +1852,7 @@ static int nbd_dbg_init(void)
struct dentry *dbg_dir;
dbg_dir = debugfs_create_dir("nbd", NULL);
- if (!dbg_dir)
+ if (IS_ERR(dbg_dir))
return -EIO;
nbd_dbg_dir = dbg_dir;
@@ -1535,104 +1904,126 @@ static const struct blk_mq_ops nbd_mq_ops = {
.timeout = nbd_xmit_timeout,
};
-static int nbd_dev_add(int index)
+static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
{
+ struct queue_limits lim = {
+ .max_hw_sectors = 65536,
+ .io_opt = 256 << SECTOR_SHIFT,
+ .max_segments = USHRT_MAX,
+ .max_segment_size = UINT_MAX,
+ };
struct nbd_device *nbd;
struct gendisk *disk;
- struct request_queue *q;
int err = -ENOMEM;
nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
if (!nbd)
goto out;
- disk = alloc_disk(1 << part_shift);
- if (!disk)
+ nbd->tag_set.ops = &nbd_mq_ops;
+ nbd->tag_set.nr_hw_queues = 1;
+ nbd->tag_set.queue_depth = 128;
+ nbd->tag_set.numa_node = NUMA_NO_NODE;
+ nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
+ nbd->tag_set.flags = BLK_MQ_F_BLOCKING;
+ nbd->tag_set.driver_data = nbd;
+ INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
+ nbd->backend = NULL;
+
+ err = blk_mq_alloc_tag_set(&nbd->tag_set);
+ if (err)
goto out_free_nbd;
+ mutex_lock(&nbd_index_mutex);
if (index >= 0) {
err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
GFP_KERNEL);
if (err == -ENOSPC)
err = -EEXIST;
} else {
- err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
+ err = idr_alloc(&nbd_index_idr, nbd, 0,
+ (MINORMASK >> part_shift) + 1, GFP_KERNEL);
if (err >= 0)
index = err;
}
- if (err < 0)
- goto out_free_disk;
-
nbd->index = index;
- nbd->disk = disk;
- nbd->tag_set.ops = &nbd_mq_ops;
- nbd->tag_set.nr_hw_queues = 1;
- nbd->tag_set.queue_depth = 128;
- nbd->tag_set.numa_node = NUMA_NO_NODE;
- nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
- nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
- BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
- nbd->tag_set.driver_data = nbd;
+ mutex_unlock(&nbd_index_mutex);
+ if (err < 0)
+ goto out_free_tags;
- err = blk_mq_alloc_tag_set(&nbd->tag_set);
- if (err)
+ disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL);
+ if (IS_ERR(disk)) {
+ err = PTR_ERR(disk);
goto out_free_idr;
-
- q = blk_mq_init_queue(&nbd->tag_set);
- if (IS_ERR(q)) {
- err = PTR_ERR(q);
- goto out_free_tags;
}
- disk->queue = q;
+ nbd->disk = disk;
- /*
- * Tell the block layer that we are not a rotational device
- */
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
- disk->queue->limits.discard_granularity = 0;
- disk->queue->limits.discard_alignment = 0;
- blk_queue_max_discard_sectors(disk->queue, 0);
- blk_queue_max_segment_size(disk->queue, UINT_MAX);
- blk_queue_max_segments(disk->queue, USHRT_MAX);
- blk_queue_max_hw_sectors(disk->queue, 65536);
- disk->queue->limits.max_sectors = 256;
+ nbd->recv_workq = alloc_workqueue("nbd%d-recv",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_UNBOUND, 0, nbd->index);
+ if (!nbd->recv_workq) {
+ dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
+ err = -ENOMEM;
+ goto out_err_disk;
+ }
mutex_init(&nbd->config_lock);
refcount_set(&nbd->config_refs, 0);
- refcount_set(&nbd->refs, 1);
+ /*
+ * Start out with a zero references to keep other threads from using
+ * this device until it is fully initialized.
+ */
+ refcount_set(&nbd->refs, 0);
INIT_LIST_HEAD(&nbd->list);
disk->major = NBD_MAJOR;
disk->first_minor = index << part_shift;
+ disk->minors = 1 << part_shift;
disk->fops = &nbd_fops;
disk->private_data = nbd;
sprintf(disk->disk_name, "nbd%d", index);
- add_disk(disk);
+ err = add_disk(disk);
+ if (err)
+ goto out_free_work;
+
+ /*
+ * Now publish the device.
+ */
+ refcount_set(&nbd->refs, refs);
nbd_total_devices++;
- return index;
+ return nbd;
-out_free_tags:
- blk_mq_free_tag_set(&nbd->tag_set);
+out_free_work:
+ destroy_workqueue(nbd->recv_workq);
+out_err_disk:
+ put_disk(disk);
out_free_idr:
+ mutex_lock(&nbd_index_mutex);
idr_remove(&nbd_index_idr, index);
-out_free_disk:
- put_disk(disk);
+ mutex_unlock(&nbd_index_mutex);
+out_free_tags:
+ blk_mq_free_tag_set(&nbd->tag_set);
out_free_nbd:
kfree(nbd);
out:
- return err;
+ return ERR_PTR(err);
}
-static int find_free_cb(int id, void *ptr, void *data)
+static struct nbd_device *nbd_find_get_unused(void)
{
- struct nbd_device *nbd = ptr;
- struct nbd_device **found = data;
+ struct nbd_device *nbd;
+ int id;
- if (!refcount_read(&nbd->config_refs)) {
- *found = nbd;
- return 1;
+ lockdep_assert_held(&nbd_index_mutex);
+
+ idr_for_each_entry(&nbd_index_idr, nbd, id) {
+ if (refcount_read(&nbd->config_refs) ||
+ test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
+ continue;
+ if (refcount_inc_not_zero(&nbd->refs))
+ return nbd;
}
- return 0;
+
+ return NULL;
}
/* Netlink interface. */
@@ -1646,6 +2037,7 @@ static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
[NBD_ATTR_SOCKETS] = { .type = NLA_NESTED},
[NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
[NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED},
+ [NBD_ATTR_BACKEND_IDENTIFIER] = { .type = NLA_STRING},
};
static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
@@ -1661,9 +2053,26 @@ nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
[NBD_DEVICE_CONNECTED] = { .type = NLA_U8 },
};
+static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
+{
+ struct nbd_config *config = nbd->config;
+ u64 bsize = nbd_blksize(config);
+ u64 bytes = config->bytesize;
+
+ if (info->attrs[NBD_ATTR_SIZE_BYTES])
+ bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
+
+ if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
+ bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
+
+ if (bytes != config->bytesize || bsize != nbd_blksize(config))
+ return nbd_set_size(nbd, bytes, bsize);
+ return 0;
+}
+
static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
{
- struct nbd_device *nbd = NULL;
+ struct nbd_device *nbd;
struct nbd_config *config;
int index = -1;
int ret;
@@ -1672,57 +2081,53 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (info->attrs[NBD_ATTR_INDEX])
+ if (info->attrs[NBD_ATTR_INDEX]) {
index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
- if (!info->attrs[NBD_ATTR_SOCKETS]) {
- printk(KERN_ERR "nbd: must specify at least one socket\n");
+
+ /*
+ * Too big first_minor can cause duplicate creation of
+ * sysfs files/links, since index << part_shift might overflow, or
+ * MKDEV() expect that the max bits of first_minor is 20.
+ */
+ if (index < 0 || index > MINORMASK >> part_shift) {
+ pr_err("illegal input index %d\n", index);
+ return -EINVAL;
+ }
+ }
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SOCKETS)) {
+ pr_err("must specify at least one socket\n");
return -EINVAL;
}
- if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
- printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SIZE_BYTES)) {
+ pr_err("must specify a size in bytes for the device\n");
return -EINVAL;
}
again:
mutex_lock(&nbd_index_mutex);
if (index == -1) {
- ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
- if (ret == 0) {
- int new_index;
- new_index = nbd_dev_add(-1);
- if (new_index < 0) {
- mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: failed to add new device\n");
- return new_index;
- }
- nbd = idr_find(&nbd_index_idr, new_index);
- }
+ nbd = nbd_find_get_unused();
} else {
nbd = idr_find(&nbd_index_idr, index);
- if (!nbd) {
- ret = nbd_dev_add(index);
- if (ret < 0) {
+ if (nbd) {
+ if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
+ test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
+ !refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: failed to add new device\n");
- return ret;
+ pr_err("device at index %d is going down\n",
+ index);
+ return -EINVAL;
}
- nbd = idr_find(&nbd_index_idr, index);
}
}
+ mutex_unlock(&nbd_index_mutex);
+
if (!nbd) {
- printk(KERN_ERR "nbd: couldn't find device at index %d\n",
- index);
- mutex_unlock(&nbd_index_mutex);
- return -EINVAL;
- }
- if (!refcount_inc_not_zero(&nbd->refs)) {
- mutex_unlock(&nbd_index_mutex);
- if (index == -1)
- goto again;
- printk(KERN_ERR "nbd: device at index %d is going down\n",
- index);
- return -EINVAL;
+ nbd = nbd_dev_add(index, 2);
+ if (IS_ERR(nbd)) {
+ pr_err("failed to add new device\n");
+ return PTR_ERR(nbd);
+ }
}
- mutex_unlock(&nbd_index_mutex);
mutex_lock(&nbd->config_lock);
if (refcount_read(&nbd->config_refs)) {
@@ -1730,39 +2135,27 @@ again:
nbd_put(nbd);
if (index == -1)
goto again;
- printk(KERN_ERR "nbd: nbd%d already in use\n", index);
+ pr_err("nbd%d already in use\n", index);
return -EBUSY;
}
- if (WARN_ON(nbd->config)) {
- mutex_unlock(&nbd->config_lock);
- nbd_put(nbd);
- return -EINVAL;
- }
- config = nbd->config = nbd_alloc_config();
- if (!nbd->config) {
+
+ ret = nbd_alloc_and_init_config(nbd);
+ if (ret) {
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
- printk(KERN_ERR "nbd: couldn't allocate config\n");
- return -ENOMEM;
+ pr_err("couldn't allocate config\n");
+ return ret;
}
- refcount_set(&nbd->config_refs, 1);
- set_bit(NBD_BOUND, &config->runtime_flags);
- if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
- u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
- nbd_size_set(nbd, config->blksize,
- div64_u64(bytes, config->blksize));
- }
- if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
- u64 bsize =
- nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
- nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
- }
- if (info->attrs[NBD_ATTR_TIMEOUT]) {
- u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
- nbd->tag_set.timeout = timeout * HZ;
- blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
- }
+ config = nbd->config;
+ set_bit(NBD_RT_BOUND, &config->runtime_flags);
+ ret = nbd_genl_size_set(info, nbd);
+ if (ret)
+ goto out;
+
+ if (info->attrs[NBD_ATTR_TIMEOUT])
+ nbd_set_cmd_timeout(nbd,
+ nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
config->dead_conn_timeout =
nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
@@ -1774,12 +2167,24 @@ again:
if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
- set_bit(NBD_DESTROY_ON_DISCONNECT,
- &config->runtime_flags);
- put_dev = true;
+ /*
+ * We have 1 ref to keep the device around, and then 1
+ * ref for our current operation here, which will be
+ * inherited by the config. If we already have
+ * DESTROY_ON_DISCONNECT set then we know we don't have
+ * that extra ref already held so we don't need the
+ * put_dev.
+ */
+ if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
+ put_dev = true;
+ } else {
+ if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
+ refcount_inc(&nbd->refs);
}
if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
- set_bit(NBD_DISCONNECT_ON_CLOSE,
+ set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
&config->runtime_flags);
}
}
@@ -1793,14 +2198,16 @@ again:
struct nlattr *socks[NBD_SOCK_MAX+1];
if (nla_type(attr) != NBD_SOCK_ITEM) {
- printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+ pr_err("socks must be embedded in a SOCK_ITEM attr\n");
ret = -EINVAL;
goto out;
}
- ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
- nbd_sock_policy, info->extack);
+ ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
+ attr,
+ nbd_sock_policy,
+ info->extack);
if (ret != 0) {
- printk(KERN_ERR "nbd: error processing sock list\n");
+ pr_err("error processing sock list\n");
ret = -EINVAL;
goto out;
}
@@ -1812,14 +2219,32 @@ again:
goto out;
}
}
+
+ if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
+ nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
+ GFP_KERNEL);
+ if (!nbd->backend) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ ret = device_create_file(disk_to_dev(nbd->disk), &backend_attr);
+ if (ret) {
+ dev_err(disk_to_dev(nbd->disk),
+ "device_create_file failed for backend!\n");
+ goto out;
+ }
+ set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
+
ret = nbd_start_device(nbd);
out:
- mutex_unlock(&nbd->config_lock);
if (!ret) {
- set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
+ set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
refcount_inc(&nbd->config_refs);
nbd_connect_reply(info, nbd->index);
}
+ mutex_unlock(&nbd->config_lock);
+
nbd_config_put(nbd);
if (put_dev)
nbd_put(nbd);
@@ -1830,9 +2255,19 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
{
mutex_lock(&nbd->config_lock);
nbd_disconnect(nbd);
- nbd_clear_sock(nbd);
+ sock_shutdown(nbd);
+ wake_up(&nbd->config->conn_wait);
+ /*
+ * Make sure recv thread has finished, we can safely call nbd_clear_que()
+ * to cancel the inflight I/Os.
+ */
+ flush_workqueue(nbd->recv_workq);
+ nbd_clear_que(nbd);
+ nbd->task_setup = NULL;
+ clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags);
mutex_unlock(&nbd->config_lock);
- if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+
+ if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
nbd_config_put(nbd);
}
@@ -1845,8 +2280,8 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (!info->attrs[NBD_ATTR_INDEX]) {
- printk(KERN_ERR "nbd: must specify an index to disconnect\n");
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
+ pr_err("must specify an index to disconnect\n");
return -EINVAL;
}
index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
@@ -1854,23 +2289,20 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
nbd = idr_find(&nbd_index_idr, index);
if (!nbd) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: couldn't find device at index %d\n",
- index);
+ pr_err("couldn't find device at index %d\n", index);
return -EINVAL;
}
if (!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: device at index %d is going down\n",
- index);
+ pr_err("device at index %d is going down\n", index);
return -EINVAL;
}
mutex_unlock(&nbd_index_mutex);
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
- nbd_put(nbd);
- return 0;
- }
+ if (!refcount_inc_not_zero(&nbd->config_refs))
+ goto put_nbd;
nbd_disconnect_and_put(nbd);
nbd_config_put(nbd);
+put_nbd:
nbd_put(nbd);
return 0;
}
@@ -1886,8 +2318,8 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (!info->attrs[NBD_ATTR_INDEX]) {
- printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
+ pr_err("must specify a device to reconfigure\n");
return -EINVAL;
}
index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
@@ -1895,19 +2327,34 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
nbd = idr_find(&nbd_index_idr, index);
if (!nbd) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
- index);
+ pr_err("couldn't find a device at index %d\n", index);
return -EINVAL;
}
+ if (nbd->backend) {
+ if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
+ if (nla_strcmp(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
+ nbd->backend)) {
+ mutex_unlock(&nbd_index_mutex);
+ dev_err(nbd_to_dev(nbd),
+ "backend image doesn't match with %s\n",
+ nbd->backend);
+ return -EINVAL;
+ }
+ } else {
+ mutex_unlock(&nbd_index_mutex);
+ dev_err(nbd_to_dev(nbd), "must specify backend\n");
+ return -EINVAL;
+ }
+ }
if (!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: device at index %d is going down\n",
- index);
+ pr_err("device at index %d is going down\n", index);
return -EINVAL;
}
mutex_unlock(&nbd_index_mutex);
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
dev_err(nbd_to_dev(nbd),
"not configured, cannot reconfigure\n");
nbd_put(nbd);
@@ -1915,20 +2362,21 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
}
mutex_lock(&nbd->config_lock);
- config = nbd->config;
- if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
- !nbd->task_recv) {
+ if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
+ !nbd->pid) {
dev_err(nbd_to_dev(nbd),
"not configured, cannot reconfigure\n");
ret = -EINVAL;
goto out;
}
- if (info->attrs[NBD_ATTR_TIMEOUT]) {
- u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
- nbd->tag_set.timeout = timeout * HZ;
- blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
- }
+ ret = nbd_genl_size_set(info, nbd);
+ if (ret)
+ goto out;
+
+ if (info->attrs[NBD_ATTR_TIMEOUT])
+ nbd_set_cmd_timeout(nbd,
+ nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
config->dead_conn_timeout =
nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
@@ -1938,19 +2386,19 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
- &config->runtime_flags))
+ &nbd->flags))
put_dev = true;
} else {
if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
- &config->runtime_flags))
+ &nbd->flags))
refcount_inc(&nbd->refs);
}
if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
- set_bit(NBD_DISCONNECT_ON_CLOSE,
+ set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
&config->runtime_flags);
} else {
- clear_bit(NBD_DISCONNECT_ON_CLOSE,
+ clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
&config->runtime_flags);
}
}
@@ -1964,14 +2412,16 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
struct nlattr *socks[NBD_SOCK_MAX+1];
if (nla_type(attr) != NBD_SOCK_ITEM) {
- printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+ pr_err("socks must be embedded in a SOCK_ITEM attr\n");
ret = -EINVAL;
goto out;
}
- ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
- nbd_sock_policy, info->extack);
+ ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
+ attr,
+ nbd_sock_policy,
+ info->extack);
if (ret != 0) {
- printk(KERN_ERR "nbd: error processing sock list\n");
+ pr_err("error processing sock list\n");
ret = -EINVAL;
goto out;
}
@@ -1996,25 +2446,25 @@ out:
return ret;
}
-static const struct genl_ops nbd_connect_genl_ops[] = {
+static const struct genl_small_ops nbd_connect_genl_ops[] = {
{
.cmd = NBD_CMD_CONNECT,
- .policy = nbd_attr_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nbd_genl_connect,
},
{
.cmd = NBD_CMD_DISCONNECT,
- .policy = nbd_attr_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nbd_genl_disconnect,
},
{
.cmd = NBD_CMD_RECONFIGURE,
- .policy = nbd_attr_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nbd_genl_reconfigure,
},
{
.cmd = NBD_CMD_STATUS,
- .policy = nbd_attr_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nbd_genl_status,
},
};
@@ -2028,12 +2478,16 @@ static struct genl_family nbd_genl_family __ro_after_init = {
.name = NBD_GENL_FAMILY_NAME,
.version = NBD_GENL_VERSION,
.module = THIS_MODULE,
- .ops = nbd_connect_genl_ops,
- .n_ops = ARRAY_SIZE(nbd_connect_genl_ops),
+ .small_ops = nbd_connect_genl_ops,
+ .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops),
+ .resv_start_op = NBD_CMD_STATUS + 1,
.maxattr = NBD_ATTR_MAX,
+ .netnsok = 1,
+ .policy = nbd_attr_policy,
.mcgrps = nbd_mcast_grps,
.n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
};
+MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME);
static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
{
@@ -2050,7 +2504,7 @@ static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
*/
if (refcount_read(&nbd->config_refs))
connected = 1;
- dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM);
+ dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
if (!dev_opt)
return -EMSGSIZE;
ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
@@ -2098,7 +2552,13 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
goto out;
}
- dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
+ dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
+ if (!dev_list) {
+ nlmsg_free(reply);
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
if (index == -1) {
ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
if (ret) {
@@ -2118,8 +2578,7 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
}
nla_nest_end(reply, dev_list);
genlmsg_end(reply, reply_head);
- genlmsg_reply(reply, info);
- ret = 0;
+ ret = genlmsg_reply(reply, info);
out:
mutex_unlock(&nbd_index_mutex);
return ret;
@@ -2188,7 +2647,7 @@ static int __init nbd_init(void)
BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
if (max_part < 0) {
- printk(KERN_ERR "nbd: max_part must be >= 0\n");
+ pr_err("max_part must be >= 0\n");
return -EINVAL;
}
@@ -2212,28 +2671,25 @@ static int __init nbd_init(void)
if (nbds_max > 1UL << (MINORBITS - part_shift))
return -EINVAL;
- recv_workqueue = alloc_workqueue("knbd-recv",
- WQ_MEM_RECLAIM | WQ_HIGHPRI |
- WQ_UNBOUND, 0);
- if (!recv_workqueue)
- return -ENOMEM;
- if (register_blkdev(NBD_MAJOR, "nbd")) {
- destroy_workqueue(recv_workqueue);
+ if (register_blkdev(NBD_MAJOR, "nbd"))
return -EIO;
+
+ nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0);
+ if (!nbd_del_wq) {
+ unregister_blkdev(NBD_MAJOR, "nbd");
+ return -ENOMEM;
}
if (genl_register_family(&nbd_genl_family)) {
+ destroy_workqueue(nbd_del_wq);
unregister_blkdev(NBD_MAJOR, "nbd");
- destroy_workqueue(recv_workqueue);
return -EINVAL;
}
nbd_dbg_init();
- mutex_lock(&nbd_index_mutex);
for (i = 0; i < nbds_max; i++)
- nbd_dev_add(i);
- mutex_unlock(&nbd_index_mutex);
+ nbd_dev_add(i, 1);
return 0;
}
@@ -2242,7 +2698,10 @@ static int nbd_exit_cb(int id, void *ptr, void *data)
struct list_head *list = (struct list_head *)data;
struct nbd_device *nbd = ptr;
- list_add_tail(&nbd->list, list);
+ /* Skip nbd that is being removed asynchronously */
+ if (refcount_read(&nbd->refs))
+ list_add_tail(&nbd->list, list);
+
return 0;
}
@@ -2251,6 +2710,12 @@ static void __exit nbd_cleanup(void)
struct nbd_device *nbd;
LIST_HEAD(del_list);
+ /*
+ * Unregister netlink interface prior to waiting
+ * for the completion of netlink commands.
+ */
+ genl_unregister_family(&nbd_genl_family);
+
nbd_dbg_close();
mutex_lock(&nbd_index_mutex);
@@ -2260,14 +2725,18 @@ static void __exit nbd_cleanup(void)
while (!list_empty(&del_list)) {
nbd = list_first_entry(&del_list, struct nbd_device, list);
list_del_init(&nbd->list);
+ if (refcount_read(&nbd->config_refs))
+ pr_err("possibly leaking nbd_config (ref %d)\n",
+ refcount_read(&nbd->config_refs));
if (refcount_read(&nbd->refs) != 1)
- printk(KERN_ERR "nbd: possibly leaking a device\n");
+ pr_err("possibly leaking a device\n");
nbd_put(nbd);
}
+ /* Also wait for nbd_dev_remove_work() completes */
+ destroy_workqueue(nbd_del_wq);
+
idr_destroy(&nbd_index_idr);
- genl_unregister_family(&nbd_genl_family);
- destroy_workqueue(recv_workqueue);
unregister_blkdev(NBD_MAJOR, "nbd");
}