summaryrefslogtreecommitdiff
path: root/drivers/block/nbd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/nbd.c')
-rw-r--r--drivers/block/nbd.c1295
1 files changed, 798 insertions, 497 deletions
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 43cff01a5a67..f6c33b21f69e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -11,6 +11,8 @@
* (part of code stolen from loop.c)
*/
+#define pr_fmt(fmt) "nbd: " fmt
+
#include <linux/major.h>
#include <linux/blkdev.h>
@@ -49,6 +51,7 @@
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
+static struct workqueue_struct *nbd_del_wq;
static int nbd_total_devices = 0;
struct nbd_sock {
@@ -59,11 +62,13 @@ struct nbd_sock {
bool dead;
int fallback_index;
int cookie;
+ struct work_struct work;
};
struct recv_thread_args {
struct work_struct work;
struct nbd_device *nbd;
+ struct nbd_sock *nsock;
int index;
};
@@ -78,8 +83,8 @@ struct link_dead_args {
#define NBD_RT_HAS_PID_FILE 3
#define NBD_RT_HAS_CONFIG_REF 4
#define NBD_RT_BOUND 5
-#define NBD_RT_DESTROY_ON_DISCONNECT 6
-#define NBD_RT_DISCONNECT_ON_CLOSE 7
+#define NBD_RT_DISCONNECT_ON_CLOSE 6
+#define NBD_RT_HAS_BACKEND_FILE 7
#define NBD_DESTROY_ON_DISCONNECT 0
#define NBD_DISCONNECT_REQUESTED 1
@@ -96,13 +101,18 @@ struct nbd_config {
atomic_t recv_threads;
wait_queue_head_t recv_wq;
- loff_t blksize;
+ unsigned int blksize_bits;
loff_t bytesize;
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *dbg_dir;
#endif
};
+static inline unsigned int nbd_blksize(struct nbd_config *config)
+{
+ return 1u << config->blksize_bits;
+}
+
struct nbd_device {
struct blk_mq_tag_set tag_set;
@@ -113,16 +123,27 @@ struct nbd_device {
struct mutex config_lock;
struct gendisk *disk;
struct workqueue_struct *recv_workq;
+ struct work_struct remove_work;
struct list_head list;
- struct task_struct *task_recv;
struct task_struct *task_setup;
- struct completion *destroy_complete;
unsigned long flags;
+ pid_t pid; /* pid of nbd-client, if attached */
+
+ char *backend;
};
#define NBD_CMD_REQUEUED 1
+/*
+ * This flag will be set if nbd_queue_rq() succeed, and will be checked and
+ * cleared in completion. Both setting and clearing of the flag are protected
+ * by cmd->lock.
+ */
+#define NBD_CMD_INFLIGHT 2
+
+/* Just part of request header or data payload is sent successfully */
+#define NBD_CMD_PARTIAL_SEND 3
struct nbd_cmd {
struct nbd_device *nbd;
@@ -141,9 +162,7 @@ static struct dentry *nbd_dbg_dir;
#define nbd_name(nbd) ((nbd)->disk->disk_name)
-#define NBD_MAGIC 0x68797548
-
-#define NBD_DEF_BLKSIZE 1024
+#define NBD_DEF_BLKSIZE_BITS 10
static unsigned int nbds_max = 16;
static int max_part = 16;
@@ -166,6 +185,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
+ lockdep_assert_held(&cmd->lock);
+
+ /*
+ * Clear INFLIGHT flag so that this cmd won't be completed in
+ * normal completion path
+ *
+ * INFLIGHT flag will be set when the cmd is queued to nbd next
+ * time.
+ */
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+
if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
blk_mq_requeue_request(req, true);
}
@@ -207,9 +237,9 @@ static ssize_t pid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
- struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
+ struct nbd_device *nbd = disk->private_data;
- return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
+ return sprintf(buf, "%d\n", nbd->pid);
}
static const struct device_attribute pid_attr = {
@@ -217,40 +247,53 @@ static const struct device_attribute pid_attr = {
.show = pid_show,
};
+static ssize_t backend_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nbd_device *nbd = disk->private_data;
+
+ return sprintf(buf, "%s\n", nbd->backend ?: "");
+}
+
+static const struct device_attribute backend_attr = {
+ .attr = { .name = "backend", .mode = 0444},
+ .show = backend_show,
+};
+
static void nbd_dev_remove(struct nbd_device *nbd)
{
struct gendisk *disk = nbd->disk;
- struct request_queue *q;
- if (disk) {
- q = disk->queue;
- del_gendisk(disk);
- blk_cleanup_queue(q);
- blk_mq_free_tag_set(&nbd->tag_set);
- disk->private_data = NULL;
- put_disk(disk);
- }
+ del_gendisk(disk);
+ blk_mq_free_tag_set(&nbd->tag_set);
/*
- * Place this in the last just before the nbd is freed to
- * make sure that the disk and the related kobject are also
- * totally removed to avoid duplicate creation of the same
- * one.
+ * Remove from idr after del_gendisk() completes, so if the same ID is
+ * reused, the following add_disk() will succeed.
*/
- if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete)
- complete(nbd->destroy_complete);
+ mutex_lock(&nbd_index_mutex);
+ idr_remove(&nbd_index_idr, nbd->index);
+ mutex_unlock(&nbd_index_mutex);
+ destroy_workqueue(nbd->recv_workq);
+ put_disk(disk);
+}
- kfree(nbd);
+static void nbd_dev_remove_work(struct work_struct *work)
+{
+ nbd_dev_remove(container_of(work, struct nbd_device, remove_work));
}
static void nbd_put(struct nbd_device *nbd)
{
- if (refcount_dec_and_mutex_lock(&nbd->refs,
- &nbd_index_mutex)) {
- idr_remove(&nbd_index_idr, nbd->index);
+ if (!refcount_dec_and_test(&nbd->refs))
+ return;
+
+ /* Call del_gendisk() asynchrounously to prevent deadlock */
+ if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
+ queue_work(nbd_del_wq, &nbd->remove_work);
+ else
nbd_dev_remove(nbd);
- mutex_unlock(&nbd_index_mutex);
- }
}
static int nbd_disconnected(struct nbd_config *config)
@@ -268,7 +311,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
if (args) {
INIT_WORK(&args->work, nbd_dead_link_work);
args->index = nbd->index;
- queue_work(system_wq, &args->work);
+ queue_work(system_percpu_wq, &args->work);
}
}
if (!nsock->dead) {
@@ -288,46 +331,55 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0;
}
-static void nbd_size_clear(struct nbd_device *nbd)
+static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize)
{
- if (nbd->config->bytesize) {
- set_capacity(nbd->disk, 0);
- kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
- }
-}
+ struct queue_limits lim;
+ int error;
-static void nbd_size_update(struct nbd_device *nbd)
-{
- struct nbd_config *config = nbd->config;
- struct block_device *bdev = bdget_disk(nbd->disk, 0);
+ if (!blksize)
+ blksize = 1u << NBD_DEF_BLKSIZE_BITS;
- if (config->flags & NBD_FLAG_SEND_TRIM) {
- nbd->disk->queue->limits.discard_granularity = config->blksize;
- nbd->disk->queue->limits.discard_alignment = config->blksize;
- blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
- }
- blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
- blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
- set_capacity(nbd->disk, config->bytesize >> 9);
- if (bdev) {
- if (bdev->bd_disk) {
- bd_set_size(bdev, config->bytesize);
- set_blocksize(bdev, config->blksize);
- } else
- bdev->bd_invalidated = 1;
- bdput(bdev);
- }
- kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
-}
+ if (blk_validate_block_size(blksize))
+ return -EINVAL;
-static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
- loff_t nr_blocks)
-{
- struct nbd_config *config = nbd->config;
- config->blksize = blocksize;
- config->bytesize = blocksize * nr_blocks;
- if (nbd->task_recv != NULL)
- nbd_size_update(nbd);
+ if (bytesize < 0)
+ return -EINVAL;
+
+ nbd->config->bytesize = bytesize;
+ nbd->config->blksize_bits = __ffs(blksize);
+
+ if (!nbd->pid)
+ return 0;
+
+ lim = queue_limits_start_update(nbd->disk->queue);
+ if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
+ lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT;
+ else
+ lim.max_hw_discard_sectors = 0;
+ if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) {
+ lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
+ } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) {
+ lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
+ } else {
+ lim.features |= BLK_FEAT_WRITE_CACHE;
+ lim.features &= ~BLK_FEAT_FUA;
+ }
+ if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
+ lim.features |= BLK_FEAT_ROTATIONAL;
+ if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT;
+
+ lim.logical_block_size = blksize;
+ lim.physical_block_size = blksize;
+ error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim);
+ if (error)
+ return error;
+
+ if (max_part)
+ set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
+ if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
+ kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+ return 0;
}
static void nbd_complete_rq(struct request *req)
@@ -373,13 +425,30 @@ static u32 req_to_nbd_cmd_type(struct request *req)
return NBD_CMD_WRITE;
case REQ_OP_READ:
return NBD_CMD_READ;
+ case REQ_OP_WRITE_ZEROES:
+ return NBD_CMD_WRITE_ZEROES;
default:
return U32_MAX;
}
}
-static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
- bool reserved)
+static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd)
+{
+ if (refcount_inc_not_zero(&nbd->config_refs)) {
+ /*
+ * Add smp_mb__after_atomic to ensure that reading nbd->config_refs
+ * and reading nbd->config is ordered. The pair is the barrier in
+ * nbd_alloc_and_init_config(), avoid nbd->config_refs is set
+ * before nbd->config.
+ */
+ smp_mb__after_atomic();
+ return nbd->config;
+ }
+
+ return NULL;
+}
+
+static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
struct nbd_device *nbd = cmd->nbd;
@@ -388,12 +457,24 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
if (!mutex_trylock(&cmd->lock))
return BLK_EH_RESET_TIMER;
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ /* partial send is handled in nbd_sock's work function */
+ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) {
+ mutex_unlock(&cmd->lock);
+ return BLK_EH_RESET_TIMER;
+ }
+
+ if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+ mutex_unlock(&cmd->lock);
+ return BLK_EH_DONE;
+ }
+
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
cmd->status = BLK_STS_TIMEOUT;
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
mutex_unlock(&cmd->lock);
goto done;
}
- config = nbd->config;
if (config->num_connections > 1 ||
(config->num_connections == 1 && nbd->tag_set.timeout)) {
@@ -422,8 +503,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
}
- mutex_unlock(&cmd->lock);
nbd_requeue_cmd(cmd);
+ mutex_unlock(&cmd->lock);
nbd_config_put(nbd);
return BLK_EH_DONE;
}
@@ -458,6 +539,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
cmd->status = BLK_STS_IOERR;
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
mutex_unlock(&cmd->lock);
sock_shutdown(nbd);
nbd_config_put(nbd);
@@ -466,16 +548,11 @@ done:
return BLK_EH_DONE;
}
-/*
- * Send or receive packet.
- */
-static int sock_xmit(struct nbd_device *nbd, int index, int send,
- struct iov_iter *iter, int msg_flags, int *sent)
+static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
+ struct iov_iter *iter, int msg_flags, int *sent)
{
- struct nbd_config *config = nbd->config;
- struct socket *sock = config->socks[index]->sock;
int result;
- struct msghdr msg;
+ struct msghdr msg = {} ;
unsigned int noreclaim_flag;
if (unlikely(!sock)) {
@@ -488,27 +565,27 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
msg.msg_iter = *iter;
noreclaim_flag = memalloc_noreclaim_save();
- do {
- sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = msg_flags | MSG_NOSIGNAL;
-
- if (send)
- result = sock_sendmsg(sock, &msg);
- else
- result = sock_recvmsg(sock, &msg, msg.msg_flags);
-
- if (result <= 0) {
- if (result == 0)
- result = -EPIPE; /* short read */
- break;
- }
- if (sent)
- *sent += result;
- } while (msg_data_left(&msg));
+
+ scoped_with_kernel_creds() {
+ do {
+ sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+ sock->sk->sk_use_task_frag = false;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (send)
+ result = sock_sendmsg(sock, &msg);
+ else
+ result = sock_recvmsg(sock, &msg, msg.msg_flags);
+
+ if (result <= 0) {
+ if (result == 0)
+ result = -EPIPE; /* short read */
+ break;
+ }
+ if (sent)
+ *sent += result;
+ } while (msg_data_left(&msg));
+ }
memalloc_noreclaim_restore(noreclaim_flag);
@@ -516,6 +593,19 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
}
/*
+ * Send or receive packet. Return a positive value on success and
+ * negtive value on failure, and never return 0.
+ */
+static int sock_xmit(struct nbd_device *nbd, int index, int send,
+ struct iov_iter *iter, int msg_flags, int *sent)
+{
+ struct nbd_config *config = nbd->config;
+ struct socket *sock = config->socks[index]->sock;
+
+ return __sock_xmit(nbd, sock, send, iter, msg_flags, sent);
+}
+
+/*
* Different settings for sk->sk_sndtimeo can result in different return values
* if there is a signal pending when we enter sendmsg, because reasons?
*/
@@ -524,8 +614,36 @@ static inline int was_interrupted(int result)
return result == -ERESTARTSYS || result == -EINTR;
}
-/* always call with the tx_lock held */
-static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
+/*
+ * We've already sent header or part of data payload, have no choice but
+ * to set pending and schedule it in work.
+ *
+ * And we have to return BLK_STS_OK to block core, otherwise this same
+ * request may be re-dispatched with different tag, but our header has
+ * been sent out with old tag, and this way does confuse reply handling.
+ */
+static void nbd_sched_pending_work(struct nbd_device *nbd,
+ struct nbd_sock *nsock,
+ struct nbd_cmd *cmd, int sent)
+{
+ struct request *req = blk_mq_rq_from_pdu(cmd);
+
+ /* pending work should be scheduled only once */
+ WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags));
+
+ nsock->pending = req;
+ nsock->sent = sent;
+ set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+ refcount_inc(&nbd->config_refs);
+ schedule_work(&nsock->work);
+}
+
+/*
+ * Returns BLK_STS_RESOURCE if the caller should retry after a delay.
+ * Returns BLK_STS_IOERR if sending failed.
+ */
+static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
+ int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_config *config = nbd->config;
@@ -534,28 +652,32 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
struct iov_iter from;
- unsigned long size = blk_rq_bytes(req);
struct bio *bio;
u64 handle;
u32 type;
u32 nbd_cmd_flags = 0;
int sent = nsock->sent, skip = 0;
- iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
+ lockdep_assert_held(&cmd->lock);
+ lockdep_assert_held(&nsock->tx_lock);
+
+ iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
type = req_to_nbd_cmd_type(req);
if (type == U32_MAX)
- return -EIO;
+ return BLK_STS_IOERR;
if (rq_data_dir(req) == WRITE &&
(config->flags & NBD_FLAG_READ_ONLY)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Write on read-only\n");
- return -EIO;
+ return BLK_STS_IOERR;
}
if (req->cmd_flags & REQ_FUA)
nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
+ if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES))
+ nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;
/* We did a partial send previously, and we at least sent the whole
* request struct, so just go and send the rest of the pages in the
@@ -580,10 +702,10 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
request.type = htonl(type | nbd_cmd_flags);
if (type != NBD_CMD_FLUSH) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
- request.len = htonl(size);
+ request.len = htonl(blk_rq_bytes(req));
}
handle = nbd_cmd_handle(cmd);
- memcpy(request.handle, &handle, sizeof(handle));
+ request.cookie = cpu_to_be64(handle);
trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
@@ -593,23 +715,23 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
result = sock_xmit(nbd, index, 1, &from,
(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
trace_nbd_header_sent(req, handle);
- if (result <= 0) {
+ if (result < 0) {
if (was_interrupted(result)) {
- /* If we havne't sent anything we can just return BUSY,
+ /* If we haven't sent anything we can just return BUSY,
* however if we have sent something we need to make
* sure we only allow this req to be sent until we are
* completely done.
*/
if (sent) {
- nsock->pending = req;
- nsock->sent = sent;
+ nbd_sched_pending_work(nbd, nsock, cmd, sent);
+ return BLK_STS_OK;
}
set_bit(NBD_CMD_REQUEUED, &cmd->flags);
return BLK_STS_RESOURCE;
}
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Send control failed (result %d)\n", result);
- return -EAGAIN;
+ goto requeue;
}
send_pages:
if (type != NBD_CMD_WRITE)
@@ -627,7 +749,7 @@ send_pages:
dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
req, bvec.bv_len);
- iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
+ iov_iter_bvec(&from, ITER_SOURCE, &bvec, 1, bvec.bv_len);
if (skip) {
if (skip >= iov_iter_count(&from)) {
skip -= iov_iter_count(&from);
@@ -637,21 +759,15 @@ send_pages:
skip = 0;
}
result = sock_xmit(nbd, index, 1, &from, flags, &sent);
- if (result <= 0) {
+ if (result < 0) {
if (was_interrupted(result)) {
- /* We've already sent the header, we
- * have no choice but to set pending and
- * return BUSY.
- */
- nsock->pending = req;
- nsock->sent = sent;
- set_bit(NBD_CMD_REQUEUED, &cmd->flags);
- return BLK_STS_RESOURCE;
+ nbd_sched_pending_work(nbd, nsock, cmd, sent);
+ return BLK_STS_OK;
}
dev_err(disk_to_dev(nbd->disk),
"Send data failed (result %d)\n",
result);
- return -EAGAIN;
+ goto requeue;
}
/*
* The completion might already have come in,
@@ -668,41 +784,103 @@ out:
trace_nbd_payload_sent(req, handle);
nsock->pending = NULL;
nsock->sent = 0;
+ __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+ return BLK_STS_OK;
+
+requeue:
+ /*
+ * Can't requeue in case we are dealing with partial send
+ *
+ * We must run from pending work function.
+ * */
+ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))
+ return BLK_STS_OK;
+
+ /* retry on a different socket */
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Request send failed, requeueing\n");
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ nbd_requeue_cmd(cmd);
+ return BLK_STS_OK;
+}
+
+/* handle partial sending */
+static void nbd_pending_cmd_work(struct work_struct *work)
+{
+ struct nbd_sock *nsock = container_of(work, struct nbd_sock, work);
+ struct request *req = nsock->pending;
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+ struct nbd_device *nbd = cmd->nbd;
+ unsigned long deadline = READ_ONCE(req->deadline);
+ unsigned int wait_ms = 2;
+
+ mutex_lock(&cmd->lock);
+
+ WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags));
+ if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)))
+ goto out;
+
+ mutex_lock(&nsock->tx_lock);
+ while (true) {
+ nbd_send_cmd(nbd, cmd, cmd->index);
+ if (!nsock->pending)
+ break;
+
+ /* don't bother timeout handler for partial sending */
+ if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) {
+ cmd->status = BLK_STS_IOERR;
+ blk_mq_complete_request(req);
+ break;
+ }
+ msleep(wait_ms);
+ wait_ms *= 2;
+ }
+ mutex_unlock(&nsock->tx_lock);
+ clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+out:
+ mutex_unlock(&cmd->lock);
+ nbd_config_put(nbd);
+}
+
+static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock,
+ struct nbd_reply *reply)
+{
+ struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
+ struct iov_iter to;
+ int result;
+
+ reply->magic = 0;
+ iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply));
+ result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL);
+ if (result < 0) {
+ if (!nbd_disconnected(nbd->config))
+ dev_err(disk_to_dev(nbd->disk),
+ "Receive control failed (result %d)\n", result);
+ return result;
+ }
+
+ if (ntohl(reply->magic) != NBD_REPLY_MAGIC) {
+ dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
+ (unsigned long)ntohl(reply->magic));
+ return -EPROTO;
+ }
+
return 0;
}
/* NULL returned = something went wrong, inform userspace */
-static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
+static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
+ struct nbd_reply *reply)
{
- struct nbd_config *config = nbd->config;
int result;
- struct nbd_reply reply;
struct nbd_cmd *cmd;
struct request *req = NULL;
u64 handle;
u16 hwq;
u32 tag;
- struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
- struct iov_iter to;
int ret = 0;
- reply.magic = 0;
- iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
- result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
- if (result <= 0) {
- if (!nbd_disconnected(config))
- dev_err(disk_to_dev(nbd->disk),
- "Receive control failed (result %d)\n", result);
- return ERR_PTR(result);
- }
-
- if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
- dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
- (unsigned long)ntohl(reply.magic));
- return ERR_PTR(-EPROTO);
- }
-
- memcpy(&handle, reply.handle, sizeof(handle));
+ handle = be64_to_cpu(reply->cookie);
tag = nbd_handle_to_tag(handle);
hwq = blk_mq_unique_tag_to_hwq(tag);
if (hwq < nbd->tag_set.nr_hw_queues)
@@ -717,6 +895,18 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
cmd = blk_mq_rq_to_pdu(req);
mutex_lock(&cmd->lock);
+ if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+ dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
+ tag, cmd->status, cmd->flags);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (cmd->index != index) {
+ dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)",
+ tag, index, cmd->index);
+ ret = -ENOENT;
+ goto out;
+ }
if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
@@ -735,9 +925,9 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
ret = -ENOENT;
goto out;
}
- if (ntohl(reply.error)) {
+ if (ntohl(reply->error)) {
dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
- ntohl(reply.error));
+ ntohl(reply->error));
cmd->status = BLK_STS_IOERR;
goto out;
}
@@ -746,11 +936,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
if (rq_data_dir(req) != WRITE) {
struct req_iterator iter;
struct bio_vec bvec;
+ struct iov_iter to;
rq_for_each_segment(bvec, req, iter) {
- iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
+ iov_iter_bvec(&to, ITER_DEST, &bvec, 1, bvec.bv_len);
result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
- if (result <= 0) {
+ if (result < 0) {
dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
result);
/*
@@ -759,7 +950,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
* and let the timeout stuff handle resubmitting
* this request onto another connection.
*/
- if (nbd_disconnected(config)) {
+ if (nbd_disconnected(nbd->config)) {
cmd->status = BLK_STS_IOERR;
goto out;
}
@@ -783,32 +974,72 @@ static void recv_work(struct work_struct *work)
work);
struct nbd_device *nbd = args->nbd;
struct nbd_config *config = nbd->config;
+ struct request_queue *q = nbd->disk->queue;
+ struct nbd_sock *nsock = args->nsock;
struct nbd_cmd *cmd;
+ struct request *rq;
while (1) {
- cmd = nbd_read_stat(nbd, args->index);
- if (IS_ERR(cmd)) {
- struct nbd_sock *nsock = config->socks[args->index];
+ struct nbd_reply reply;
- mutex_lock(&nsock->tx_lock);
- nbd_mark_nsock_dead(nbd, nsock, 1);
- mutex_unlock(&nsock->tx_lock);
+ if (nbd_read_reply(nbd, nsock->sock, &reply))
+ break;
+
+ /*
+ * Grab .q_usage_counter so request pool won't go away, then no
+ * request use-after-free is possible during nbd_handle_reply().
+ * If queue is frozen, there won't be any inflight requests, we
+ * needn't to handle the incoming garbage message.
+ */
+ if (!percpu_ref_tryget(&q->q_usage_counter)) {
+ dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n",
+ __func__);
+ break;
+ }
+
+ cmd = nbd_handle_reply(nbd, args->index, &reply);
+ if (IS_ERR(cmd)) {
+ percpu_ref_put(&q->q_usage_counter);
break;
}
- blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
+ rq = blk_mq_rq_from_pdu(cmd);
+ if (likely(!blk_should_fake_timeout(rq->q))) {
+ bool complete;
+
+ mutex_lock(&cmd->lock);
+ complete = __test_and_clear_bit(NBD_CMD_INFLIGHT,
+ &cmd->flags);
+ mutex_unlock(&cmd->lock);
+ if (complete)
+ blk_mq_complete_request(rq);
+ }
+ percpu_ref_put(&q->q_usage_counter);
}
+
+ mutex_lock(&nsock->tx_lock);
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
+
atomic_dec(&config->recv_threads);
wake_up(&config->recv_wq);
nbd_config_put(nbd);
kfree(args);
}
-static bool nbd_clear_req(struct request *req, void *data, bool reserved)
+static bool nbd_clear_req(struct request *req, void *data)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+ /* don't abort one completed request */
+ if (blk_mq_request_completed(req))
+ return true;
+
mutex_lock(&cmd->lock);
+ if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+ mutex_unlock(&cmd->lock);
+ return true;
+ }
cmd->status = BLK_STS_IOERR;
mutex_unlock(&cmd->lock);
@@ -872,35 +1103,39 @@ static int wait_for_reconnect(struct nbd_device *nbd)
struct nbd_config *config = nbd->config;
if (!config->dead_conn_timeout)
return 0;
- if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
+
+ if (!wait_event_timeout(config->conn_wait,
+ test_bit(NBD_RT_DISCONNECTED,
+ &config->runtime_flags) ||
+ atomic_read(&config->live_connections) > 0,
+ config->dead_conn_timeout))
return 0;
- return wait_event_timeout(config->conn_wait,
- atomic_read(&config->live_connections) > 0,
- config->dead_conn_timeout) > 0;
+
+ return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
}
-static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
+static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_device *nbd = cmd->nbd;
struct nbd_config *config;
struct nbd_sock *nsock;
- int ret;
+ blk_status_t ret;
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ lockdep_assert_held(&cmd->lock);
+
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Socks array is empty\n");
- blk_mq_start_request(req);
- return -EINVAL;
+ return BLK_STS_IOERR;
}
- config = nbd->config;
if (index >= config->num_connections) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Attempted send on invalid socket\n");
nbd_config_put(nbd);
- blk_mq_start_request(req);
- return -EINVAL;
+ return BLK_STS_IOERR;
}
cmd->status = BLK_STS_OK;
again:
@@ -923,8 +1158,7 @@ again:
*/
sock_shutdown(nbd);
nbd_config_put(nbd);
- blk_mq_start_request(req);
- return -EIO;
+ return BLK_STS_IOERR;
}
goto again;
}
@@ -937,21 +1171,10 @@ again:
blk_mq_start_request(req);
if (unlikely(nsock->pending && nsock->pending != req)) {
nbd_requeue_cmd(cmd);
- ret = 0;
+ ret = BLK_STS_OK;
goto out;
}
- /*
- * Some failures are related to the link going down, so anything that
- * returns EAGAIN can be retried on a different socket.
- */
ret = nbd_send_cmd(nbd, cmd, index);
- if (ret == -EAGAIN) {
- dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Request send failed, requeueing\n");
- nbd_mark_nsock_dead(nbd, nsock, 1);
- nbd_requeue_cmd(cmd);
- ret = 0;
- }
out:
mutex_unlock(&nsock->tx_lock);
nbd_config_put(nbd);
@@ -962,7 +1185,7 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
- int ret;
+ blk_status_t ret;
/*
* Since we look at the bio's to send the request over the network we
@@ -982,10 +1205,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
* appropriate.
*/
ret = nbd_handle_cmd(cmd, hctx->queue_num);
- if (ret < 0)
- ret = BLK_STS_IOERR;
- else if (!ret)
- ret = BLK_STS_OK;
mutex_unlock(&cmd->lock);
return ret;
@@ -1001,6 +1220,14 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
if (!sock)
return NULL;
+ if (!sk_is_tcp(sock->sk) &&
+ !sk_is_stream_unix(sock->sk)) {
+ dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n");
+ *err = -EINVAL;
+ sockfd_put(sock);
+ return NULL;
+ }
+
if (sock->ops->shutdown == sock_no_shutdown) {
dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
*err = -EINVAL;
@@ -1018,12 +1245,22 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
struct socket *sock;
struct nbd_sock **socks;
struct nbd_sock *nsock;
+ unsigned int memflags;
int err;
+ /* Arg will be cast to int, check it to avoid overflow */
+ if (arg > INT_MAX)
+ return -EINVAL;
sock = nbd_get_socket(nbd, arg, &err);
if (!sock)
return err;
+ /*
+ * We need to make sure we don't get any errant requests while we're
+ * reallocating the ->socks array.
+ */
+ memflags = blk_mq_freeze_queue(nbd->disk->queue);
+
if (!netlink && !nbd->task_setup &&
!test_bit(NBD_RT_BOUND, &config->runtime_flags))
nbd->task_setup = current;
@@ -1033,25 +1270,26 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
dev_err(disk_to_dev(nbd->disk),
"Device being setup by another task");
- sockfd_put(sock);
- return -EBUSY;
+ err = -EBUSY;
+ goto put_socket;
+ }
+
+ nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
+ if (!nsock) {
+ err = -ENOMEM;
+ goto put_socket;
}
socks = krealloc(config->socks, (config->num_connections + 1) *
sizeof(struct nbd_sock *), GFP_KERNEL);
if (!socks) {
- sockfd_put(sock);
- return -ENOMEM;
+ kfree(nsock);
+ err = -ENOMEM;
+ goto put_socket;
}
config->socks = socks;
- nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
- if (!nsock) {
- sockfd_put(sock);
- return -ENOMEM;
- }
-
nsock->fallback_index = -1;
nsock->dead = false;
mutex_init(&nsock->tx_lock);
@@ -1059,10 +1297,17 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
nsock->pending = NULL;
nsock->sent = 0;
nsock->cookie = 0;
+ INIT_WORK(&nsock->work, nbd_pending_cmd_work);
socks[config->num_connections++] = nsock;
atomic_inc(&config->live_connections);
+ blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
return 0;
+
+put_socket:
+ blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
+ sockfd_put(sock);
+ return err;
}
static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
@@ -1106,6 +1351,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
INIT_WORK(&args->work, recv_work);
args->index = i;
args->nbd = nbd;
+ args->nsock = nsock;
nsock->cookie++;
mutex_unlock(&nsock->tx_lock);
sockfd_put(old);
@@ -1126,30 +1372,19 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
return -ENOSPC;
}
-static void nbd_bdev_reset(struct block_device *bdev)
+static void nbd_bdev_reset(struct nbd_device *nbd)
{
- if (bdev->bd_openers > 1)
+ if (disk_openers(nbd->disk) > 1)
return;
- bd_set_size(bdev, 0);
+ set_capacity(nbd->disk, 0);
}
static void nbd_parse_flags(struct nbd_device *nbd)
{
- struct nbd_config *config = nbd->config;
- if (config->flags & NBD_FLAG_READ_ONLY)
+ if (nbd->config->flags & NBD_FLAG_READ_ONLY)
set_disk_ro(nbd->disk, true);
else
set_disk_ro(nbd->disk, false);
- if (config->flags & NBD_FLAG_SEND_TRIM)
- blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
- if (config->flags & NBD_FLAG_SEND_FLUSH) {
- if (config->flags & NBD_FLAG_SEND_FUA)
- blk_queue_write_cache(nbd->disk->queue, true, true);
- else
- blk_queue_write_cache(nbd->disk->queue, true, false);
- }
- else
- blk_queue_write_cache(nbd->disk->queue, false, false);
}
static void send_disconnects(struct nbd_device *nbd)
@@ -1166,10 +1401,10 @@ static void send_disconnects(struct nbd_device *nbd)
for (i = 0; i < config->num_connections; i++) {
struct nbd_sock *nsock = config->socks[i];
- iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
+ iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
mutex_lock(&nsock->tx_lock);
ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
- if (ret <= 0)
+ if (ret < 0)
dev_err(disk_to_dev(nbd->disk),
"Send disconnect failed %d\n", ret);
mutex_unlock(&nsock->tx_lock);
@@ -1200,11 +1435,19 @@ static void nbd_config_put(struct nbd_device *nbd)
&nbd->config_lock)) {
struct nbd_config *config = nbd->config;
nbd_dev_dbg_close(nbd);
- nbd_size_clear(nbd);
+ invalidate_disk(nbd->disk);
+ if (nbd->config->bytesize)
+ kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
&config->runtime_flags))
device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
- nbd->task_recv = NULL;
+ nbd->pid = 0;
+ if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
+ &config->runtime_flags)) {
+ device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
+ kfree(nbd->backend);
+ nbd->backend = NULL;
+ }
nbd_clear_sock(nbd);
if (config->num_connections) {
int i;
@@ -1217,15 +1460,7 @@ static void nbd_config_put(struct nbd_device *nbd)
kfree(nbd->config);
nbd->config = NULL;
- if (nbd->recv_workq)
- destroy_workqueue(nbd->recv_workq);
- nbd->recv_workq = NULL;
-
nbd->tag_set.timeout = 0;
- nbd->disk->queue->limits.discard_granularity = 0;
- nbd->disk->queue->limits.discard_alignment = 0;
- blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
- blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
@@ -1239,7 +1474,7 @@ static int nbd_start_device(struct nbd_device *nbd)
int num_connections = config->num_connections;
int error = 0, i;
- if (nbd->task_recv)
+ if (nbd->pid)
return -EBUSY;
if (!config->socks)
return -EINVAL;
@@ -1249,22 +1484,24 @@ static int nbd_start_device(struct nbd_device *nbd)
return -EINVAL;
}
- nbd->recv_workq = alloc_workqueue("knbd%d-recv",
- WQ_MEM_RECLAIM | WQ_HIGHPRI |
- WQ_UNBOUND, 0, nbd->index);
- if (!nbd->recv_workq) {
- dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
- return -ENOMEM;
+retry:
+ mutex_unlock(&nbd->config_lock);
+ blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections);
+ mutex_lock(&nbd->config_lock);
+
+ /* if another code path updated nr_hw_queues, retry until succeed */
+ if (num_connections != config->num_connections) {
+ num_connections = config->num_connections;
+ goto retry;
}
- blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
- nbd->task_recv = current;
+ nbd->pid = task_pid_nr(current);
nbd_parse_flags(nbd);
error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (error) {
- dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+ dev_err(disk_to_dev(nbd->disk), "device_create_file failed for pid!\n");
return error;
}
set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
@@ -1296,14 +1533,14 @@ static int nbd_start_device(struct nbd_device *nbd)
refcount_inc(&nbd->config_refs);
INIT_WORK(&args->work, recv_work);
args->nbd = nbd;
+ args->nsock = config->socks[i];
args->index = i;
queue_work(nbd->recv_workq, &args->work);
}
- nbd_size_update(nbd);
- return error;
+ return nbd_set_size(nbd, config->bytesize, nbd_blksize(config));
}
-static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_start_device_ioctl(struct nbd_device *nbd)
{
struct nbd_config *config = nbd->config;
int ret;
@@ -1313,16 +1550,18 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
return ret;
if (max_part)
- bdev->bd_invalidated = 1;
+ set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
mutex_unlock(&nbd->config_lock);
ret = wait_event_interruptible(config->recv_wq,
atomic_read(&config->recv_threads) == 0);
- if (ret)
+ if (ret) {
sock_shutdown(nbd);
- flush_workqueue(nbd->recv_workq);
+ nbd_clear_que(nbd);
+ }
+ flush_workqueue(nbd->recv_workq);
mutex_lock(&nbd->config_lock);
- nbd_bdev_reset(bdev);
+ nbd_bdev_reset(nbd);
/* user requested, ignore socket errors */
if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
ret = 0;
@@ -1331,30 +1570,23 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
return ret;
}
-static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
- struct block_device *bdev)
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd)
{
- sock_shutdown(nbd);
- __invalidate_device(bdev, true);
- nbd_bdev_reset(bdev);
+ nbd_clear_sock(nbd);
+ disk_force_media_change(nbd->disk);
+ nbd_bdev_reset(nbd);
if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
nbd_config_put(nbd);
}
-static bool nbd_is_valid_blksize(unsigned long blksize)
-{
- if (!blksize || !is_power_of_2(blksize) || blksize < 512 ||
- blksize > PAGE_SIZE)
- return false;
- return true;
-}
-
static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
{
nbd->tag_set.timeout = timeout * HZ;
if (timeout)
blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+ else
+ blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
}
/* Must be called with config_lock held */
@@ -1362,30 +1594,24 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
struct nbd_config *config = nbd->config;
+ loff_t bytesize;
switch (cmd) {
case NBD_DISCONNECT:
return nbd_disconnect(nbd);
case NBD_CLEAR_SOCK:
- nbd_clear_sock_ioctl(nbd, bdev);
+ nbd_clear_sock_ioctl(nbd);
return 0;
case NBD_SET_SOCK:
return nbd_add_socket(nbd, arg, false);
case NBD_SET_BLKSIZE:
- if (!arg)
- arg = NBD_DEF_BLKSIZE;
- if (!nbd_is_valid_blksize(arg))
- return -EINVAL;
- nbd_size_set(nbd, arg,
- div_s64(config->bytesize, arg));
- return 0;
+ return nbd_set_size(nbd, config->bytesize, arg);
case NBD_SET_SIZE:
- nbd_size_set(nbd, config->blksize,
- div_s64(arg, config->blksize));
- return 0;
+ return nbd_set_size(nbd, arg, nbd_blksize(config));
case NBD_SET_SIZE_BLOCKS:
- nbd_size_set(nbd, config->blksize, arg);
- return 0;
+ if (check_shl_overflow(arg, config->blksize_bits, &bytesize))
+ return -EINVAL;
+ return nbd_set_size(nbd, bytesize, nbd_blksize(config));
case NBD_SET_TIMEOUT:
nbd_set_cmd_timeout(nbd, arg);
return 0;
@@ -1394,7 +1620,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
config->flags = arg;
return 0;
case NBD_DO_IT:
- return nbd_start_device_ioctl(nbd, bdev);
+ return nbd_start_device_ioctl(nbd);
case NBD_CLEAR_QUE:
/*
* This is for compatibility only. The queue is always cleared
@@ -1411,7 +1637,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return -ENOTTY;
}
-static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
+static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nbd_device *nbd = bdev->bd_disk->private_data;
@@ -1441,29 +1667,49 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
return error;
}
-static struct nbd_config *nbd_alloc_config(void)
+static int nbd_alloc_and_init_config(struct nbd_device *nbd)
{
struct nbd_config *config;
+ if (WARN_ON(nbd->config))
+ return -EINVAL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
- if (!config)
- return NULL;
+ if (!config) {
+ module_put(THIS_MODULE);
+ return -ENOMEM;
+ }
+
atomic_set(&config->recv_threads, 0);
init_waitqueue_head(&config->recv_wq);
init_waitqueue_head(&config->conn_wait);
- config->blksize = NBD_DEF_BLKSIZE;
+ config->blksize_bits = NBD_DEF_BLKSIZE_BITS;
atomic_set(&config->live_connections, 0);
- try_module_get(THIS_MODULE);
- return config;
+
+ nbd->config = config;
+ /*
+ * Order refcount_set(&nbd->config_refs, 1) and nbd->config assignment,
+ * its pair is the barrier in nbd_get_config_unlocked().
+ * So nbd_get_config_unlocked() won't see nbd->config as null after
+ * refcount_inc_not_zero() succeed.
+ */
+ smp_mb__before_atomic();
+ refcount_set(&nbd->config_refs, 1);
+
+ return 0;
}
-static int nbd_open(struct block_device *bdev, fmode_t mode)
+static int nbd_open(struct gendisk *disk, blk_mode_t mode)
{
struct nbd_device *nbd;
+ struct nbd_config *config;
int ret = 0;
mutex_lock(&nbd_index_mutex);
- nbd = bdev->bd_disk->private_data;
+ nbd = disk->private_data;
if (!nbd) {
ret = -ENXIO;
goto out;
@@ -1472,45 +1718,52 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
ret = -ENXIO;
goto out;
}
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
- struct nbd_config *config;
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
mutex_lock(&nbd->config_lock);
if (refcount_inc_not_zero(&nbd->config_refs)) {
mutex_unlock(&nbd->config_lock);
goto out;
}
- config = nbd->config = nbd_alloc_config();
- if (!config) {
- ret = -ENOMEM;
+ ret = nbd_alloc_and_init_config(nbd);
+ if (ret) {
mutex_unlock(&nbd->config_lock);
goto out;
}
- refcount_set(&nbd->config_refs, 1);
+
refcount_inc(&nbd->refs);
mutex_unlock(&nbd->config_lock);
- bdev->bd_invalidated = 1;
- } else if (nbd_disconnected(nbd->config)) {
- bdev->bd_invalidated = 1;
+ if (max_part)
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ } else if (nbd_disconnected(config)) {
+ if (max_part)
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
}
out:
mutex_unlock(&nbd_index_mutex);
return ret;
}
-static void nbd_release(struct gendisk *disk, fmode_t mode)
+static void nbd_release(struct gendisk *disk)
{
struct nbd_device *nbd = disk->private_data;
- struct block_device *bdev = bdget_disk(disk, 0);
if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
- bdev->bd_openers == 0)
+ disk_openers(disk) == 0)
nbd_disconnect_and_put(nbd);
nbd_config_put(nbd);
nbd_put(nbd);
}
+static void nbd_free_disk(struct gendisk *disk)
+{
+ struct nbd_device *nbd = disk->private_data;
+
+ kfree(nbd);
+}
+
static const struct block_device_operations nbd_fops =
{
.owner = THIS_MODULE,
@@ -1518,6 +1771,7 @@ static const struct block_device_operations nbd_fops =
.release = nbd_release,
.ioctl = nbd_ioctl,
.compat_ioctl = nbd_ioctl,
+ .free_disk = nbd_free_disk,
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -1526,23 +1780,13 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
struct nbd_device *nbd = s->private;
- if (nbd->task_recv)
- seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
+ if (nbd->pid)
+ seq_printf(s, "recv: %d\n", nbd->pid);
return 0;
}
-static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nbd_dbg_tasks_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_tasks_ops = {
- .open = nbd_dbg_tasks_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
@@ -1563,21 +1807,15 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
seq_puts(s, "NBD_FLAG_SEND_FUA\n");
if (flags & NBD_FLAG_SEND_TRIM)
seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
+ if (flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n");
+ if (flags & NBD_FLAG_ROTATIONAL)
+ seq_puts(s, "NBD_FLAG_ROTATIONAL\n");
return 0;
}
-static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
-{
- return single_open(file, nbd_dbg_flags_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_flags_ops = {
- .open = nbd_dbg_flags_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
@@ -1588,18 +1826,18 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
return -EIO;
dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
- if (!dir) {
+ if (IS_ERR(dir)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
nbd_name(nbd));
return -EIO;
}
config->dbg_dir = dir;
- debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
+ debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
- debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
- debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
+ debugfs_create_u32("blocksize_bits", 0444, dir, &config->blksize_bits);
+ debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
return 0;
}
@@ -1614,7 +1852,7 @@ static int nbd_dbg_init(void)
struct dentry *dbg_dir;
dbg_dir = debugfs_create_dir("nbd", NULL);
- if (!dbg_dir)
+ if (IS_ERR(dbg_dir))
return -EIO;
nbd_dbg_dir = dbg_dir;
@@ -1666,105 +1904,126 @@ static const struct blk_mq_ops nbd_mq_ops = {
.timeout = nbd_xmit_timeout,
};
-static int nbd_dev_add(int index)
+static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
{
+ struct queue_limits lim = {
+ .max_hw_sectors = 65536,
+ .io_opt = 256 << SECTOR_SHIFT,
+ .max_segments = USHRT_MAX,
+ .max_segment_size = UINT_MAX,
+ };
struct nbd_device *nbd;
struct gendisk *disk;
- struct request_queue *q;
int err = -ENOMEM;
nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
if (!nbd)
goto out;
- disk = alloc_disk(1 << part_shift);
- if (!disk)
+ nbd->tag_set.ops = &nbd_mq_ops;
+ nbd->tag_set.nr_hw_queues = 1;
+ nbd->tag_set.queue_depth = 128;
+ nbd->tag_set.numa_node = NUMA_NO_NODE;
+ nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
+ nbd->tag_set.flags = BLK_MQ_F_BLOCKING;
+ nbd->tag_set.driver_data = nbd;
+ INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
+ nbd->backend = NULL;
+
+ err = blk_mq_alloc_tag_set(&nbd->tag_set);
+ if (err)
goto out_free_nbd;
+ mutex_lock(&nbd_index_mutex);
if (index >= 0) {
err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
GFP_KERNEL);
if (err == -ENOSPC)
err = -EEXIST;
} else {
- err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
+ err = idr_alloc(&nbd_index_idr, nbd, 0,
+ (MINORMASK >> part_shift) + 1, GFP_KERNEL);
if (err >= 0)
index = err;
}
- if (err < 0)
- goto out_free_disk;
-
nbd->index = index;
- nbd->disk = disk;
- nbd->tag_set.ops = &nbd_mq_ops;
- nbd->tag_set.nr_hw_queues = 1;
- nbd->tag_set.queue_depth = 128;
- nbd->tag_set.numa_node = NUMA_NO_NODE;
- nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
- nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
- BLK_MQ_F_BLOCKING;
- nbd->tag_set.driver_data = nbd;
- nbd->destroy_complete = NULL;
+ mutex_unlock(&nbd_index_mutex);
+ if (err < 0)
+ goto out_free_tags;
- err = blk_mq_alloc_tag_set(&nbd->tag_set);
- if (err)
+ disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL);
+ if (IS_ERR(disk)) {
+ err = PTR_ERR(disk);
goto out_free_idr;
-
- q = blk_mq_init_queue(&nbd->tag_set);
- if (IS_ERR(q)) {
- err = PTR_ERR(q);
- goto out_free_tags;
}
- disk->queue = q;
+ nbd->disk = disk;
- /*
- * Tell the block layer that we are not a rotational device
- */
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
- disk->queue->limits.discard_granularity = 0;
- disk->queue->limits.discard_alignment = 0;
- blk_queue_max_discard_sectors(disk->queue, 0);
- blk_queue_max_segment_size(disk->queue, UINT_MAX);
- blk_queue_max_segments(disk->queue, USHRT_MAX);
- blk_queue_max_hw_sectors(disk->queue, 65536);
- disk->queue->limits.max_sectors = 256;
+ nbd->recv_workq = alloc_workqueue("nbd%d-recv",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_UNBOUND, 0, nbd->index);
+ if (!nbd->recv_workq) {
+ dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
+ err = -ENOMEM;
+ goto out_err_disk;
+ }
mutex_init(&nbd->config_lock);
refcount_set(&nbd->config_refs, 0);
- refcount_set(&nbd->refs, 1);
+ /*
+ * Start out with a zero references to keep other threads from using
+ * this device until it is fully initialized.
+ */
+ refcount_set(&nbd->refs, 0);
INIT_LIST_HEAD(&nbd->list);
disk->major = NBD_MAJOR;
disk->first_minor = index << part_shift;
+ disk->minors = 1 << part_shift;
disk->fops = &nbd_fops;
disk->private_data = nbd;
sprintf(disk->disk_name, "nbd%d", index);
- add_disk(disk);
+ err = add_disk(disk);
+ if (err)
+ goto out_free_work;
+
+ /*
+ * Now publish the device.
+ */
+ refcount_set(&nbd->refs, refs);
nbd_total_devices++;
- return index;
+ return nbd;
-out_free_tags:
- blk_mq_free_tag_set(&nbd->tag_set);
+out_free_work:
+ destroy_workqueue(nbd->recv_workq);
+out_err_disk:
+ put_disk(disk);
out_free_idr:
+ mutex_lock(&nbd_index_mutex);
idr_remove(&nbd_index_idr, index);
-out_free_disk:
- put_disk(disk);
+ mutex_unlock(&nbd_index_mutex);
+out_free_tags:
+ blk_mq_free_tag_set(&nbd->tag_set);
out_free_nbd:
kfree(nbd);
out:
- return err;
+ return ERR_PTR(err);
}
-static int find_free_cb(int id, void *ptr, void *data)
+static struct nbd_device *nbd_find_get_unused(void)
{
- struct nbd_device *nbd = ptr;
- struct nbd_device **found = data;
+ struct nbd_device *nbd;
+ int id;
- if (!refcount_read(&nbd->config_refs)) {
- *found = nbd;
- return 1;
+ lockdep_assert_held(&nbd_index_mutex);
+
+ idr_for_each_entry(&nbd_index_idr, nbd, id) {
+ if (refcount_read(&nbd->config_refs) ||
+ test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
+ continue;
+ if (refcount_inc_not_zero(&nbd->refs))
+ return nbd;
}
- return 0;
+
+ return NULL;
}
/* Netlink interface. */
@@ -1778,6 +2037,7 @@ static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
[NBD_ATTR_SOCKETS] = { .type = NLA_NESTED},
[NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
[NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED},
+ [NBD_ATTR_BACKEND_IDENTIFIER] = { .type = NLA_STRING},
};
static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
@@ -1796,31 +2056,23 @@ nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
{
struct nbd_config *config = nbd->config;
- u64 bsize = config->blksize;
+ u64 bsize = nbd_blksize(config);
u64 bytes = config->bytesize;
if (info->attrs[NBD_ATTR_SIZE_BYTES])
bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
- if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
+ if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
- if (!bsize)
- bsize = NBD_DEF_BLKSIZE;
- if (!nbd_is_valid_blksize(bsize)) {
- printk(KERN_ERR "Invalid block size %llu\n", bsize);
- return -EINVAL;
- }
- }
- if (bytes != config->bytesize || bsize != config->blksize)
- nbd_size_set(nbd, bsize, div64_u64(bytes, bsize));
+ if (bytes != config->bytesize || bsize != nbd_blksize(config))
+ return nbd_set_size(nbd, bytes, bsize);
return 0;
}
static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
{
- DECLARE_COMPLETION_ONSTACK(destroy_complete);
- struct nbd_device *nbd = NULL;
+ struct nbd_device *nbd;
struct nbd_config *config;
int index = -1;
int ret;
@@ -1829,68 +2081,53 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (info->attrs[NBD_ATTR_INDEX])
+ if (info->attrs[NBD_ATTR_INDEX]) {
index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
- if (!info->attrs[NBD_ATTR_SOCKETS]) {
- printk(KERN_ERR "nbd: must specify at least one socket\n");
+
+ /*
+ * Too big first_minor can cause duplicate creation of
+ * sysfs files/links, since index << part_shift might overflow, or
+ * MKDEV() expect that the max bits of first_minor is 20.
+ */
+ if (index < 0 || index > MINORMASK >> part_shift) {
+ pr_err("illegal input index %d\n", index);
+ return -EINVAL;
+ }
+ }
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SOCKETS)) {
+ pr_err("must specify at least one socket\n");
return -EINVAL;
}
- if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
- printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SIZE_BYTES)) {
+ pr_err("must specify a size in bytes for the device\n");
return -EINVAL;
}
again:
mutex_lock(&nbd_index_mutex);
if (index == -1) {
- ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
- if (ret == 0) {
- int new_index;
- new_index = nbd_dev_add(-1);
- if (new_index < 0) {
- mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: failed to add new device\n");
- return new_index;
- }
- nbd = idr_find(&nbd_index_idr, new_index);
- }
+ nbd = nbd_find_get_unused();
} else {
nbd = idr_find(&nbd_index_idr, index);
- if (!nbd) {
- ret = nbd_dev_add(index);
- if (ret < 0) {
+ if (nbd) {
+ if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
+ test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
+ !refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: failed to add new device\n");
- return ret;
+ pr_err("device at index %d is going down\n",
+ index);
+ return -EINVAL;
}
- nbd = idr_find(&nbd_index_idr, index);
}
}
- if (!nbd) {
- printk(KERN_ERR "nbd: couldn't find device at index %d\n",
- index);
- mutex_unlock(&nbd_index_mutex);
- return -EINVAL;
- }
-
- if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
- test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
- nbd->destroy_complete = &destroy_complete;
- mutex_unlock(&nbd_index_mutex);
-
- /* Wait untill the the nbd stuff is totally destroyed */
- wait_for_completion(&destroy_complete);
- goto again;
- }
+ mutex_unlock(&nbd_index_mutex);
- if (!refcount_inc_not_zero(&nbd->refs)) {
- mutex_unlock(&nbd_index_mutex);
- if (index == -1)
- goto again;
- printk(KERN_ERR "nbd: device at index %d is going down\n",
- index);
- return -EINVAL;
+ if (!nbd) {
+ nbd = nbd_dev_add(index, 2);
+ if (IS_ERR(nbd)) {
+ pr_err("failed to add new device\n");
+ return PTR_ERR(nbd);
+ }
}
- mutex_unlock(&nbd_index_mutex);
mutex_lock(&nbd->config_lock);
if (refcount_read(&nbd->config_refs)) {
@@ -1898,24 +2135,20 @@ again:
nbd_put(nbd);
if (index == -1)
goto again;
- printk(KERN_ERR "nbd: nbd%d already in use\n", index);
+ pr_err("nbd%d already in use\n", index);
return -EBUSY;
}
- if (WARN_ON(nbd->config)) {
- mutex_unlock(&nbd->config_lock);
- nbd_put(nbd);
- return -EINVAL;
- }
- config = nbd->config = nbd_alloc_config();
- if (!nbd->config) {
+
+ ret = nbd_alloc_and_init_config(nbd);
+ if (ret) {
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
- printk(KERN_ERR "nbd: couldn't allocate config\n");
- return -ENOMEM;
+ pr_err("couldn't allocate config\n");
+ return ret;
}
- refcount_set(&nbd->config_refs, 1);
- set_bit(NBD_RT_BOUND, &config->runtime_flags);
+ config = nbd->config;
+ set_bit(NBD_RT_BOUND, &config->runtime_flags);
ret = nbd_genl_size_set(info, nbd);
if (ret)
goto out;
@@ -1934,12 +2167,21 @@ again:
if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
- set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
- &config->runtime_flags);
- set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
- put_dev = true;
+ /*
+ * We have 1 ref to keep the device around, and then 1
+ * ref for our current operation here, which will be
+ * inherited by the config. If we already have
+ * DESTROY_ON_DISCONNECT set then we know we don't have
+ * that extra ref already held so we don't need the
+ * put_dev.
+ */
+ if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
+ put_dev = true;
} else {
- clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
+ if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
+ refcount_inc(&nbd->refs);
}
if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
@@ -1956,7 +2198,7 @@ again:
struct nlattr *socks[NBD_SOCK_MAX+1];
if (nla_type(attr) != NBD_SOCK_ITEM) {
- printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+ pr_err("socks must be embedded in a SOCK_ITEM attr\n");
ret = -EINVAL;
goto out;
}
@@ -1965,7 +2207,7 @@ again:
nbd_sock_policy,
info->extack);
if (ret != 0) {
- printk(KERN_ERR "nbd: error processing sock list\n");
+ pr_err("error processing sock list\n");
ret = -EINVAL;
goto out;
}
@@ -1977,14 +2219,32 @@ again:
goto out;
}
}
+
+ if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
+ nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
+ GFP_KERNEL);
+ if (!nbd->backend) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ ret = device_create_file(disk_to_dev(nbd->disk), &backend_attr);
+ if (ret) {
+ dev_err(disk_to_dev(nbd->disk),
+ "device_create_file failed for backend!\n");
+ goto out;
+ }
+ set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
+
ret = nbd_start_device(nbd);
out:
- mutex_unlock(&nbd->config_lock);
if (!ret) {
set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
refcount_inc(&nbd->config_refs);
nbd_connect_reply(info, nbd->index);
}
+ mutex_unlock(&nbd->config_lock);
+
nbd_config_put(nbd);
if (put_dev)
nbd_put(nbd);
@@ -1995,14 +2255,18 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
{
mutex_lock(&nbd->config_lock);
nbd_disconnect(nbd);
- nbd_clear_sock(nbd);
- mutex_unlock(&nbd->config_lock);
+ sock_shutdown(nbd);
+ wake_up(&nbd->config->conn_wait);
/*
- * Make sure recv thread has finished, so it does not drop the last
- * config ref and try to destroy the workqueue from inside the work
- * queue.
+ * Make sure recv thread has finished, we can safely call nbd_clear_que()
+ * to cancel the inflight I/Os.
*/
flush_workqueue(nbd->recv_workq);
+ nbd_clear_que(nbd);
+ nbd->task_setup = NULL;
+ clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags);
+ mutex_unlock(&nbd->config_lock);
+
if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
nbd_config_put(nbd);
@@ -2016,8 +2280,8 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (!info->attrs[NBD_ATTR_INDEX]) {
- printk(KERN_ERR "nbd: must specify an index to disconnect\n");
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
+ pr_err("must specify an index to disconnect\n");
return -EINVAL;
}
index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
@@ -2025,23 +2289,20 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
nbd = idr_find(&nbd_index_idr, index);
if (!nbd) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: couldn't find device at index %d\n",
- index);
+ pr_err("couldn't find device at index %d\n", index);
return -EINVAL;
}
if (!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: device at index %d is going down\n",
- index);
+ pr_err("device at index %d is going down\n", index);
return -EINVAL;
}
mutex_unlock(&nbd_index_mutex);
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
- nbd_put(nbd);
- return 0;
- }
+ if (!refcount_inc_not_zero(&nbd->config_refs))
+ goto put_nbd;
nbd_disconnect_and_put(nbd);
nbd_config_put(nbd);
+put_nbd:
nbd_put(nbd);
return 0;
}
@@ -2057,8 +2318,8 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (!info->attrs[NBD_ATTR_INDEX]) {
- printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
+ pr_err("must specify a device to reconfigure\n");
return -EINVAL;
}
index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
@@ -2066,19 +2327,34 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
nbd = idr_find(&nbd_index_idr, index);
if (!nbd) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
- index);
+ pr_err("couldn't find a device at index %d\n", index);
return -EINVAL;
}
+ if (nbd->backend) {
+ if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
+ if (nla_strcmp(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
+ nbd->backend)) {
+ mutex_unlock(&nbd_index_mutex);
+ dev_err(nbd_to_dev(nbd),
+ "backend image doesn't match with %s\n",
+ nbd->backend);
+ return -EINVAL;
+ }
+ } else {
+ mutex_unlock(&nbd_index_mutex);
+ dev_err(nbd_to_dev(nbd), "must specify backend\n");
+ return -EINVAL;
+ }
+ }
if (!refcount_inc_not_zero(&nbd->refs)) {
mutex_unlock(&nbd_index_mutex);
- printk(KERN_ERR "nbd: device at index %d is going down\n",
- index);
+ pr_err("device at index %d is going down\n", index);
return -EINVAL;
}
mutex_unlock(&nbd_index_mutex);
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
dev_err(nbd_to_dev(nbd),
"not configured, cannot reconfigure\n");
nbd_put(nbd);
@@ -2086,9 +2362,8 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
}
mutex_lock(&nbd->config_lock);
- config = nbd->config;
if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
- !nbd->task_recv) {
+ !nbd->pid) {
dev_err(nbd_to_dev(nbd),
"not configured, cannot reconfigure\n");
ret = -EINVAL;
@@ -2110,15 +2385,13 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
- if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT,
- &config->runtime_flags))
+ if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
put_dev = true;
- set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
} else {
- if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT,
- &config->runtime_flags))
+ if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+ &nbd->flags))
refcount_inc(&nbd->refs);
- clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags);
}
if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
@@ -2139,7 +2412,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
struct nlattr *socks[NBD_SOCK_MAX+1];
if (nla_type(attr) != NBD_SOCK_ITEM) {
- printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+ pr_err("socks must be embedded in a SOCK_ITEM attr\n");
ret = -EINVAL;
goto out;
}
@@ -2148,7 +2421,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
nbd_sock_policy,
info->extack);
if (ret != 0) {
- printk(KERN_ERR "nbd: error processing sock list\n");
+ pr_err("error processing sock list\n");
ret = -EINVAL;
goto out;
}
@@ -2173,7 +2446,7 @@ out:
return ret;
}
-static const struct genl_ops nbd_connect_genl_ops[] = {
+static const struct genl_small_ops nbd_connect_genl_ops[] = {
{
.cmd = NBD_CMD_CONNECT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -2205,13 +2478,16 @@ static struct genl_family nbd_genl_family __ro_after_init = {
.name = NBD_GENL_FAMILY_NAME,
.version = NBD_GENL_VERSION,
.module = THIS_MODULE,
- .ops = nbd_connect_genl_ops,
- .n_ops = ARRAY_SIZE(nbd_connect_genl_ops),
+ .small_ops = nbd_connect_genl_ops,
+ .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops),
+ .resv_start_op = NBD_CMD_STATUS + 1,
.maxattr = NBD_ATTR_MAX,
+ .netnsok = 1,
.policy = nbd_attr_policy,
.mcgrps = nbd_mcast_grps,
.n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
};
+MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME);
static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
{
@@ -2277,6 +2553,12 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
}
dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
+ if (!dev_list) {
+ nlmsg_free(reply);
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
if (index == -1) {
ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
if (ret) {
@@ -2365,7 +2647,7 @@ static int __init nbd_init(void)
BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
if (max_part < 0) {
- printk(KERN_ERR "nbd: max_part must be >= 0\n");
+ pr_err("max_part must be >= 0\n");
return -EINVAL;
}
@@ -2393,16 +2675,21 @@ static int __init nbd_init(void)
if (register_blkdev(NBD_MAJOR, "nbd"))
return -EIO;
+ nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0);
+ if (!nbd_del_wq) {
+ unregister_blkdev(NBD_MAJOR, "nbd");
+ return -ENOMEM;
+ }
+
if (genl_register_family(&nbd_genl_family)) {
+ destroy_workqueue(nbd_del_wq);
unregister_blkdev(NBD_MAJOR, "nbd");
return -EINVAL;
}
nbd_dbg_init();
- mutex_lock(&nbd_index_mutex);
for (i = 0; i < nbds_max; i++)
- nbd_dev_add(i);
- mutex_unlock(&nbd_index_mutex);
+ nbd_dev_add(i, 1);
return 0;
}
@@ -2411,7 +2698,10 @@ static int nbd_exit_cb(int id, void *ptr, void *data)
struct list_head *list = (struct list_head *)data;
struct nbd_device *nbd = ptr;
- list_add_tail(&nbd->list, list);
+ /* Skip nbd that is being removed asynchronously */
+ if (refcount_read(&nbd->refs))
+ list_add_tail(&nbd->list, list);
+
return 0;
}
@@ -2420,6 +2710,12 @@ static void __exit nbd_cleanup(void)
struct nbd_device *nbd;
LIST_HEAD(del_list);
+ /*
+ * Unregister netlink interface prior to waiting
+ * for the completion of netlink commands.
+ */
+ genl_unregister_family(&nbd_genl_family);
+
nbd_dbg_close();
mutex_lock(&nbd_index_mutex);
@@ -2429,13 +2725,18 @@ static void __exit nbd_cleanup(void)
while (!list_empty(&del_list)) {
nbd = list_first_entry(&del_list, struct nbd_device, list);
list_del_init(&nbd->list);
+ if (refcount_read(&nbd->config_refs))
+ pr_err("possibly leaking nbd_config (ref %d)\n",
+ refcount_read(&nbd->config_refs));
if (refcount_read(&nbd->refs) != 1)
- printk(KERN_ERR "nbd: possibly leaking a device\n");
+ pr_err("possibly leaking a device\n");
nbd_put(nbd);
}
+ /* Also wait for nbd_dev_remove_work() completes */
+ destroy_workqueue(nbd_del_wq);
+
idr_destroy(&nbd_index_idr);
- genl_unregister_family(&nbd_genl_family);
unregister_blkdev(NBD_MAJOR, "nbd");
}