summaryrefslogtreecommitdiff
path: root/drivers/block/nbd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/nbd.c')
-rw-r--r--drivers/block/nbd.c529
1 files changed, 347 insertions, 182 deletions
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 592cfa8b765a..f6c33b21f69e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -62,11 +62,13 @@ struct nbd_sock {
bool dead;
int fallback_index;
int cookie;
+ struct work_struct work;
};
struct recv_thread_args {
struct work_struct work;
struct nbd_device *nbd;
+ struct nbd_sock *nsock;
int index;
};
@@ -140,6 +142,9 @@ struct nbd_device {
*/
#define NBD_CMD_INFLIGHT 2
+/* Just part of request header or data payload is sent successfully */
+#define NBD_CMD_PARTIAL_SEND 3
+
struct nbd_cmd {
struct nbd_device *nbd;
struct mutex lock;
@@ -180,6 +185,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
+ lockdep_assert_held(&cmd->lock);
+
+ /*
+ * Clear INFLIGHT flag so that this cmd won't be completed in
+ * normal completion path
+ *
+ * INFLIGHT flag will be set when the cmd is queued to nbd next
+ * time.
+ */
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+
if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
blk_mq_requeue_request(req, true);
}
@@ -221,7 +237,7 @@ static ssize_t pid_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
- struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
+ struct nbd_device *nbd = disk->private_data;
return sprintf(buf, "%d\n", nbd->pid);
}
@@ -235,7 +251,7 @@ static ssize_t backend_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
- struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
+ struct nbd_device *nbd = disk->private_data;
return sprintf(buf, "%s\n", nbd->backend ?: "");
}
@@ -250,7 +266,6 @@ static void nbd_dev_remove(struct nbd_device *nbd)
struct gendisk *disk = nbd->disk;
del_gendisk(disk);
- put_disk(disk);
blk_mq_free_tag_set(&nbd->tag_set);
/*
@@ -261,7 +276,7 @@ static void nbd_dev_remove(struct nbd_device *nbd)
idr_remove(&nbd_index_idr, nbd->index);
mutex_unlock(&nbd_index_mutex);
destroy_workqueue(nbd->recv_workq);
- kfree(nbd);
+ put_disk(disk);
}
static void nbd_dev_remove_work(struct work_struct *work)
@@ -296,7 +311,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
if (args) {
INIT_WORK(&args->work, nbd_dead_link_work);
args->index = nbd->index;
- queue_work(system_wq, &args->work);
+ queue_work(system_percpu_wq, &args->work);
}
}
if (!nsock->dead) {
@@ -316,27 +331,49 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0;
}
-static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
- loff_t blksize)
+static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize)
{
+ struct queue_limits lim;
+ int error;
+
if (!blksize)
blksize = 1u << NBD_DEF_BLKSIZE_BITS;
if (blk_validate_block_size(blksize))
return -EINVAL;
+ if (bytesize < 0)
+ return -EINVAL;
+
nbd->config->bytesize = bytesize;
nbd->config->blksize_bits = __ffs(blksize);
if (!nbd->pid)
return 0;
- if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
- nbd->disk->queue->limits.discard_granularity = blksize;
- blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
- }
- blk_queue_logical_block_size(nbd->disk->queue, blksize);
- blk_queue_physical_block_size(nbd->disk->queue, blksize);
+ lim = queue_limits_start_update(nbd->disk->queue);
+ if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
+ lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT;
+ else
+ lim.max_hw_discard_sectors = 0;
+ if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) {
+ lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
+ } else if (nbd->config->flags & NBD_FLAG_SEND_FUA) {
+ lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
+ } else {
+ lim.features |= BLK_FEAT_WRITE_CACHE;
+ lim.features &= ~BLK_FEAT_FUA;
+ }
+ if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
+ lim.features |= BLK_FEAT_ROTATIONAL;
+ if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT;
+
+ lim.logical_block_size = blksize;
+ lim.physical_block_size = blksize;
+ error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim);
+ if (error)
+ return error;
if (max_part)
set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
@@ -388,11 +425,29 @@ static u32 req_to_nbd_cmd_type(struct request *req)
return NBD_CMD_WRITE;
case REQ_OP_READ:
return NBD_CMD_READ;
+ case REQ_OP_WRITE_ZEROES:
+ return NBD_CMD_WRITE_ZEROES;
default:
return U32_MAX;
}
}
+static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd)
+{
+ if (refcount_inc_not_zero(&nbd->config_refs)) {
+ /*
+ * Add smp_mb__after_atomic to ensure that reading nbd->config_refs
+ * and reading nbd->config is ordered. The pair is the barrier in
+ * nbd_alloc_and_init_config(), avoid nbd->config_refs is set
+ * before nbd->config.
+ */
+ smp_mb__after_atomic();
+ return nbd->config;
+ }
+
+ return NULL;
+}
+
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
@@ -402,18 +457,24 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
if (!mutex_trylock(&cmd->lock))
return BLK_EH_RESET_TIMER;
+ /* partial send is handled in nbd_sock's work function */
+ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) {
+ mutex_unlock(&cmd->lock);
+ return BLK_EH_RESET_TIMER;
+ }
+
if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
mutex_unlock(&cmd->lock);
return BLK_EH_DONE;
}
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
cmd->status = BLK_STS_TIMEOUT;
__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
mutex_unlock(&cmd->lock);
goto done;
}
- config = nbd->config;
if (config->num_connections > 1 ||
(config->num_connections == 1 && nbd->tag_set.timeout)) {
@@ -442,8 +503,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
}
- mutex_unlock(&cmd->lock);
nbd_requeue_cmd(cmd);
+ mutex_unlock(&cmd->lock);
nbd_config_put(nbd);
return BLK_EH_DONE;
}
@@ -487,17 +548,11 @@ done:
return BLK_EH_DONE;
}
-/*
- * Send or receive packet. Return a positive value on success and
- * negtive value on failue, and never return 0.
- */
-static int sock_xmit(struct nbd_device *nbd, int index, int send,
- struct iov_iter *iter, int msg_flags, int *sent)
+static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
+ struct iov_iter *iter, int msg_flags, int *sent)
{
- struct nbd_config *config = nbd->config;
- struct socket *sock = config->socks[index]->sock;
int result;
- struct msghdr msg;
+ struct msghdr msg = {} ;
unsigned int noreclaim_flag;
if (unlikely(!sock)) {
@@ -510,28 +565,27 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
msg.msg_iter = *iter;
noreclaim_flag = memalloc_noreclaim_save();
- do {
- sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
- sock->sk->sk_use_task_frag = false;
- msg.msg_name = NULL;
- msg.msg_namelen = 0;
- msg.msg_control = NULL;
- msg.msg_controllen = 0;
- msg.msg_flags = msg_flags | MSG_NOSIGNAL;
-
- if (send)
- result = sock_sendmsg(sock, &msg);
- else
- result = sock_recvmsg(sock, &msg, msg.msg_flags);
-
- if (result <= 0) {
- if (result == 0)
- result = -EPIPE; /* short read */
- break;
- }
- if (sent)
- *sent += result;
- } while (msg_data_left(&msg));
+
+ scoped_with_kernel_creds() {
+ do {
+ sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+ sock->sk->sk_use_task_frag = false;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (send)
+ result = sock_sendmsg(sock, &msg);
+ else
+ result = sock_recvmsg(sock, &msg, msg.msg_flags);
+
+ if (result <= 0) {
+ if (result == 0)
+ result = -EPIPE; /* short read */
+ break;
+ }
+ if (sent)
+ *sent += result;
+ } while (msg_data_left(&msg));
+ }
memalloc_noreclaim_restore(noreclaim_flag);
@@ -539,6 +593,19 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
}
/*
+ * Send or receive packet. Return a positive value on success and
+ * negtive value on failure, and never return 0.
+ */
+static int sock_xmit(struct nbd_device *nbd, int index, int send,
+ struct iov_iter *iter, int msg_flags, int *sent)
+{
+ struct nbd_config *config = nbd->config;
+ struct socket *sock = config->socks[index]->sock;
+
+ return __sock_xmit(nbd, sock, send, iter, msg_flags, sent);
+}
+
+/*
* Different settings for sk->sk_sndtimeo can result in different return values
* if there is a signal pending when we enter sendmsg, because reasons?
*/
@@ -547,8 +614,36 @@ static inline int was_interrupted(int result)
return result == -ERESTARTSYS || result == -EINTR;
}
-/* always call with the tx_lock held */
-static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
+/*
+ * We've already sent header or part of data payload, have no choice but
+ * to set pending and schedule it in work.
+ *
+ * And we have to return BLK_STS_OK to block core, otherwise this same
+ * request may be re-dispatched with different tag, but our header has
+ * been sent out with old tag, and this way does confuse reply handling.
+ */
+static void nbd_sched_pending_work(struct nbd_device *nbd,
+ struct nbd_sock *nsock,
+ struct nbd_cmd *cmd, int sent)
+{
+ struct request *req = blk_mq_rq_from_pdu(cmd);
+
+ /* pending work should be scheduled only once */
+ WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags));
+
+ nsock->pending = req;
+ nsock->sent = sent;
+ set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+ refcount_inc(&nbd->config_refs);
+ schedule_work(&nsock->work);
+}
+
+/*
+ * Returns BLK_STS_RESOURCE if the caller should retry after a delay.
+ * Returns BLK_STS_IOERR if sending failed.
+ */
+static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
+ int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_config *config = nbd->config;
@@ -557,28 +652,32 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
struct iov_iter from;
- unsigned long size = blk_rq_bytes(req);
struct bio *bio;
u64 handle;
u32 type;
u32 nbd_cmd_flags = 0;
int sent = nsock->sent, skip = 0;
+ lockdep_assert_held(&cmd->lock);
+ lockdep_assert_held(&nsock->tx_lock);
+
iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
type = req_to_nbd_cmd_type(req);
if (type == U32_MAX)
- return -EIO;
+ return BLK_STS_IOERR;
if (rq_data_dir(req) == WRITE &&
(config->flags & NBD_FLAG_READ_ONLY)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Write on read-only\n");
- return -EIO;
+ return BLK_STS_IOERR;
}
if (req->cmd_flags & REQ_FUA)
nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
+ if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES))
+ nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;
/* We did a partial send previously, and we at least sent the whole
* request struct, so just go and send the rest of the pages in the
@@ -603,10 +702,10 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
request.type = htonl(type | nbd_cmd_flags);
if (type != NBD_CMD_FLUSH) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
- request.len = htonl(size);
+ request.len = htonl(blk_rq_bytes(req));
}
handle = nbd_cmd_handle(cmd);
- memcpy(request.handle, &handle, sizeof(handle));
+ request.cookie = cpu_to_be64(handle);
trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
@@ -618,21 +717,21 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
trace_nbd_header_sent(req, handle);
if (result < 0) {
if (was_interrupted(result)) {
- /* If we havne't sent anything we can just return BUSY,
+ /* If we haven't sent anything we can just return BUSY,
* however if we have sent something we need to make
* sure we only allow this req to be sent until we are
* completely done.
*/
if (sent) {
- nsock->pending = req;
- nsock->sent = sent;
+ nbd_sched_pending_work(nbd, nsock, cmd, sent);
+ return BLK_STS_OK;
}
set_bit(NBD_CMD_REQUEUED, &cmd->flags);
return BLK_STS_RESOURCE;
}
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Send control failed (result %d)\n", result);
- return -EAGAIN;
+ goto requeue;
}
send_pages:
if (type != NBD_CMD_WRITE)
@@ -662,19 +761,13 @@ send_pages:
result = sock_xmit(nbd, index, 1, &from, flags, &sent);
if (result < 0) {
if (was_interrupted(result)) {
- /* We've already sent the header, we
- * have no choice but to set pending and
- * return BUSY.
- */
- nsock->pending = req;
- nsock->sent = sent;
- set_bit(NBD_CMD_REQUEUED, &cmd->flags);
- return BLK_STS_RESOURCE;
+ nbd_sched_pending_work(nbd, nsock, cmd, sent);
+ return BLK_STS_OK;
}
dev_err(disk_to_dev(nbd->disk),
"Send data failed (result %d)\n",
result);
- return -EAGAIN;
+ goto requeue;
}
/*
* The completion might already have come in,
@@ -691,10 +784,65 @@ out:
trace_nbd_payload_sent(req, handle);
nsock->pending = NULL;
nsock->sent = 0;
- return 0;
+ __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+ return BLK_STS_OK;
+
+requeue:
+ /*
+ * Can't requeue in case we are dealing with partial send
+ *
+ * We must run from pending work function.
+ * */
+ if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))
+ return BLK_STS_OK;
+
+ /* retry on a different socket */
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Request send failed, requeueing\n");
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ nbd_requeue_cmd(cmd);
+ return BLK_STS_OK;
+}
+
+/* handle partial sending */
+static void nbd_pending_cmd_work(struct work_struct *work)
+{
+ struct nbd_sock *nsock = container_of(work, struct nbd_sock, work);
+ struct request *req = nsock->pending;
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+ struct nbd_device *nbd = cmd->nbd;
+ unsigned long deadline = READ_ONCE(req->deadline);
+ unsigned int wait_ms = 2;
+
+ mutex_lock(&cmd->lock);
+
+ WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags));
+ if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)))
+ goto out;
+
+ mutex_lock(&nsock->tx_lock);
+ while (true) {
+ nbd_send_cmd(nbd, cmd, cmd->index);
+ if (!nsock->pending)
+ break;
+
+ /* don't bother timeout handler for partial sending */
+ if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) {
+ cmd->status = BLK_STS_IOERR;
+ blk_mq_complete_request(req);
+ break;
+ }
+ msleep(wait_ms);
+ wait_ms *= 2;
+ }
+ mutex_unlock(&nsock->tx_lock);
+ clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
+out:
+ mutex_unlock(&cmd->lock);
+ nbd_config_put(nbd);
}
-static int nbd_read_reply(struct nbd_device *nbd, int index,
+static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock,
struct nbd_reply *reply)
{
struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
@@ -703,7 +851,7 @@ static int nbd_read_reply(struct nbd_device *nbd, int index,
reply->magic = 0;
iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply));
- result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
+ result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL);
if (result < 0) {
if (!nbd_disconnected(nbd->config))
dev_err(disk_to_dev(nbd->disk),
@@ -732,7 +880,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
u32 tag;
int ret = 0;
- memcpy(&handle, reply->handle, sizeof(handle));
+ handle = be64_to_cpu(reply->cookie);
tag = nbd_handle_to_tag(handle);
hwq = blk_mq_unique_tag_to_hwq(tag);
if (hwq < nbd->tag_set.nr_hw_queues)
@@ -827,14 +975,14 @@ static void recv_work(struct work_struct *work)
struct nbd_device *nbd = args->nbd;
struct nbd_config *config = nbd->config;
struct request_queue *q = nbd->disk->queue;
- struct nbd_sock *nsock;
+ struct nbd_sock *nsock = args->nsock;
struct nbd_cmd *cmd;
struct request *rq;
while (1) {
struct nbd_reply reply;
- if (nbd_read_reply(nbd, args->index, &reply))
+ if (nbd_read_reply(nbd, nsock->sock, &reply))
break;
/*
@@ -869,14 +1017,13 @@ static void recv_work(struct work_struct *work)
percpu_ref_put(&q->q_usage_counter);
}
- nsock = config->socks[args->index];
mutex_lock(&nsock->tx_lock);
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
- nbd_config_put(nbd);
atomic_dec(&config->recv_threads);
wake_up(&config->recv_wq);
+ nbd_config_put(nbd);
kfree(args);
}
@@ -967,26 +1114,28 @@ static int wait_for_reconnect(struct nbd_device *nbd)
return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
}
-static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
+static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_device *nbd = cmd->nbd;
struct nbd_config *config;
struct nbd_sock *nsock;
- int ret;
+ blk_status_t ret;
+
+ lockdep_assert_held(&cmd->lock);
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Socks array is empty\n");
- return -EINVAL;
+ return BLK_STS_IOERR;
}
- config = nbd->config;
if (index >= config->num_connections) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Attempted send on invalid socket\n");
nbd_config_put(nbd);
- return -EINVAL;
+ return BLK_STS_IOERR;
}
cmd->status = BLK_STS_OK;
again:
@@ -1009,7 +1158,7 @@ again:
*/
sock_shutdown(nbd);
nbd_config_put(nbd);
- return -EIO;
+ return BLK_STS_IOERR;
}
goto again;
}
@@ -1022,27 +1171,10 @@ again:
blk_mq_start_request(req);
if (unlikely(nsock->pending && nsock->pending != req)) {
nbd_requeue_cmd(cmd);
- ret = 0;
+ ret = BLK_STS_OK;
goto out;
}
- /*
- * Some failures are related to the link going down, so anything that
- * returns EAGAIN can be retried on a different socket.
- */
ret = nbd_send_cmd(nbd, cmd, index);
- /*
- * Access to this flag is protected by cmd->lock, thus it's safe to set
- * the flag after nbd_send_cmd() succeed to send request to server.
- */
- if (!ret)
- __set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
- else if (ret == -EAGAIN) {
- dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Request send failed, requeueing\n");
- nbd_mark_nsock_dead(nbd, nsock, 1);
- nbd_requeue_cmd(cmd);
- ret = 0;
- }
out:
mutex_unlock(&nsock->tx_lock);
nbd_config_put(nbd);
@@ -1053,7 +1185,7 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
- int ret;
+ blk_status_t ret;
/*
* Since we look at the bio's to send the request over the network we
@@ -1073,10 +1205,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
* appropriate.
*/
ret = nbd_handle_cmd(cmd, hctx->queue_num);
- if (ret < 0)
- ret = BLK_STS_IOERR;
- else if (!ret)
- ret = BLK_STS_OK;
mutex_unlock(&cmd->lock);
return ret;
@@ -1092,6 +1220,14 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
if (!sock)
return NULL;
+ if (!sk_is_tcp(sock->sk) &&
+ !sk_is_stream_unix(sock->sk)) {
+ dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n");
+ *err = -EINVAL;
+ sockfd_put(sock);
+ return NULL;
+ }
+
if (sock->ops->shutdown == sock_no_shutdown) {
dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
*err = -EINVAL;
@@ -1109,8 +1245,12 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
struct socket *sock;
struct nbd_sock **socks;
struct nbd_sock *nsock;
+ unsigned int memflags;
int err;
+ /* Arg will be cast to int, check it to avoid overflow */
+ if (arg > INT_MAX)
+ return -EINVAL;
sock = nbd_get_socket(nbd, arg, &err);
if (!sock)
return err;
@@ -1119,7 +1259,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
* We need to make sure we don't get any errant requests while we're
* reallocating the ->socks array.
*/
- blk_mq_freeze_queue(nbd->disk->queue);
+ memflags = blk_mq_freeze_queue(nbd->disk->queue);
if (!netlink && !nbd->task_setup &&
!test_bit(NBD_RT_BOUND, &config->runtime_flags))
@@ -1157,14 +1297,15 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
nsock->pending = NULL;
nsock->sent = 0;
nsock->cookie = 0;
+ INIT_WORK(&nsock->work, nbd_pending_cmd_work);
socks[config->num_connections++] = nsock;
atomic_inc(&config->live_connections);
- blk_mq_unfreeze_queue(nbd->disk->queue);
+ blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
return 0;
put_socket:
- blk_mq_unfreeze_queue(nbd->disk->queue);
+ blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
sockfd_put(sock);
return err;
}
@@ -1210,6 +1351,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
INIT_WORK(&args->work, recv_work);
args->index = i;
args->nbd = nbd;
+ args->nsock = nsock;
nsock->cookie++;
mutex_unlock(&nsock->tx_lock);
sockfd_put(old);
@@ -1239,19 +1381,10 @@ static void nbd_bdev_reset(struct nbd_device *nbd)
static void nbd_parse_flags(struct nbd_device *nbd)
{
- struct nbd_config *config = nbd->config;
- if (config->flags & NBD_FLAG_READ_ONLY)
+ if (nbd->config->flags & NBD_FLAG_READ_ONLY)
set_disk_ro(nbd->disk, true);
else
set_disk_ro(nbd->disk, false);
- if (config->flags & NBD_FLAG_SEND_FLUSH) {
- if (config->flags & NBD_FLAG_SEND_FUA)
- blk_queue_write_cache(nbd->disk->queue, true, true);
- else
- blk_queue_write_cache(nbd->disk->queue, true, false);
- }
- else
- blk_queue_write_cache(nbd->disk->queue, false, false);
}
static void send_disconnects(struct nbd_device *nbd)
@@ -1328,8 +1461,6 @@ static void nbd_config_put(struct nbd_device *nbd)
nbd->config = NULL;
nbd->tag_set.timeout = 0;
- nbd->disk->queue->limits.discard_granularity = 0;
- blk_queue_max_discard_sectors(nbd->disk->queue, 0);
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
@@ -1353,7 +1484,17 @@ static int nbd_start_device(struct nbd_device *nbd)
return -EINVAL;
}
- blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
+retry:
+ mutex_unlock(&nbd->config_lock);
+ blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections);
+ mutex_lock(&nbd->config_lock);
+
+ /* if another code path updated nr_hw_queues, retry until succeed */
+ if (num_connections != config->num_connections) {
+ num_connections = config->num_connections;
+ goto retry;
+ }
+
nbd->pid = task_pid_nr(current);
nbd_parse_flags(nbd);
@@ -1392,6 +1533,7 @@ static int nbd_start_device(struct nbd_device *nbd)
refcount_inc(&nbd->config_refs);
INIT_WORK(&args->work, recv_work);
args->nbd = nbd;
+ args->nsock = config->socks[i];
args->index = i;
queue_work(nbd->recv_workq, &args->work);
}
@@ -1428,11 +1570,10 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd)
return ret;
}
-static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
- struct block_device *bdev)
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd)
{
nbd_clear_sock(nbd);
- __invalidate_device(bdev, true);
+ disk_force_media_change(nbd->disk);
nbd_bdev_reset(nbd);
if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
&nbd->config->runtime_flags))
@@ -1459,7 +1600,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
case NBD_DISCONNECT:
return nbd_disconnect(nbd);
case NBD_CLEAR_SOCK:
- nbd_clear_sock_ioctl(nbd, bdev);
+ nbd_clear_sock_ioctl(nbd);
return 0;
case NBD_SET_SOCK:
return nbd_add_socket(nbd, arg, false);
@@ -1496,7 +1637,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return -ENOTTY;
}
-static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
+static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nbd_device *nbd = bdev->bd_disk->private_data;
@@ -1526,17 +1667,20 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
return error;
}
-static struct nbd_config *nbd_alloc_config(void)
+static int nbd_alloc_and_init_config(struct nbd_device *nbd)
{
struct nbd_config *config;
+ if (WARN_ON(nbd->config))
+ return -EINVAL;
+
if (!try_module_get(THIS_MODULE))
- return ERR_PTR(-ENODEV);
+ return -ENODEV;
config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
if (!config) {
module_put(THIS_MODULE);
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
atomic_set(&config->recv_threads, 0);
@@ -1544,16 +1688,28 @@ static struct nbd_config *nbd_alloc_config(void)
init_waitqueue_head(&config->conn_wait);
config->blksize_bits = NBD_DEF_BLKSIZE_BITS;
atomic_set(&config->live_connections, 0);
- return config;
+
+ nbd->config = config;
+ /*
+ * Order refcount_set(&nbd->config_refs, 1) and nbd->config assignment,
+ * its pair is the barrier in nbd_get_config_unlocked().
+ * So nbd_get_config_unlocked() won't see nbd->config as null after
+ * refcount_inc_not_zero() succeed.
+ */
+ smp_mb__before_atomic();
+ refcount_set(&nbd->config_refs, 1);
+
+ return 0;
}
-static int nbd_open(struct block_device *bdev, fmode_t mode)
+static int nbd_open(struct gendisk *disk, blk_mode_t mode)
{
struct nbd_device *nbd;
+ struct nbd_config *config;
int ret = 0;
mutex_lock(&nbd_index_mutex);
- nbd = bdev->bd_disk->private_data;
+ nbd = disk->private_data;
if (!nbd) {
ret = -ENXIO;
goto out;
@@ -1562,36 +1718,34 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
ret = -ENXIO;
goto out;
}
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
- struct nbd_config *config;
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
mutex_lock(&nbd->config_lock);
if (refcount_inc_not_zero(&nbd->config_refs)) {
mutex_unlock(&nbd->config_lock);
goto out;
}
- config = nbd_alloc_config();
- if (IS_ERR(config)) {
- ret = PTR_ERR(config);
+ ret = nbd_alloc_and_init_config(nbd);
+ if (ret) {
mutex_unlock(&nbd->config_lock);
goto out;
}
- nbd->config = config;
- refcount_set(&nbd->config_refs, 1);
+
refcount_inc(&nbd->refs);
mutex_unlock(&nbd->config_lock);
if (max_part)
- set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
- } else if (nbd_disconnected(nbd->config)) {
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ } else if (nbd_disconnected(config)) {
if (max_part)
- set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
}
out:
mutex_unlock(&nbd_index_mutex);
return ret;
}
-static void nbd_release(struct gendisk *disk, fmode_t mode)
+static void nbd_release(struct gendisk *disk)
{
struct nbd_device *nbd = disk->private_data;
@@ -1603,6 +1757,13 @@ static void nbd_release(struct gendisk *disk, fmode_t mode)
nbd_put(nbd);
}
+static void nbd_free_disk(struct gendisk *disk)
+{
+ struct nbd_device *nbd = disk->private_data;
+
+ kfree(nbd);
+}
+
static const struct block_device_operations nbd_fops =
{
.owner = THIS_MODULE,
@@ -1610,6 +1771,7 @@ static const struct block_device_operations nbd_fops =
.release = nbd_release,
.ioctl = nbd_ioctl,
.compat_ioctl = nbd_ioctl,
+ .free_disk = nbd_free_disk,
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -1645,6 +1807,10 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
seq_puts(s, "NBD_FLAG_SEND_FUA\n");
if (flags & NBD_FLAG_SEND_TRIM)
seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
+ if (flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n");
+ if (flags & NBD_FLAG_ROTATIONAL)
+ seq_puts(s, "NBD_FLAG_ROTATIONAL\n");
return 0;
}
@@ -1660,7 +1826,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
return -EIO;
dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
- if (!dir) {
+ if (IS_ERR(dir)) {
dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
nbd_name(nbd));
return -EIO;
@@ -1686,7 +1852,7 @@ static int nbd_dbg_init(void)
struct dentry *dbg_dir;
dbg_dir = debugfs_create_dir("nbd", NULL);
- if (!dbg_dir)
+ if (IS_ERR(dbg_dir))
return -EIO;
nbd_dbg_dir = dbg_dir;
@@ -1740,6 +1906,12 @@ static const struct blk_mq_ops nbd_mq_ops = {
static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
{
+ struct queue_limits lim = {
+ .max_hw_sectors = 65536,
+ .io_opt = 256 << SECTOR_SHIFT,
+ .max_segments = USHRT_MAX,
+ .max_segment_size = UINT_MAX,
+ };
struct nbd_device *nbd;
struct gendisk *disk;
int err = -ENOMEM;
@@ -1753,8 +1925,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
nbd->tag_set.queue_depth = 128;
nbd->tag_set.numa_node = NUMA_NO_NODE;
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
- nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
- BLK_MQ_F_BLOCKING;
+ nbd->tag_set.flags = BLK_MQ_F_BLOCKING;
nbd->tag_set.driver_data = nbd;
INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
nbd->backend = NULL;
@@ -1770,7 +1941,8 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
if (err == -ENOSPC)
err = -EEXIST;
} else {
- err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
+ err = idr_alloc(&nbd_index_idr, nbd, 0,
+ (MINORMASK >> part_shift) + 1, GFP_KERNEL);
if (err >= 0)
index = err;
}
@@ -1779,7 +1951,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
if (err < 0)
goto out_free_tags;
- disk = blk_mq_alloc_disk(&nbd->tag_set, NULL);
+ disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_free_idr;
@@ -1795,18 +1967,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
goto out_err_disk;
}
- /*
- * Tell the block layer that we are not a rotational device
- */
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
- disk->queue->limits.discard_granularity = 0;
- blk_queue_max_discard_sectors(disk->queue, 0);
- blk_queue_max_segment_size(disk->queue, UINT_MAX);
- blk_queue_max_segments(disk->queue, USHRT_MAX);
- blk_queue_max_hw_sectors(disk->queue, 65536);
- disk->queue->limits.max_sectors = 256;
-
mutex_init(&nbd->config_lock);
refcount_set(&nbd->config_refs, 0);
/*
@@ -1934,11 +2094,11 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
}
- if (!info->attrs[NBD_ATTR_SOCKETS]) {
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SOCKETS)) {
pr_err("must specify at least one socket\n");
return -EINVAL;
}
- if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SIZE_BYTES)) {
pr_err("must specify a size in bytes for the device\n");
return -EINVAL;
}
@@ -1978,22 +2138,17 @@ again:
pr_err("nbd%d already in use\n", index);
return -EBUSY;
}
- if (WARN_ON(nbd->config)) {
- mutex_unlock(&nbd->config_lock);
- nbd_put(nbd);
- return -EINVAL;
- }
- config = nbd_alloc_config();
- if (IS_ERR(config)) {
+
+ ret = nbd_alloc_and_init_config(nbd);
+ if (ret) {
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
pr_err("couldn't allocate config\n");
- return PTR_ERR(config);
+ return ret;
}
- nbd->config = config;
- refcount_set(&nbd->config_refs, 1);
- set_bit(NBD_RT_BOUND, &config->runtime_flags);
+ config = nbd->config;
+ set_bit(NBD_RT_BOUND, &config->runtime_flags);
ret = nbd_genl_size_set(info, nbd);
if (ret)
goto out;
@@ -2064,9 +2219,7 @@ again:
goto out;
}
}
- ret = nbd_start_device(nbd);
- if (ret)
- goto out;
+
if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
GFP_KERNEL);
@@ -2082,13 +2235,16 @@ again:
goto out;
}
set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
+
+ ret = nbd_start_device(nbd);
out:
- mutex_unlock(&nbd->config_lock);
if (!ret) {
set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
refcount_inc(&nbd->config_refs);
nbd_connect_reply(info, nbd->index);
}
+ mutex_unlock(&nbd->config_lock);
+
nbd_config_put(nbd);
if (put_dev)
nbd_put(nbd);
@@ -2108,6 +2264,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
flush_workqueue(nbd->recv_workq);
nbd_clear_que(nbd);
nbd->task_setup = NULL;
+ clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags);
mutex_unlock(&nbd->config_lock);
if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
@@ -2123,7 +2280,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (!info->attrs[NBD_ATTR_INDEX]) {
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
pr_err("must specify an index to disconnect\n");
return -EINVAL;
}
@@ -2161,7 +2318,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
if (!netlink_capable(skb, CAP_SYS_ADMIN))
return -EPERM;
- if (!info->attrs[NBD_ATTR_INDEX]) {
+ if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) {
pr_err("must specify a device to reconfigure\n");
return -EINVAL;
}
@@ -2196,7 +2353,8 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
}
mutex_unlock(&nbd_index_mutex);
- if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ config = nbd_get_config_unlocked(nbd);
+ if (!config) {
dev_err(nbd_to_dev(nbd),
"not configured, cannot reconfigure\n");
nbd_put(nbd);
@@ -2204,7 +2362,6 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
}
mutex_lock(&nbd->config_lock);
- config = nbd->config;
if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
!nbd->pid) {
dev_err(nbd_to_dev(nbd),
@@ -2325,10 +2482,12 @@ static struct genl_family nbd_genl_family __ro_after_init = {
.n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops),
.resv_start_op = NBD_CMD_STATUS + 1,
.maxattr = NBD_ATTR_MAX,
+ .netnsok = 1,
.policy = nbd_attr_policy,
.mcgrps = nbd_mcast_grps,
.n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
};
+MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME);
static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
{
@@ -2394,6 +2553,12 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
}
dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
+ if (!dev_list) {
+ nlmsg_free(reply);
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
if (index == -1) {
ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
if (ret) {