diff options
Diffstat (limited to 'drivers/scsi/storvsc_drv.c')
| -rw-r--r-- | drivers/scsi/storvsc_drv.c | 1920 |
1 files changed, 1140 insertions, 780 deletions
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 83ec1aa85964..6e4112143c76 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2009, Microsoft Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * * Authors: * Haiyang Zhang <haiyangz@microsoft.com> * Hank Janssen <hjanssen@microsoft.com> @@ -32,7 +20,9 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/hyperv.h> -#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/dma-mapping.h> + #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> @@ -41,6 +31,8 @@ #include <scsi/scsi_eh.h> #include <scsi/scsi_devinfo.h> #include <scsi/scsi_dbg.h> +#include <scsi/scsi_transport_fc.h> +#include <scsi/scsi_transport.h> /* * All wire protocol details (storage protocol between the guest and the host) @@ -56,15 +48,20 @@ * V1 RC > 2008/1/31: 2.0 * Win7: 4.2 * Win8: 5.1 + * Win8.1: 6.0 + * Win10: 6.2 */ +#define VMSTOR_PROTO_VERSION(MAJOR_, MINOR_) ((((MAJOR_) & 0xff) << 8) | \ + (((MINOR_) & 0xff))) +#define VMSTOR_PROTO_VERSION_WIN6 VMSTOR_PROTO_VERSION(2, 0) +#define VMSTOR_PROTO_VERSION_WIN7 VMSTOR_PROTO_VERSION(4, 2) +#define VMSTOR_PROTO_VERSION_WIN8 VMSTOR_PROTO_VERSION(5, 1) +#define VMSTOR_PROTO_VERSION_WIN8_1 VMSTOR_PROTO_VERSION(6, 0) +#define VMSTOR_PROTO_VERSION_WIN10 VMSTOR_PROTO_VERSION(6, 2) -#define VMSTOR_WIN7_MAJOR 4 -#define VMSTOR_WIN7_MINOR 2 - -#define VMSTOR_WIN8_MAJOR 5 -#define VMSTOR_WIN8_MINOR 1 - +/* channel callback timeout in ms */ +#define CALLBACK_TIMEOUT 2 /* Packet structure describing virtual storage requests. */ enum vstor_packet_operation { @@ -89,9 +86,8 @@ enum vstor_packet_operation { */ struct hv_fc_wwn_packet { - bool primary_active; - u8 reserved1; - u8 reserved2; + u8 primary_active; + u8 reserved1[3]; u8 primary_port_wwn[8]; u8 primary_node_wwn[8]; u8 secondary_port_wwn[8]; @@ -132,6 +128,8 @@ struct hv_fc_wwn_packet { #define SRB_FLAGS_PORT_DRIVER_RESERVED 0x0F000000 #define SRB_FLAGS_CLASS_DRIVER_RESERVED 0xF0000000 +#define SP_UNTAGGED ((unsigned char) ~0) +#define SRB_SIMPLE_TAG_REQUEST 0x20 /* * Platform neutral description of a scsi request - @@ -140,41 +138,44 @@ struct hv_fc_wwn_packet { */ #define STORVSC_MAX_CMD_LEN 0x10 -#define POST_WIN7_STORVSC_SENSE_BUFFER_SIZE 0x14 -#define PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE 0x12 - +/* Sense buffer size is the same for all versions since Windows 8 */ #define STORVSC_SENSE_BUFFER_SIZE 0x14 #define STORVSC_MAX_BUF_LEN_WITH_PADDING 0x14 /* - * Sense buffer size changed in win8; have a run-time - * variable to track the size we should use. - */ -static int sense_buffer_size; + * The storage protocol version is determined during the + * initial exchange with the host. It will indicate which + * storage functionality is available in the host. +*/ +static int vmstor_proto_version; -/* - * The size of the vmscsi_request has changed in win8. The - * additional size is because of new elements added to the - * structure. These elements are valid only when we are talking - * to a win8 host. - * Track the correction to size we need to apply. - */ +static bool hv_dev_is_fc(struct hv_device *hv_dev); -static int vmscsi_size_delta; -static int vmstor_current_major; -static int vmstor_current_minor; +#define STORVSC_LOGGING_NONE 0 +#define STORVSC_LOGGING_ERROR 1 +#define STORVSC_LOGGING_WARN 2 -struct vmscsi_win8_extension { - /* - * The following were added in Windows 8 - */ - u16 reserve; - u8 queue_tag; - u8 queue_action; - u32 srb_flags; - u32 time_out_value; - u32 queue_sort_ey; -} __packed; +static int logging_level = STORVSC_LOGGING_ERROR; +module_param(logging_level, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(logging_level, + "Logging level, 0 - None, 1 - Error (default), 2 - Warning."); + +static inline bool do_logging(int level) +{ + return logging_level >= level; +} + +#define storvsc_log(dev, level, fmt, ...) \ +do { \ + if (do_logging(level)) \ + dev_warn(&(dev)->device, fmt, ##__VA_ARGS__); \ +} while (0) + +#define storvsc_log_ratelimited(dev, level, fmt, ...) \ +do { \ + if (do_logging(level)) \ + dev_warn_ratelimited(&(dev)->device, fmt, ##__VA_ARGS__); \ +} while (0) struct vmscsi_request { u16 length; @@ -201,13 +202,28 @@ struct vmscsi_request { /* * The following was added in win8. */ - struct vmscsi_win8_extension win8_extension; + u16 reserve; + u8 queue_tag; + u8 queue_action; + u32 srb_flags; + u32 time_out_value; + u32 queue_sort_ey; } __attribute((packed)); +/* + * The list of windows version in order of preference. + */ + +static const int protocol_version[] = { + VMSTOR_PROTO_VERSION_WIN10, + VMSTOR_PROTO_VERSION_WIN8_1, + VMSTOR_PROTO_VERSION_WIN8, +}; + /* - * This structure is sent during the intialization phase to get the different + * This structure is sent during the initialization phase to get the different * properties of the channel. */ @@ -295,67 +311,97 @@ enum storvsc_request_type { }; /* - * SRB status codes and masks; a subset of the codes used here. + * SRB status codes and masks. In the 8-bit field, the two high order bits + * are flags, while the remaining 6 bits are an integer status code. The + * definitions here include only the subset of the integer status codes that + * are tested for in this driver. */ - #define SRB_STATUS_AUTOSENSE_VALID 0x80 -#define SRB_STATUS_INVALID_LUN 0x20 -#define SRB_STATUS_SUCCESS 0x01 -#define SRB_STATUS_ABORTED 0x02 -#define SRB_STATUS_ERROR 0x04 - +#define SRB_STATUS_QUEUE_FROZEN 0x40 + +/* SRB status integer codes */ +#define SRB_STATUS_SUCCESS 0x01 +#define SRB_STATUS_ABORTED 0x02 +#define SRB_STATUS_ERROR 0x04 +#define SRB_STATUS_INVALID_REQUEST 0x06 +#define SRB_STATUS_TIMEOUT 0x09 +#define SRB_STATUS_SELECTION_TIMEOUT 0x0A +#define SRB_STATUS_BUS_RESET 0x0E +#define SRB_STATUS_DATA_OVERRUN 0x12 +#define SRB_STATUS_INVALID_LUN 0x20 +#define SRB_STATUS_INTERNAL_ERROR 0x30 + +#define SRB_STATUS(status) \ + (status & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN)) /* * This is the end of Protocol specific defines. */ +static int storvsc_ringbuffer_size = (128 * 1024); +static int aligned_ringbuffer_size; +static u32 max_outstanding_req_per_channel; +static int storvsc_change_queue_depth(struct scsi_device *sdev, int queue_depth); -/* - * We setup a mempool to allocate request structures for this driver - * on a per-lun basis. The following define specifies the number of - * elements in the pool. - */ - -#define STORVSC_MIN_BUF_NR 64 -static int storvsc_ringbuffer_size = (20 * PAGE_SIZE); +static int storvsc_vcpus_per_sub_channel = 4; +static unsigned int storvsc_max_hw_queues; module_param(storvsc_ringbuffer_size, int, S_IRUGO); MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)"); +module_param(storvsc_max_hw_queues, uint, 0644); +MODULE_PARM_DESC(storvsc_max_hw_queues, "Maximum number of hardware queues"); + +module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO); +MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels"); + +static int ring_avail_percent_lowater = 10; +module_param(ring_avail_percent_lowater, int, S_IRUGO); +MODULE_PARM_DESC(ring_avail_percent_lowater, + "Select a channel if available ring size > this in percent"); + /* * Timeout in seconds for all devices managed by this driver. */ -static int storvsc_timeout = 180; +static const int storvsc_timeout = 180; -#define STORVSC_MAX_IO_REQUESTS 200 +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) +static struct scsi_transport_template *fc_transport_template; +#endif +static struct scsi_host_template scsi_driver; static void storvsc_on_channel_callback(void *context); -/* - * In Hyper-V, each port/path/target maps to 1 scsi host adapter. In - * reality, the path/target is not used (ie always set to 0) so our - * scsi host adapter essentially has 1 bus with 1 target that contains - * up to 256 luns. - */ -#define STORVSC_MAX_LUNS_PER_TARGET 64 -#define STORVSC_MAX_TARGETS 1 -#define STORVSC_MAX_CHANNELS 1 +#define STORVSC_MAX_LUNS_PER_TARGET 255 +#define STORVSC_MAX_TARGETS 2 +#define STORVSC_MAX_CHANNELS 8 +#define STORVSC_FC_MAX_LUNS_PER_TARGET 255 +#define STORVSC_FC_MAX_TARGETS 128 +#define STORVSC_FC_MAX_CHANNELS 8 +#define STORVSC_FC_MAX_XFER_SIZE ((u32)(512 * 1024)) +#define STORVSC_IDE_MAX_LUNS_PER_TARGET 64 +#define STORVSC_IDE_MAX_TARGETS 1 +#define STORVSC_IDE_MAX_CHANNELS 1 + +/* + * Upper bound on the size of a storvsc packet. + */ +#define STORVSC_MAX_PKT_SIZE (sizeof(struct vmpacket_descriptor) +\ + sizeof(struct vstor_packet)) struct storvsc_cmd_request { - struct list_head entry; struct scsi_cmnd *cmd; - unsigned int bounce_sgl_count; - struct scatterlist *bounce_sgl; - struct hv_device *device; /* Synchronize the request/response if needed */ struct completion wait_event; - unsigned char *sense_buffer; - struct hv_multipage_buffer data_buffer; + struct vmbus_channel_packet_multipage_buffer mpb; + struct vmbus_packet_mpb_array *payload; + u32 payload_sz; + struct vstor_packet vstor_packet; }; @@ -366,7 +412,6 @@ struct storvsc_device { bool destroy; bool drain_notify; - bool open_sub_channel; atomic_t num_outstanding_req; struct Scsi_Host *host; @@ -381,14 +426,35 @@ struct storvsc_device { unsigned char path_id; unsigned char target_id; + /* + * Max I/O, the device can support. + */ + u32 max_transfer_bytes; + /* + * Number of sub-channels we will open. + */ + u16 num_sc; + struct vmbus_channel **stor_chns; + /* + * Mask of CPUs bound to subchannels. + */ + struct cpumask alloced_cpus; + /* + * Serializes modifications of stor_chns[] from storvsc_do_io() + * and storvsc_change_target_cpu(). + */ + spinlock_t lock; /* Used for vsc/vsp channel reset process */ struct storvsc_cmd_request init_request; struct storvsc_cmd_request reset_request; -}; - -struct stor_mem_pools { - struct kmem_cache *request_pool; - mempool_t *request_mempool; + /* + * Currently active port and node names for FC devices. + */ + u64 node_name; + u64 port_name; +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + struct fc_rport *rport; +#endif }; struct hv_host_device { @@ -396,49 +462,62 @@ struct hv_host_device { unsigned int port; unsigned char path; unsigned char target; + struct workqueue_struct *handle_error_wq; + struct work_struct host_scan_work; + struct Scsi_Host *host; }; struct storvsc_scan_work { struct work_struct work; struct Scsi_Host *host; - uint lun; + u8 lun; + u8 tgt_id; }; static void storvsc_device_scan(struct work_struct *work) { struct storvsc_scan_work *wrk; - uint lun; struct scsi_device *sdev; wrk = container_of(work, struct storvsc_scan_work, work); - lun = wrk->lun; - sdev = scsi_device_lookup(wrk->host, 0, 0, lun); + sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun); if (!sdev) goto done; - scsi_rescan_device(&sdev->sdev_gendev); + scsi_rescan_device(sdev); scsi_device_put(sdev); done: kfree(wrk); } -static void storvsc_bus_scan(struct work_struct *work) +static void storvsc_host_scan(struct work_struct *work) { - struct storvsc_scan_work *wrk; - int id, order_id; - - wrk = container_of(work, struct storvsc_scan_work, work); - for (id = 0; id < wrk->host->max_id; ++id) { - if (wrk->host->reverse_ordering) - order_id = wrk->host->max_id - id - 1; - else - order_id = id; + struct Scsi_Host *host; + struct scsi_device *sdev; + struct hv_host_device *host_device = + container_of(work, struct hv_host_device, host_scan_work); - scsi_scan_target(&wrk->host->shost_gendev, 0, - order_id, SCAN_WILD_CARD, 1); - } - kfree(wrk); + host = host_device->host; + /* + * Before scanning the host, first check to see if any of the + * currently known devices have been hot removed. We issue a + * "unit ready" command against all currently known devices. + * This I/O will result in an error for devices that have been + * removed. As part of handling the I/O error, we remove the device. + * + * When a LUN is added or removed, the host sends us a signal to + * scan the host. Thus we are forced to discover the LUNs that + * may have been removed this way. + */ + mutex_lock(&host->scan_mutex); + shost_for_each_device(sdev, host) + scsi_test_unit_ready(sdev, 1, 1, NULL); + mutex_unlock(&host->scan_mutex); + /* + * Now scan the host to discover LUNs that may have been added. + */ + scsi_scan_host(host); } static void storvsc_remove_lun(struct work_struct *work) @@ -450,7 +529,7 @@ static void storvsc_remove_lun(struct work_struct *work) if (!scsi_host_get(wrk->host)) goto done; - sdev = scsi_device_lookup(wrk->host, 0, 0, wrk->lun); + sdev = scsi_device_lookup(wrk->host, 0, wrk->tgt_id, wrk->lun); if (sdev) { scsi_remove_device(sdev); @@ -462,18 +541,6 @@ done: kfree(wrk); } -/* - * Major/minor macros. Minor version is in LSB, meaning that earlier flat - * version numbers will be interpreted as "0.x" (i.e., 1 becomes 0.1). - */ - -static inline u16 storvsc_get_version(u8 major, u8 minor) -{ - u16 version; - - version = ((major << 8) | minor); - return version; -} /* * We can get incoming messages from the host that are not in response to @@ -536,284 +603,152 @@ get_in_err: } -static void destroy_bounce_buffer(struct scatterlist *sgl, - unsigned int sg_count) +static void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old, + u32 new) { - int i; - struct page *page_buf; + struct storvsc_device *stor_device; + struct vmbus_channel *cur_chn; + bool old_is_alloced = false; + struct hv_device *device; + unsigned long flags; + int cpu; - for (i = 0; i < sg_count; i++) { - page_buf = sg_page((&sgl[i])); - if (page_buf != NULL) - __free_page(page_buf); - } + device = channel->primary_channel ? + channel->primary_channel->device_obj + : channel->device_obj; + stor_device = get_out_stor_device(device); + if (!stor_device) + return; - kfree(sgl); -} + /* See storvsc_do_io() -> get_og_chn(). */ + spin_lock_irqsave(&stor_device->lock, flags); -static int do_bounce_buffer(struct scatterlist *sgl, unsigned int sg_count) -{ - int i; - - /* No need to check */ - if (sg_count < 2) - return -1; - - /* We have at least 2 sg entries */ - for (i = 0; i < sg_count; i++) { - if (i == 0) { - /* make sure 1st one does not have hole */ - if (sgl[i].offset + sgl[i].length != PAGE_SIZE) - return i; - } else if (i == sg_count - 1) { - /* make sure last one does not have hole */ - if (sgl[i].offset != 0) - return i; - } else { - /* make sure no hole in the middle */ - if (sgl[i].length != PAGE_SIZE || sgl[i].offset != 0) - return i; - } + /* + * Determines if the storvsc device has other channels assigned to + * the "old" CPU to update the alloced_cpus mask and the stor_chns + * array. + */ + if (device->channel != channel && device->channel->target_cpu == old) { + cur_chn = device->channel; + old_is_alloced = true; + goto old_is_alloced; } - return -1; -} - -static struct scatterlist *create_bounce_buffer(struct scatterlist *sgl, - unsigned int sg_count, - unsigned int len, - int write) -{ - int i; - int num_pages; - struct scatterlist *bounce_sgl; - struct page *page_buf; - unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE); - - num_pages = ALIGN(len, PAGE_SIZE) >> PAGE_SHIFT; - - bounce_sgl = kcalloc(num_pages, sizeof(struct scatterlist), GFP_ATOMIC); - if (!bounce_sgl) - return NULL; - - sg_init_table(bounce_sgl, num_pages); - for (i = 0; i < num_pages; i++) { - page_buf = alloc_page(GFP_ATOMIC); - if (!page_buf) - goto cleanup; - sg_set_page(&bounce_sgl[i], page_buf, buf_len, 0); + list_for_each_entry(cur_chn, &device->channel->sc_list, sc_list) { + if (cur_chn == channel) + continue; + if (cur_chn->target_cpu == old) { + old_is_alloced = true; + goto old_is_alloced; + } } - return bounce_sgl; - -cleanup: - destroy_bounce_buffer(bounce_sgl, num_pages); - return NULL; -} - -/* Disgusting wrapper functions */ -static inline unsigned long sg_kmap_atomic(struct scatterlist *sgl, int idx) -{ - void *addr = kmap_atomic(sg_page(sgl + idx)); - return (unsigned long)addr; -} - -static inline void sg_kunmap_atomic(unsigned long addr) -{ - kunmap_atomic((void *)addr); -} - - -/* Assume the original sgl has enough room */ -static unsigned int copy_from_bounce_buffer(struct scatterlist *orig_sgl, - struct scatterlist *bounce_sgl, - unsigned int orig_sgl_count, - unsigned int bounce_sgl_count) -{ - int i; - int j = 0; - unsigned long src, dest; - unsigned int srclen, destlen, copylen; - unsigned int total_copied = 0; - unsigned long bounce_addr = 0; - unsigned long dest_addr = 0; - unsigned long flags; - - local_irq_save(flags); - - for (i = 0; i < orig_sgl_count; i++) { - dest_addr = sg_kmap_atomic(orig_sgl,i) + orig_sgl[i].offset; - dest = dest_addr; - destlen = orig_sgl[i].length; - - if (bounce_addr == 0) - bounce_addr = sg_kmap_atomic(bounce_sgl,j); - - while (destlen) { - src = bounce_addr + bounce_sgl[j].offset; - srclen = bounce_sgl[j].length - bounce_sgl[j].offset; - - copylen = min(srclen, destlen); - memcpy((void *)dest, (void *)src, copylen); - - total_copied += copylen; - bounce_sgl[j].offset += copylen; - destlen -= copylen; - dest += copylen; - - if (bounce_sgl[j].offset == bounce_sgl[j].length) { - /* full */ - sg_kunmap_atomic(bounce_addr); - j++; - - /* - * It is possible that the number of elements - * in the bounce buffer may not be equal to - * the number of elements in the original - * scatter list. Handle this correctly. - */ - - if (j == bounce_sgl_count) { - /* - * We are done; cleanup and return. - */ - sg_kunmap_atomic(dest_addr - orig_sgl[i].offset); - local_irq_restore(flags); - return total_copied; - } - - /* if we need to use another bounce buffer */ - if (destlen || i != orig_sgl_count - 1) - bounce_addr = sg_kmap_atomic(bounce_sgl,j); - } else if (destlen == 0 && i == orig_sgl_count - 1) { - /* unmap the last bounce that is < PAGE_SIZE */ - sg_kunmap_atomic(bounce_addr); - } - } +old_is_alloced: + if (old_is_alloced) + WRITE_ONCE(stor_device->stor_chns[old], cur_chn); + else + cpumask_clear_cpu(old, &stor_device->alloced_cpus); - sg_kunmap_atomic(dest_addr - orig_sgl[i].offset); + /* "Flush" the stor_chns array. */ + for_each_possible_cpu(cpu) { + if (stor_device->stor_chns[cpu] && !cpumask_test_cpu( + cpu, &stor_device->alloced_cpus)) + WRITE_ONCE(stor_device->stor_chns[cpu], NULL); } - local_irq_restore(flags); + WRITE_ONCE(stor_device->stor_chns[new], channel); + cpumask_set_cpu(new, &stor_device->alloced_cpus); - return total_copied; + spin_unlock_irqrestore(&stor_device->lock, flags); } -/* Assume the bounce_sgl has enough room ie using the create_bounce_buffer() */ -static unsigned int copy_to_bounce_buffer(struct scatterlist *orig_sgl, - struct scatterlist *bounce_sgl, - unsigned int orig_sgl_count) +static u64 storvsc_next_request_id(struct vmbus_channel *channel, u64 rqst_addr) { - int i; - int j = 0; - unsigned long src, dest; - unsigned int srclen, destlen, copylen; - unsigned int total_copied = 0; - unsigned long bounce_addr = 0; - unsigned long src_addr = 0; - unsigned long flags; - - local_irq_save(flags); - - for (i = 0; i < orig_sgl_count; i++) { - src_addr = sg_kmap_atomic(orig_sgl,i) + orig_sgl[i].offset; - src = src_addr; - srclen = orig_sgl[i].length; - - if (bounce_addr == 0) - bounce_addr = sg_kmap_atomic(bounce_sgl,j); - - while (srclen) { - /* assume bounce offset always == 0 */ - dest = bounce_addr + bounce_sgl[j].length; - destlen = PAGE_SIZE - bounce_sgl[j].length; - - copylen = min(srclen, destlen); - memcpy((void *)dest, (void *)src, copylen); - - total_copied += copylen; - bounce_sgl[j].length += copylen; - srclen -= copylen; - src += copylen; + struct storvsc_cmd_request *request = + (struct storvsc_cmd_request *)(unsigned long)rqst_addr; - if (bounce_sgl[j].length == PAGE_SIZE) { - /* full..move to next entry */ - sg_kunmap_atomic(bounce_addr); - j++; + if (rqst_addr == VMBUS_RQST_INIT) + return VMBUS_RQST_INIT; + if (rqst_addr == VMBUS_RQST_RESET) + return VMBUS_RQST_RESET; - /* if we need to use another bounce buffer */ - if (srclen || i != orig_sgl_count - 1) - bounce_addr = sg_kmap_atomic(bounce_sgl,j); - - } else if (srclen == 0 && i == orig_sgl_count - 1) { - /* unmap the last bounce that is < PAGE_SIZE */ - sg_kunmap_atomic(bounce_addr); - } - } - - sg_kunmap_atomic(src_addr - orig_sgl[i].offset); - } - - local_irq_restore(flags); - - return total_copied; + /* + * Cannot return an ID of 0, which is reserved for an unsolicited + * message from Hyper-V. + */ + return (u64)blk_mq_unique_tag(scsi_cmd_to_rq(request->cmd)) + 1; } static void handle_sc_creation(struct vmbus_channel *new_sc) { struct hv_device *device = new_sc->primary_channel->device_obj; + struct device *dev = &device->device; struct storvsc_device *stor_device; struct vmstorage_channel_properties props; + int ret; stor_device = get_out_stor_device(device); if (!stor_device) return; - if (stor_device->open_sub_channel == false) + memset(&props, 0, sizeof(struct vmstorage_channel_properties)); + new_sc->max_pkt_size = STORVSC_MAX_PKT_SIZE; + + new_sc->next_request_id_callback = storvsc_next_request_id; + + ret = vmbus_open(new_sc, + aligned_ringbuffer_size, + aligned_ringbuffer_size, + (void *)&props, + sizeof(struct vmstorage_channel_properties), + storvsc_on_channel_callback, new_sc); + + /* In case vmbus_open() fails, we don't use the sub-channel. */ + if (ret != 0) { + dev_err(dev, "Failed to open sub-channel: err=%d\n", ret); return; + } - memset(&props, 0, sizeof(struct vmstorage_channel_properties)); + new_sc->change_target_cpu_callback = storvsc_change_target_cpu; - vmbus_open(new_sc, - storvsc_ringbuffer_size, - storvsc_ringbuffer_size, - (void *)&props, - sizeof(struct vmstorage_channel_properties), - storvsc_on_channel_callback, new_sc); + /* Add the sub-channel to the array of available channels. */ + stor_device->stor_chns[new_sc->target_cpu] = new_sc; + cpumask_set_cpu(new_sc->target_cpu, &stor_device->alloced_cpus); } static void handle_multichannel_storage(struct hv_device *device, int max_chns) { + struct device *dev = &device->device; struct storvsc_device *stor_device; - int num_cpus = num_online_cpus(); int num_sc; struct storvsc_cmd_request *request; struct vstor_packet *vstor_packet; int ret, t; - num_sc = ((max_chns > num_cpus) ? num_cpus : max_chns); + /* + * If the number of CPUs is artificially restricted, such as + * with maxcpus=1 on the kernel boot line, Hyper-V could offer + * sub-channels >= the number of CPUs. These sub-channels + * should not be created. The primary channel is already created + * and assigned to one CPU, so check against # CPUs - 1. + */ + num_sc = min((int)(num_online_cpus() - 1), max_chns); + if (!num_sc) + return; + stor_device = get_out_stor_device(device); if (!stor_device) return; + stor_device->num_sc = num_sc; request = &stor_device->init_request; vstor_packet = &request->vstor_packet; - stor_device->open_sub_channel = true; /* * Establish a handler for dealing with subchannels. */ vmbus_set_sc_create_callback(device->channel, handle_sc_creation); /* - * Check to see if sub-channels have already been created. This - * can happen when this driver is re-loaded after unloading. - */ - - if (vmbus_are_subchannels_present(device->channel)) - return; - - stor_device->open_sub_channel = false; - /* * Request the host to create sub-channels. */ memset(request, 0, sizeof(struct storvsc_cmd_request)); @@ -823,132 +758,161 @@ static void handle_multichannel_storage(struct hv_device *device, int max_chns) vstor_packet->sub_channel_count = num_sc; ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), - (unsigned long)request, + sizeof(struct vstor_packet), + VMBUS_RQST_INIT, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) + if (ret != 0) { + dev_err(dev, "Failed to create sub-channel: err=%d\n", ret); return; + } - t = wait_for_completion_timeout(&request->wait_event, 10*HZ); - if (t == 0) + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); + if (t == 0) { + dev_err(dev, "Failed to create sub-channel: timed out\n"); return; + } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETE_IO || - vstor_packet->status != 0) + vstor_packet->status != 0) { + dev_err(dev, "Failed to create sub-channel: op=%d, host=0x%x\n", + vstor_packet->operation, vstor_packet->status); return; + } /* - * Now that we created the sub-channels, invoke the check; this - * may trigger the callback. + * We need to do nothing here, because vmbus_process_offer() + * invokes channel->sc_creation_callback, which will open and use + * the sub-channel(s). */ - stor_device->open_sub_channel = true; - vmbus_are_subchannels_present(device->channel); } -static int storvsc_channel_init(struct hv_device *device) +static void cache_wwn(struct storvsc_device *stor_device, + struct vstor_packet *vstor_packet) +{ + /* + * Cache the currently active port and node ww names. + */ + if (vstor_packet->wwn_packet.primary_active) { + stor_device->node_name = + wwn_to_u64(vstor_packet->wwn_packet.primary_node_wwn); + stor_device->port_name = + wwn_to_u64(vstor_packet->wwn_packet.primary_port_wwn); + } else { + stor_device->node_name = + wwn_to_u64(vstor_packet->wwn_packet.secondary_node_wwn); + stor_device->port_name = + wwn_to_u64(vstor_packet->wwn_packet.secondary_port_wwn); + } +} + + +static int storvsc_execute_vstor_op(struct hv_device *device, + struct storvsc_cmd_request *request, + bool status_check) { struct storvsc_device *stor_device; - struct storvsc_cmd_request *request; struct vstor_packet *vstor_packet; int ret, t; - int max_chns; - bool process_sub_channels = false; stor_device = get_out_stor_device(device); if (!stor_device) return -ENODEV; - request = &stor_device->init_request; vstor_packet = &request->vstor_packet; - /* - * Now, initiate the vsc/vsp initialization protocol on the open - * channel - */ - memset(request, 0, sizeof(struct storvsc_cmd_request)); init_completion(&request->wait_event); - vstor_packet->operation = VSTOR_OPERATION_BEGIN_INITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), - (unsigned long)request, + sizeof(struct vstor_packet), + VMBUS_RQST_INIT, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret != 0) - goto cleanup; + return ret; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); - if (t == 0) { - ret = -ETIMEDOUT; - goto cleanup; - } + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); + if (t == 0) + return -ETIMEDOUT; + + if (!status_check) + return ret; if (vstor_packet->operation != VSTOR_OPERATION_COMPLETE_IO || vstor_packet->status != 0) - goto cleanup; + return -EINVAL; + return ret; +} - /* reuse the packet for version range supported */ - memset(vstor_packet, 0, sizeof(struct vstor_packet)); - vstor_packet->operation = VSTOR_OPERATION_QUERY_PROTOCOL_VERSION; - vstor_packet->flags = REQUEST_COMPLETION_FLAG; +static int storvsc_channel_init(struct hv_device *device, bool is_fc) +{ + struct storvsc_device *stor_device; + struct storvsc_cmd_request *request; + struct vstor_packet *vstor_packet; + int ret, i; + int max_chns; + bool process_sub_channels = false; - vstor_packet->version.major_minor = - storvsc_get_version(vmstor_current_major, vmstor_current_minor); + stor_device = get_out_stor_device(device); + if (!stor_device) + return -ENODEV; + request = &stor_device->init_request; + vstor_packet = &request->vstor_packet; + + /* + * Now, initiate the vsc/vsp initialization protocol on the open + * channel + */ + memset(request, 0, sizeof(struct storvsc_cmd_request)); + vstor_packet->operation = VSTOR_OPERATION_BEGIN_INITIALIZATION; + ret = storvsc_execute_vstor_op(device, request, true); + if (ret) + return ret; /* - * The revision number is only used in Windows; set it to 0. + * Query host supported protocol version. */ - vstor_packet->version.revision = 0; - ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), - (unsigned long)request, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret != 0) - goto cleanup; + for (i = 0; i < ARRAY_SIZE(protocol_version); i++) { + /* reuse the packet for version range supported */ + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = + VSTOR_OPERATION_QUERY_PROTOCOL_VERSION; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); - if (t == 0) { - ret = -ETIMEDOUT; - goto cleanup; - } + vstor_packet->version.major_minor = protocol_version[i]; - if (vstor_packet->operation != VSTOR_OPERATION_COMPLETE_IO || - vstor_packet->status != 0) - goto cleanup; + /* + * The revision number is only used in Windows; set it to 0. + */ + vstor_packet->version.revision = 0; + ret = storvsc_execute_vstor_op(device, request, false); + if (ret != 0) + return ret; + if (vstor_packet->operation != VSTOR_OPERATION_COMPLETE_IO) + return -EINVAL; - memset(vstor_packet, 0, sizeof(struct vstor_packet)); - vstor_packet->operation = VSTOR_OPERATION_QUERY_PROPERTIES; - vstor_packet->flags = REQUEST_COMPLETION_FLAG; + if (vstor_packet->status == 0) { + vmstor_proto_version = protocol_version[i]; - ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), - (unsigned long)request, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - - if (ret != 0) - goto cleanup; + break; + } + } - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); - if (t == 0) { - ret = -ETIMEDOUT; - goto cleanup; + if (vstor_packet->status != 0) { + dev_err(&device->device, "Obsolete Hyper-V version\n"); + return -EINVAL; } - if (vstor_packet->operation != VSTOR_OPERATION_COMPLETE_IO || - vstor_packet->status != 0) - goto cleanup; + + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_QUERY_PROPERTIES; + ret = storvsc_execute_vstor_op(device, request, true); + if (ret != 0) + return ret; /* * Check to see if multi-channel support is there. @@ -956,42 +920,61 @@ static int storvsc_channel_init(struct hv_device *device) * support multi-channel. */ max_chns = vstor_packet->storage_channel_properties.max_channel_cnt; - if ((vmbus_proto_version != VERSION_WIN7) && - (vmbus_proto_version != VERSION_WS2008)) { - if (vstor_packet->storage_channel_properties.flags & - STORAGE_CHANNEL_SUPPORTS_MULTI_CHANNEL) - process_sub_channels = true; - } - memset(vstor_packet, 0, sizeof(struct vstor_packet)); - vstor_packet->operation = VSTOR_OPERATION_END_INITIALIZATION; - vstor_packet->flags = REQUEST_COMPLETION_FLAG; + /* + * Allocate state to manage the sub-channels. + * We allocate an array based on the number of CPU ids. This array + * is initially sparsely populated for the CPUs assigned to channels: + * primary + sub-channels. As I/Os are initiated by different CPUs, + * the slots for all online CPUs are populated to evenly distribute + * the load across all channels. + */ + stor_device->stor_chns = kcalloc(nr_cpu_ids, sizeof(void *), + GFP_KERNEL); + if (stor_device->stor_chns == NULL) + return -ENOMEM; - ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), - (unsigned long)request, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + device->channel->change_target_cpu_callback = storvsc_change_target_cpu; + + stor_device->stor_chns[device->channel->target_cpu] = device->channel; + cpumask_set_cpu(device->channel->target_cpu, + &stor_device->alloced_cpus); + + if (vstor_packet->storage_channel_properties.flags & + STORAGE_CHANNEL_SUPPORTS_MULTI_CHANNEL) + process_sub_channels = true; + stor_device->max_transfer_bytes = + vstor_packet->storage_channel_properties.max_transfer_bytes; + + if (!is_fc) + goto done; + + /* + * For FC devices retrieve FC HBA data. + */ + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_FCHBA_DATA; + ret = storvsc_execute_vstor_op(device, request, true); if (ret != 0) - goto cleanup; + return ret; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); - if (t == 0) { - ret = -ETIMEDOUT; - goto cleanup; - } + /* + * Cache the currently active port and node ww names. + */ + cache_wwn(stor_device, vstor_packet); - if (vstor_packet->operation != VSTOR_OPERATION_COMPLETE_IO || - vstor_packet->status != 0) - goto cleanup; +done: + + memset(vstor_packet, 0, sizeof(struct vstor_packet)); + vstor_packet->operation = VSTOR_OPERATION_END_INITIALIZATION; + ret = storvsc_execute_vstor_op(device, request, true); + if (ret != 0) + return ret; if (process_sub_channels) handle_multichannel_storage(device, max_chns); - -cleanup: return ret; } @@ -1002,10 +985,49 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb, { struct storvsc_scan_work *wrk; void (*process_err_fn)(struct work_struct *work); - bool do_work = false; + struct hv_host_device *host_dev = shost_priv(host); - switch (vm_srb->srb_status) { + switch (SRB_STATUS(vm_srb->srb_status)) { case SRB_STATUS_ERROR: + case SRB_STATUS_ABORTED: + case SRB_STATUS_INVALID_REQUEST: + case SRB_STATUS_INTERNAL_ERROR: + case SRB_STATUS_TIMEOUT: + case SRB_STATUS_SELECTION_TIMEOUT: + case SRB_STATUS_BUS_RESET: + case SRB_STATUS_DATA_OVERRUN: + if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) { + /* Check for capacity change */ + if ((asc == 0x2a) && (ascq == 0x9)) { + process_err_fn = storvsc_device_scan; + /* Retry the I/O that triggered this. */ + set_host_byte(scmnd, DID_REQUEUE); + goto do_work; + } + + /* + * Check for "Operating parameters have changed" + * due to Hyper-V changing the VHD/VHDX BlockSize + * when adding/removing a differencing disk. This + * causes discard_granularity to change, so do a + * rescan to pick up the new granularity. We don't + * want scsi_report_sense() to output a message + * that a sysadmin wouldn't know what to do with. + */ + if ((asc == 0x3f) && (ascq != 0x03) && + (ascq != 0x0e)) { + process_err_fn = storvsc_device_scan; + set_host_byte(scmnd, DID_REQUEUE); + goto do_work; + } + + /* + * Otherwise, let upper layer deal with the + * error when sense message is present + */ + return; + } + /* * If there is an error; offline the device since all * error recovery strategies would have already been @@ -1017,105 +1039,105 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb, case ATA_12: set_host_byte(scmnd, DID_PASSTHROUGH); break; + /* + * On some Hyper-V hosts TEST_UNIT_READY command can + * return SRB_STATUS_ERROR. Let the upper level code + * deal with it based on the sense information. + */ + case TEST_UNIT_READY: + break; default: - set_host_byte(scmnd, DID_TARGET_FAILURE); + set_host_byte(scmnd, DID_ERROR); } - break; + return; + case SRB_STATUS_INVALID_LUN: - do_work = true; + set_host_byte(scmnd, DID_NO_CONNECT); process_err_fn = storvsc_remove_lun; - break; - case (SRB_STATUS_ABORTED | SRB_STATUS_AUTOSENSE_VALID): - if ((asc == 0x2a) && (ascq == 0x9)) { - do_work = true; - process_err_fn = storvsc_device_scan; - /* - * Retry the I/O that trigerred this. - */ - set_host_byte(scmnd, DID_REQUEUE); - } - break; - } + goto do_work; - if (!do_work) - return; + } + return; +do_work: /* * We need to schedule work to process this error; schedule it. */ wrk = kmalloc(sizeof(struct storvsc_scan_work), GFP_ATOMIC); if (!wrk) { - set_host_byte(scmnd, DID_TARGET_FAILURE); + set_host_byte(scmnd, DID_BAD_TARGET); return; } wrk->host = host; wrk->lun = vm_srb->lun; + wrk->tgt_id = vm_srb->target_id; INIT_WORK(&wrk->work, process_err_fn); - schedule_work(&wrk->work); + queue_work(host_dev->handle_error_wq, &wrk->work); } -static void storvsc_command_completion(struct storvsc_cmd_request *cmd_request) +static void storvsc_command_completion(struct storvsc_cmd_request *cmd_request, + struct storvsc_device *stor_dev) { struct scsi_cmnd *scmnd = cmd_request->cmd; - struct hv_host_device *host_dev = shost_priv(scmnd->device->host); - void (*scsi_done_fn)(struct scsi_cmnd *); struct scsi_sense_hdr sense_hdr; struct vmscsi_request *vm_srb; - struct stor_mem_pools *memp = scmnd->device->hostdata; + u32 data_transfer_length; struct Scsi_Host *host; - struct storvsc_device *stor_dev; - struct hv_device *dev = host_dev->dev; + u32 payload_sz = cmd_request->payload_sz; + void *payload = cmd_request->payload; + bool sense_ok; - stor_dev = get_in_stor_device(dev); host = stor_dev->host; vm_srb = &cmd_request->vstor_packet.vm_srb; - if (cmd_request->bounce_sgl_count) { - if (vm_srb->data_in == READ_TYPE) - copy_from_bounce_buffer(scsi_sglist(scmnd), - cmd_request->bounce_sgl, - scsi_sg_count(scmnd), - cmd_request->bounce_sgl_count); - destroy_bounce_buffer(cmd_request->bounce_sgl, - cmd_request->bounce_sgl_count); - } + data_transfer_length = vm_srb->data_transfer_length; scmnd->result = vm_srb->scsi_status; if (scmnd->result) { - if (scsi_normalize_sense(scmnd->sense_buffer, - SCSI_SENSE_BUFFERSIZE, &sense_hdr)) - scsi_print_sense_hdr("storvsc", &sense_hdr); + sense_ok = scsi_normalize_sense(scmnd->sense_buffer, + SCSI_SENSE_BUFFERSIZE, &sense_hdr); + + if (sense_ok && do_logging(STORVSC_LOGGING_WARN)) + scsi_print_sense_hdr(scmnd->device, "storvsc", + &sense_hdr); } - if (vm_srb->srb_status != SRB_STATUS_SUCCESS) + if (vm_srb->srb_status != SRB_STATUS_SUCCESS) { storvsc_handle_error(vm_srb, scmnd, host, sense_hdr.asc, sense_hdr.ascq); + /* + * The Windows driver set data_transfer_length on + * SRB_STATUS_DATA_OVERRUN. On other errors, this value + * is untouched. In these cases we set it to 0. + */ + if (vm_srb->srb_status != SRB_STATUS_DATA_OVERRUN) + data_transfer_length = 0; + } - scsi_set_resid(scmnd, - cmd_request->data_buffer.len - - vm_srb->data_transfer_length); - - scsi_done_fn = scmnd->scsi_done; + /* Validate data_transfer_length (from Hyper-V) */ + if (data_transfer_length > cmd_request->payload->range.len) + data_transfer_length = cmd_request->payload->range.len; - scmnd->host_scribble = NULL; - scmnd->scsi_done = NULL; + scsi_set_resid(scmnd, + cmd_request->payload->range.len - data_transfer_length); - scsi_done_fn(scmnd); + scsi_done(scmnd); - mempool_free(cmd_request, memp->request_mempool); + if (payload_sz > + sizeof(struct vmbus_channel_packet_multipage_buffer)) + kfree(payload); } -static void storvsc_on_io_completion(struct hv_device *device, +static void storvsc_on_io_completion(struct storvsc_device *stor_device, struct vstor_packet *vstor_packet, struct storvsc_cmd_request *request) { - struct storvsc_device *stor_device; struct vstor_packet *stor_pkt; + struct hv_device *device = stor_device->device; - stor_device = hv_get_drvdata(device); stor_pkt = &request->vstor_packet; /* @@ -1123,6 +1145,7 @@ static void storvsc_on_io_completion(struct hv_device *device, * not correctly handle: * INQUIRY command with page code parameter set to 0x80 * MODE_SENSE command with cmd[2] == 0x1c + * MAINTENANCE_IN is not supported by HyperV FC passthrough * * Setup srb and scsi status so this won't be fatal. * We do this so we can distinguish truly fatal failues @@ -1130,80 +1153,84 @@ static void storvsc_on_io_completion(struct hv_device *device, */ if ((stor_pkt->vm_srb.cdb[0] == INQUIRY) || - (stor_pkt->vm_srb.cdb[0] == MODE_SENSE)) { + (stor_pkt->vm_srb.cdb[0] == MODE_SENSE) || + (stor_pkt->vm_srb.cdb[0] == MAINTENANCE_IN && + hv_dev_is_fc(device))) { vstor_packet->vm_srb.scsi_status = 0; vstor_packet->vm_srb.srb_status = SRB_STATUS_SUCCESS; } - /* Copy over the status...etc */ stor_pkt->vm_srb.scsi_status = vstor_packet->vm_srb.scsi_status; stor_pkt->vm_srb.srb_status = vstor_packet->vm_srb.srb_status; - stor_pkt->vm_srb.sense_info_length = - vstor_packet->vm_srb.sense_info_length; - - if (vstor_packet->vm_srb.scsi_status != 0 || - vstor_packet->vm_srb.srb_status != SRB_STATUS_SUCCESS){ - dev_warn(&device->device, - "cmd 0x%x scsi status 0x%x srb status 0x%x\n", - stor_pkt->vm_srb.cdb[0], - vstor_packet->vm_srb.scsi_status, - vstor_packet->vm_srb.srb_status); - } - if ((vstor_packet->vm_srb.scsi_status & 0xFF) == 0x02) { - /* CHECK_CONDITION */ - if (vstor_packet->vm_srb.srb_status & - SRB_STATUS_AUTOSENSE_VALID) { - /* autosense data available */ - dev_warn(&device->device, - "stor pkt %p autosense data valid - len %d\n", - request, - vstor_packet->vm_srb.sense_info_length); + /* + * Copy over the sense_info_length, but limit to the known max + * size if Hyper-V returns a bad value. + */ + stor_pkt->vm_srb.sense_info_length = min_t(u8, STORVSC_SENSE_BUFFER_SIZE, + vstor_packet->vm_srb.sense_info_length); - memcpy(request->sense_buffer, - vstor_packet->vm_srb.sense_data, - vstor_packet->vm_srb.sense_info_length); + if (vstor_packet->vm_srb.scsi_status != 0 || + vstor_packet->vm_srb.srb_status != SRB_STATUS_SUCCESS) { - } + /* + * Log TEST_UNIT_READY errors only as warnings. Hyper-V can + * return errors when detecting devices using TEST_UNIT_READY, + * and logging these as errors produces unhelpful noise. + */ + int loglevel = (stor_pkt->vm_srb.cdb[0] == TEST_UNIT_READY) ? + STORVSC_LOGGING_WARN : STORVSC_LOGGING_ERROR; + + storvsc_log_ratelimited(device, loglevel, + "tag#%d cmd 0x%x status: scsi 0x%x srb 0x%x host 0x%x\n", + scsi_cmd_to_rq(request->cmd)->tag, + stor_pkt->vm_srb.cdb[0], + vstor_packet->vm_srb.scsi_status, + vstor_packet->vm_srb.srb_status, + vstor_packet->status); } + if (vstor_packet->vm_srb.scsi_status == SAM_STAT_CHECK_CONDITION && + (vstor_packet->vm_srb.srb_status & SRB_STATUS_AUTOSENSE_VALID)) + memcpy(request->cmd->sense_buffer, + vstor_packet->vm_srb.sense_data, + stor_pkt->vm_srb.sense_info_length); + stor_pkt->vm_srb.data_transfer_length = - vstor_packet->vm_srb.data_transfer_length; + vstor_packet->vm_srb.data_transfer_length; - storvsc_command_completion(request); + storvsc_command_completion(request, stor_device); if (atomic_dec_and_test(&stor_device->num_outstanding_req) && stor_device->drain_notify) wake_up(&stor_device->waiting_to_drain); - - } -static void storvsc_on_receive(struct hv_device *device, +static void storvsc_on_receive(struct storvsc_device *stor_device, struct vstor_packet *vstor_packet, struct storvsc_cmd_request *request) { - struct storvsc_scan_work *work; - struct storvsc_device *stor_device; - + struct hv_host_device *host_dev; switch (vstor_packet->operation) { case VSTOR_OPERATION_COMPLETE_IO: - storvsc_on_io_completion(device, vstor_packet, request); + storvsc_on_io_completion(stor_device, vstor_packet, request); break; case VSTOR_OPERATION_REMOVE_DEVICE: case VSTOR_OPERATION_ENUMERATE_BUS: - stor_device = get_in_stor_device(device); - work = kmalloc(sizeof(struct storvsc_scan_work), GFP_ATOMIC); - if (!work) - return; - - INIT_WORK(&work->work, storvsc_bus_scan); - work->host = stor_device->host; - schedule_work(&work->work); + host_dev = shost_priv(stor_device->host); + queue_work( + host_dev->handle_error_wq, &host_dev->host_scan_work); break; + case VSTOR_OPERATION_FCHBA_DATA: + cache_wwn(stor_device, vstor_packet); +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + fc_host_node_name(stor_device->host) = stor_device->node_name; + fc_host_port_name(stor_device->host) = stor_device->port_name; +#endif + break; default: break; } @@ -1212,13 +1239,11 @@ static void storvsc_on_receive(struct hv_device *device, static void storvsc_on_channel_callback(void *context) { struct vmbus_channel *channel = (struct vmbus_channel *)context; + const struct vmpacket_descriptor *desc; struct hv_device *device; struct storvsc_device *stor_device; - u32 bytes_recvd; - u64 request_id; - unsigned char packet[ALIGN(sizeof(struct vstor_packet), 8)]; - struct storvsc_cmd_request *request; - int ret; + struct Scsi_Host *shost; + unsigned long time_limit = jiffies + msecs_to_jiffies(CALLBACK_TIMEOUT); if (channel->primary_channel != NULL) device = channel->primary_channel->device_obj; @@ -1229,43 +1254,91 @@ static void storvsc_on_channel_callback(void *context) if (!stor_device) return; - do { - ret = vmbus_recvpacket(channel, packet, - ALIGN((sizeof(struct vstor_packet) - - vmscsi_size_delta), 8), - &bytes_recvd, &request_id); - if (ret == 0 && bytes_recvd > 0) { + shost = stor_device->host; - request = (struct storvsc_cmd_request *) - (unsigned long)request_id; + foreach_vmbus_pkt(desc, channel) { + struct vstor_packet *packet = hv_pkt_data(desc); + struct storvsc_cmd_request *request = NULL; + u32 pktlen = hv_pkt_datalen(desc); + u64 rqst_id = desc->trans_id; + u32 minlen = rqst_id ? sizeof(struct vstor_packet) : + sizeof(enum vstor_packet_operation); - if ((request == &stor_device->init_request) || - (request == &stor_device->reset_request)) { + if (unlikely(time_after(jiffies, time_limit))) { + hv_pkt_iter_close(channel); + return; + } + + if (pktlen < minlen) { + dev_err(&device->device, + "Invalid pkt: id=%llu, len=%u, minlen=%u\n", + rqst_id, pktlen, minlen); + continue; + } - memcpy(&request->vstor_packet, packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta)); - complete(&request->wait_event); + if (rqst_id == VMBUS_RQST_INIT) { + request = &stor_device->init_request; + } else if (rqst_id == VMBUS_RQST_RESET) { + request = &stor_device->reset_request; + } else { + /* Hyper-V can send an unsolicited message with ID of 0 */ + if (rqst_id == 0) { + /* + * storvsc_on_receive() looks at the vstor_packet in the message + * from the ring buffer. + * + * - If the operation in the vstor_packet is COMPLETE_IO, then + * we call storvsc_on_io_completion(), and dereference the + * guest memory address. Make sure we don't call + * storvsc_on_io_completion() with a guest memory address + * that is zero if Hyper-V were to construct and send such + * a bogus packet. + * + * - If the operation in the vstor_packet is FCHBA_DATA, then + * we call cache_wwn(), and access the data payload area of + * the packet (wwn_packet); however, there is no guarantee + * that the packet is big enough to contain such area. + * Future-proof the code by rejecting such a bogus packet. + */ + if (packet->operation == VSTOR_OPERATION_COMPLETE_IO || + packet->operation == VSTOR_OPERATION_FCHBA_DATA) { + dev_err(&device->device, "Invalid packet with ID of 0\n"); + continue; + } } else { - storvsc_on_receive(device, - (struct vstor_packet *)packet, - request); + struct scsi_cmnd *scmnd; + + /* Transaction 'rqst_id' corresponds to tag 'rqst_id - 1' */ + scmnd = scsi_host_find_tag(shost, rqst_id - 1); + if (scmnd == NULL) { + dev_err(&device->device, "Incorrect transaction ID\n"); + continue; + } + request = (struct storvsc_cmd_request *)scsi_cmd_priv(scmnd); + scsi_dma_unmap(scmnd); } - } else { - break; + + storvsc_on_receive(stor_device, packet, request); + continue; } - } while (1); - return; + memcpy(&request->vstor_packet, packet, + sizeof(struct vstor_packet)); + complete(&request->wait_event); + } } -static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size) +static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size, + bool is_fc) { struct vmstorage_channel_properties props; int ret; memset(&props, 0, sizeof(struct vmstorage_channel_properties)); + device->channel->max_pkt_size = STORVSC_MAX_PKT_SIZE; + device->channel->next_request_id_callback = storvsc_next_request_id; + ret = vmbus_open(device->channel, ring_size, ring_size, @@ -1276,7 +1349,9 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size) if (ret != 0) return ret; - ret = storvsc_channel_init(device); + ret = storvsc_channel_init(device, is_fc); + if (ret) + vmbus_close(device->channel); return ret; } @@ -1284,13 +1359,13 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size) static int storvsc_dev_remove(struct hv_device *device) { struct storvsc_device *stor_device; - unsigned long flags; stor_device = hv_get_drvdata(device); - spin_lock_irqsave(&device->channel->inbound_lock, flags); stor_device->destroy = true; - spin_unlock_irqrestore(&device->channel->inbound_lock, flags); + + /* Make sure flag is set before waiting */ + wmb(); /* * At this point, all outbound traffic should be disable. We @@ -1307,24 +1382,83 @@ static int storvsc_dev_remove(struct hv_device *device) * we have drained - to drain the outgoing packets, we need to * allow incoming packets. */ - spin_lock_irqsave(&device->channel->inbound_lock, flags); hv_set_drvdata(device, NULL); - spin_unlock_irqrestore(&device->channel->inbound_lock, flags); /* Close the channel */ vmbus_close(device->channel); + kfree(stor_device->stor_chns); kfree(stor_device); return 0; } +static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, + u16 q_num) +{ + u16 slot = 0; + u16 hash_qnum; + const struct cpumask *node_mask; + int num_channels, tgt_cpu; + + if (stor_device->num_sc == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; + return stor_device->device->channel; + } + + /* + * Our channel array could be sparsley populated and we + * initiated I/O on a processor/hw-q that does not + * currently have a designated channel. Fix this. + * The strategy is simple: + * I. Prefer the channel associated with the current CPU + * II. Ensure NUMA locality + * III. Distribute evenly (best effort) + */ + + /* Prefer the channel on the I/O issuing processor/hw-q */ + if (cpumask_test_cpu(q_num, &stor_device->alloced_cpus)) + return stor_device->stor_chns[q_num]; + + node_mask = cpumask_of_node(cpu_to_node(q_num)); + + num_channels = 0; + for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { + if (cpumask_test_cpu(tgt_cpu, node_mask)) + num_channels++; + } + if (num_channels == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; + return stor_device->device->channel; + } + + hash_qnum = q_num; + while (hash_qnum >= num_channels) + hash_qnum -= num_channels; + + for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { + if (!cpumask_test_cpu(tgt_cpu, node_mask)) + continue; + if (slot == hash_qnum) + break; + slot++; + } + + stor_device->stor_chns[q_num] = stor_device->stor_chns[tgt_cpu]; + + return stor_device->stor_chns[q_num]; +} + + static int storvsc_do_io(struct hv_device *device, - struct storvsc_cmd_request *request) + struct storvsc_cmd_request *request, u16 q_num) { struct storvsc_device *stor_device; struct vstor_packet *vstor_packet; - struct vmbus_channel *outgoing_channel; + struct vmbus_channel *outgoing_channel, *channel; + unsigned long flags; int ret = 0; + const struct cpumask *node_mask; + int tgt_cpu; vstor_packet = &request->vstor_packet; stor_device = get_out_stor_device(device); @@ -1335,37 +1469,88 @@ static int storvsc_do_io(struct hv_device *device, request->device = device; /* - * Select an an appropriate channel to send the request out. + * Select an appropriate channel to send the request out. */ + /* See storvsc_change_target_cpu(). */ + outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); + if (outgoing_channel != NULL) { + if (hv_get_avail_to_write_percent(&outgoing_channel->outbound) + > ring_avail_percent_lowater) + goto found_channel; - outgoing_channel = vmbus_get_outgoing_channel(device->channel); + /* + * Channel is busy, try to find a channel on the same NUMA node + */ + node_mask = cpumask_of_node(cpu_to_node(q_num)); + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + if (!cpumask_test_cpu(tgt_cpu, node_mask)) + continue; + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; + goto found_channel; + } + } + /* + * If we reach here, all the channels on the current + * NUMA node are busy. Try to find a channel in + * all NUMA nodes + */ + for_each_cpu_wrap(tgt_cpu, &stor_device->alloced_cpus, + q_num + 1) { + channel = READ_ONCE(stor_device->stor_chns[tgt_cpu]); + if (!channel) + continue; + if (hv_get_avail_to_write_percent(&channel->outbound) + > ring_avail_percent_lowater) { + outgoing_channel = channel; + goto found_channel; + } + } + /* + * If we reach here, all the channels are busy. Use the + * original channel found. + */ + } else { + spin_lock_irqsave(&stor_device->lock, flags); + outgoing_channel = stor_device->stor_chns[q_num]; + if (outgoing_channel != NULL) { + spin_unlock_irqrestore(&stor_device->lock, flags); + goto found_channel; + } + outgoing_channel = get_og_chn(stor_device, q_num); + spin_unlock_irqrestore(&stor_device->lock, flags); + } +found_channel: vstor_packet->flags |= REQUEST_COMPLETION_FLAG; - vstor_packet->vm_srb.length = (sizeof(struct vmscsi_request) - - vmscsi_size_delta); + vstor_packet->vm_srb.length = sizeof(struct vmscsi_request); - vstor_packet->vm_srb.sense_info_length = sense_buffer_size; + vstor_packet->vm_srb.sense_info_length = STORVSC_SENSE_BUFFER_SIZE; vstor_packet->vm_srb.data_transfer_length = - request->data_buffer.len; + request->payload->range.len; vstor_packet->operation = VSTOR_OPERATION_EXECUTE_SRB; - if (request->data_buffer.len) { - ret = vmbus_sendpacket_multipagebuffer(outgoing_channel, - &request->data_buffer, + if (request->payload->range.len) { + + ret = vmbus_sendpacket_mpb_desc(outgoing_channel, + request->payload, request->payload_sz, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), + sizeof(struct vstor_packet), (unsigned long)request); } else { - ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), + ret = vmbus_sendpacket(outgoing_channel, vstor_packet, + sizeof(struct vstor_packet), (unsigned long)request, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); @@ -1381,67 +1566,50 @@ static int storvsc_do_io(struct hv_device *device, static int storvsc_device_alloc(struct scsi_device *sdevice) { - struct stor_mem_pools *memp; - int number = STORVSC_MIN_BUF_NR; - - memp = kzalloc(sizeof(struct stor_mem_pools), GFP_KERNEL); - if (!memp) - return -ENOMEM; - - memp->request_pool = - kmem_cache_create(dev_name(&sdevice->sdev_dev), - sizeof(struct storvsc_cmd_request), 0, - SLAB_HWCACHE_ALIGN, NULL); - - if (!memp->request_pool) - goto err0; - - memp->request_mempool = mempool_create(number, mempool_alloc_slab, - mempool_free_slab, - memp->request_pool); - - if (!memp->request_mempool) - goto err1; - - sdevice->hostdata = memp; + /* + * Set blist flag to permit the reading of the VPD pages even when + * the target may claim SPC-2 compliance. MSFT targets currently + * claim SPC-2 compliance while they implement post SPC-2 features. + * With this flag we can correctly handle WRITE_SAME_16 issues. + * + * Hypervisor reports SCSI_UNKNOWN type for DVD ROM device but + * still supports REPORT LUN. + */ + sdevice->sdev_bflags = BLIST_REPORTLUN2 | BLIST_TRY_VPD_PAGES; return 0; - -err1: - kmem_cache_destroy(memp->request_pool); - -err0: - kfree(memp); - return -ENOMEM; } -static void storvsc_device_destroy(struct scsi_device *sdevice) +static int storvsc_sdev_configure(struct scsi_device *sdevice, + struct queue_limits *lim) { - struct stor_mem_pools *memp = sdevice->hostdata; - - mempool_destroy(memp->request_mempool); - kmem_cache_destroy(memp->request_pool); - kfree(memp); - sdevice->hostdata = NULL; -} - -static int storvsc_device_configure(struct scsi_device *sdevice) -{ - scsi_adjust_queue_depth(sdevice, MSG_SIMPLE_TAG, - STORVSC_MAX_IO_REQUESTS); - - blk_queue_max_segment_size(sdevice->request_queue, PAGE_SIZE); - - blk_queue_bounce_limit(sdevice->request_queue, BLK_BOUNCE_ANY); - blk_queue_rq_timeout(sdevice->request_queue, (storvsc_timeout * HZ)); + /* storvsc devices don't support MAINTENANCE_IN SCSI cmd */ + sdevice->no_report_opcodes = 1; sdevice->no_write_same = 1; + /* + * If the host is WIN8 or WIN8 R2, claim conformance to SPC-3 + * if the device is a MSFT virtual device. If the host is + * WIN10 or newer, allow write_same. + */ + if (!strncmp(sdevice->vendor, "Msft", 4)) { + switch (vmstor_proto_version) { + case VMSTOR_PROTO_VERSION_WIN8: + case VMSTOR_PROTO_VERSION_WIN8_1: + sdevice->scsi_level = SCSI_SPC_3; + break; + } + + if (vmstor_proto_version >= VMSTOR_PROTO_VERSION_WIN10) + sdevice->no_write_same = 0; + } + return 0; } -static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * bdev, +static int storvsc_get_chs(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info) { sector_t nsect = capacity; @@ -1474,13 +1642,13 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) struct vstor_packet *vstor_packet; int ret, t; - stor_device = get_out_stor_device(device); if (!stor_device) return FAILED; request = &stor_device->reset_request; vstor_packet = &request->vstor_packet; + memset(vstor_packet, 0, sizeof(struct vstor_packet)); init_completion(&request->wait_event); @@ -1489,15 +1657,14 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) vstor_packet->vm_srb.path_id = stor_device->path_id; ret = vmbus_sendpacket(device->channel, vstor_packet, - (sizeof(struct vstor_packet) - - vmscsi_size_delta), - (unsigned long)&stor_device->reset_request, + sizeof(struct vstor_packet), + VMBUS_RQST_RESET, VM_PKT_DATA_INBAND, VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret != 0) return FAILED; - t = wait_for_completion_timeout(&request->wait_event, 5*HZ); + t = wait_for_completion_timeout(&request->wait_event, storvsc_timeout * HZ); if (t == 0) return TIMEOUT_ERROR; @@ -1515,6 +1682,16 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) return SUCCESS; } +/* + * The host guarantees to respond to each command, although I/O latencies might + * be unbounded on Azure. Reset the timer unconditionally to give the host a + * chance to perform EH. + */ +static enum scsi_timeout_action storvsc_eh_timed_out(struct scsi_cmnd *scmnd) +{ + return SCSI_EH_RESET_TIMER; +} + static bool storvsc_scsi_cmd_ok(struct scsi_cmnd *scmnd) { bool allowed = true; @@ -1528,7 +1705,7 @@ static bool storvsc_scsi_cmd_ok(struct scsi_cmnd *scmnd) * this. So, don't send it. */ case SET_WINDOW: - scmnd->result = ILLEGAL_REQUEST << 16; + set_host_byte(scmnd, DID_ERROR); allowed = false; break; default: @@ -1542,62 +1719,67 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) int ret; struct hv_host_device *host_dev = shost_priv(host); struct hv_device *dev = host_dev->dev; - struct storvsc_cmd_request *cmd_request; - unsigned int request_size = 0; - int i; + struct storvsc_cmd_request *cmd_request = scsi_cmd_priv(scmnd); struct scatterlist *sgl; - unsigned int sg_count = 0; struct vmscsi_request *vm_srb; - struct stor_mem_pools *memp = scmnd->device->hostdata; + struct vmbus_packet_mpb_array *payload; + u32 payload_sz; + u32 length; - if (!storvsc_scsi_cmd_ok(scmnd)) { - scmnd->scsi_done(scmnd); - return 0; + if (vmstor_proto_version <= VMSTOR_PROTO_VERSION_WIN8) { + /* + * On legacy hosts filter unimplemented commands. + * Future hosts are expected to correctly handle + * unsupported commands. Furthermore, it is + * possible that some of the currently + * unsupported commands maybe supported in + * future versions of the host. + */ + if (!storvsc_scsi_cmd_ok(scmnd)) { + scsi_done(scmnd); + return 0; + } } - request_size = sizeof(struct storvsc_cmd_request); - - cmd_request = mempool_alloc(memp->request_mempool, - GFP_ATOMIC); - - /* - * We might be invoked in an interrupt context; hence - * mempool_alloc() can fail. - */ - if (!cmd_request) - return SCSI_MLQUEUE_DEVICE_BUSY; - - memset(cmd_request, 0, sizeof(struct storvsc_cmd_request)); - /* Setup the cmd request */ cmd_request->cmd = scmnd; - scmnd->host_scribble = (unsigned char *)cmd_request; - + memset(&cmd_request->vstor_packet, 0, sizeof(struct vstor_packet)); vm_srb = &cmd_request->vstor_packet.vm_srb; - vm_srb->win8_extension.time_out_value = 60; + vm_srb->time_out_value = 60; + vm_srb->srb_flags |= + SRB_FLAGS_DISABLE_SYNCH_TRANSFER; + + if (scmnd->device->tagged_supported) { + vm_srb->srb_flags |= + (SRB_FLAGS_QUEUE_ACTION_ENABLE | SRB_FLAGS_NO_QUEUE_FREEZE); + vm_srb->queue_tag = SP_UNTAGGED; + vm_srb->queue_action = SRB_SIMPLE_TAG_REQUEST; + } /* Build the SRB */ switch (scmnd->sc_data_direction) { case DMA_TO_DEVICE: vm_srb->data_in = WRITE_TYPE; - vm_srb->win8_extension.srb_flags |= SRB_FLAGS_DATA_OUT; - vm_srb->win8_extension.srb_flags |= - (SRB_FLAGS_QUEUE_ACTION_ENABLE | - SRB_FLAGS_DISABLE_SYNCH_TRANSFER); + vm_srb->srb_flags |= SRB_FLAGS_DATA_OUT; break; case DMA_FROM_DEVICE: vm_srb->data_in = READ_TYPE; - vm_srb->win8_extension.srb_flags |= SRB_FLAGS_DATA_IN; - vm_srb->win8_extension.srb_flags |= - (SRB_FLAGS_QUEUE_ACTION_ENABLE | - SRB_FLAGS_DISABLE_SYNCH_TRANSFER); + vm_srb->srb_flags |= SRB_FLAGS_DATA_IN; break; - default: + case DMA_NONE: vm_srb->data_in = UNKNOWN_TYPE; - vm_srb->win8_extension.srb_flags = 0; + vm_srb->srb_flags |= SRB_FLAGS_NO_DATA_TRANSFER; break; + default: + /* + * This is DMA_BIDIRECTIONAL or something else we are never + * supposed to see here. + */ + WARN(1, "Unexpected data direction: %d\n", + scmnd->sc_data_direction); + return -EINVAL; } @@ -1610,93 +1792,107 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) memcpy(vm_srb->cdb, scmnd->cmnd, vm_srb->cdb_length); - cmd_request->sense_buffer = scmnd->sense_buffer; + sgl = (struct scatterlist *)scsi_sglist(scmnd); + length = scsi_bufflen(scmnd); + payload = (struct vmbus_packet_mpb_array *)&cmd_request->mpb; + payload->range.len = 0; + payload_sz = 0; - cmd_request->data_buffer.len = scsi_bufflen(scmnd); if (scsi_sg_count(scmnd)) { - sgl = (struct scatterlist *)scsi_sglist(scmnd); - sg_count = scsi_sg_count(scmnd); - - /* check if we need to bounce the sgl */ - if (do_bounce_buffer(sgl, scsi_sg_count(scmnd)) != -1) { - cmd_request->bounce_sgl = - create_bounce_buffer(sgl, scsi_sg_count(scmnd), - scsi_bufflen(scmnd), - vm_srb->data_in); - if (!cmd_request->bounce_sgl) { - ret = SCSI_MLQUEUE_HOST_BUSY; - goto queue_error; - } - - cmd_request->bounce_sgl_count = - ALIGN(scsi_bufflen(scmnd), PAGE_SIZE) >> - PAGE_SHIFT; + unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset); + unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length); + struct scatterlist *sg; + unsigned long hvpfn, hvpfns_to_add; + int j, i = 0, sg_count; + + payload_sz = (hvpg_count * sizeof(u64) + + sizeof(struct vmbus_packet_mpb_array)); + + if (hvpg_count > MAX_PAGE_BUFFER_COUNT) { + payload = kzalloc(payload_sz, GFP_ATOMIC); + if (!payload) + return SCSI_MLQUEUE_DEVICE_BUSY; + } - if (vm_srb->data_in == WRITE_TYPE) - copy_to_bounce_buffer(sgl, - cmd_request->bounce_sgl, - scsi_sg_count(scmnd)); + payload->rangecount = 1; + payload->range.len = length; + payload->range.offset = offset_in_hvpg; - sgl = cmd_request->bounce_sgl; - sg_count = cmd_request->bounce_sgl_count; + sg_count = scsi_dma_map(scmnd); + if (sg_count < 0) { + ret = SCSI_MLQUEUE_DEVICE_BUSY; + goto err_free_payload; } - cmd_request->data_buffer.offset = sgl[0].offset; - - for (i = 0; i < sg_count; i++) - cmd_request->data_buffer.pfn_array[i] = - page_to_pfn(sg_page((&sgl[i]))); + for_each_sg(sgl, sg, sg_count, j) { + /* + * Init values for the current sgl entry. hvpfns_to_add + * is in units of Hyper-V size pages. Handling the + * PAGE_SIZE != HV_HYP_PAGE_SIZE case also handles + * values of sgl->offset that are larger than PAGE_SIZE. + * Such offsets are handled even on other than the first + * sgl entry, provided they are a multiple of PAGE_SIZE. + */ + hvpfn = HVPFN_DOWN(sg_dma_address(sg)); + hvpfns_to_add = HVPFN_UP(sg_dma_address(sg) + + sg_dma_len(sg)) - hvpfn; - } else if (scsi_sglist(scmnd)) { - cmd_request->data_buffer.offset = - virt_to_phys(scsi_sglist(scmnd)) & (PAGE_SIZE-1); - cmd_request->data_buffer.pfn_array[0] = - virt_to_phys(scsi_sglist(scmnd)) >> PAGE_SHIFT; + /* + * Fill the next portion of the PFN array with + * sequential Hyper-V PFNs for the continguous physical + * memory described by the sgl entry. The end of the + * last sgl should be reached at the same time that + * the PFN array is filled. + */ + while (hvpfns_to_add--) + payload->range.pfn_array[i++] = hvpfn++; + } } + cmd_request->payload = payload; + cmd_request->payload_sz = payload_sz; + /* Invokes the vsc to start an IO */ - ret = storvsc_do_io(dev, cmd_request); + ret = storvsc_do_io(dev, cmd_request, get_cpu()); + put_cpu(); + + if (ret) + scsi_dma_unmap(scmnd); if (ret == -EAGAIN) { /* no more space */ - - if (cmd_request->bounce_sgl_count) { - destroy_bounce_buffer(cmd_request->bounce_sgl, - cmd_request->bounce_sgl_count); - - ret = SCSI_MLQUEUE_DEVICE_BUSY; - goto queue_error; - } + ret = SCSI_MLQUEUE_DEVICE_BUSY; + goto err_free_payload; } return 0; -queue_error: - mempool_free(cmd_request, memp->request_mempool); - scmnd->host_scribble = NULL; +err_free_payload: + if (payload_sz > sizeof(cmd_request->mpb)) + kfree(payload); + return ret; } static struct scsi_host_template scsi_driver = { .module = THIS_MODULE, .name = "storvsc_host_t", + .cmd_size = sizeof(struct storvsc_cmd_request), .bios_param = storvsc_get_chs, .queuecommand = storvsc_queuecommand, .eh_host_reset_handler = storvsc_host_reset_handler, - .slave_alloc = storvsc_device_alloc, - .slave_destroy = storvsc_device_destroy, - .slave_configure = storvsc_device_configure, - .cmd_per_lun = 1, - /* 64 max_queue * 1 target */ - .can_queue = STORVSC_MAX_IO_REQUESTS*STORVSC_MAX_TARGETS, + .proc_name = "storvsc_host", + .eh_timed_out = storvsc_eh_timed_out, + .sdev_init = storvsc_device_alloc, + .sdev_configure = storvsc_sdev_configure, + .cmd_per_lun = 2048, .this_id = -1, - /* no use setting to 0 since ll_blk_rw reset it to 1 */ - /* currently 32 */ - .sg_tablesize = MAX_MULTIPAGE_BUFFER_COUNT, - .use_clustering = DISABLE_CLUSTERING, - /* Make sure we dont get a sg segment crosses a page boundary */ - .dma_boundary = PAGE_SIZE-1, + /* Ensure there are no gaps in presented sgls */ + .virt_boundary_mask = HV_HYP_PAGE_SIZE - 1, + .no_write_same = 1, + .track_queue_depth = 1, + .change_queue_depth = storvsc_change_queue_depth, }; enum { @@ -1724,33 +1920,40 @@ static const struct hv_vmbus_device_id id_table[] = { MODULE_DEVICE_TABLE(vmbus, id_table); +static const struct { guid_t guid; } fc_guid = { HV_SYNTHFC_GUID }; + +static bool hv_dev_is_fc(struct hv_device *hv_dev) +{ + return guid_equal(&fc_guid.guid, &hv_dev->dev_type); +} + static int storvsc_probe(struct hv_device *device, const struct hv_vmbus_device_id *dev_id) { int ret; + int num_cpus = num_online_cpus(); + int num_present_cpus = num_present_cpus(); struct Scsi_Host *host; struct hv_host_device *host_dev; - bool dev_is_ide = ((dev_id->driver_data == IDE_GUID) ? true : false); + bool dev_is_ide = dev_id->driver_data == IDE_GUID; + bool is_fc = dev_id->driver_data == SFC_GUID; int target = 0; struct storvsc_device *stor_device; + int max_sub_channels = 0; + u32 max_xfer_bytes; /* - * Based on the windows host we are running on, - * set state to properly communicate with the host. + * We support sub-channels for storage on SCSI and FC controllers. + * The number of sub-channels offerred is based on the number of + * VCPUs in the guest. */ + if (!dev_is_ide) + max_sub_channels = + (num_cpus - 1) / storvsc_vcpus_per_sub_channel; - if (vmbus_proto_version == VERSION_WIN8) { - sense_buffer_size = POST_WIN7_STORVSC_SENSE_BUFFER_SIZE; - vmscsi_size_delta = 0; - vmstor_current_major = VMSTOR_WIN8_MAJOR; - vmstor_current_minor = VMSTOR_WIN8_MINOR; - } else { - sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE; - vmscsi_size_delta = sizeof(struct vmscsi_win8_extension); - vmstor_current_major = VMSTOR_WIN7_MAJOR; - vmstor_current_minor = VMSTOR_WIN7_MINOR; - } - + scsi_driver.can_queue = max_outstanding_req_per_channel * + (max_sub_channels + 1) * + (100 - ring_avail_percent_lowater) / 100; host = scsi_host_alloc(&scsi_driver, sizeof(struct hv_host_device)); @@ -1762,6 +1965,7 @@ static int storvsc_probe(struct hv_device *device, host_dev->port = host->host_no; host_dev->dev = device; + host_dev->host = host; stor_device = kzalloc(sizeof(struct storvsc_device), GFP_KERNEL); @@ -1771,33 +1975,102 @@ static int storvsc_probe(struct hv_device *device, } stor_device->destroy = false; - stor_device->open_sub_channel = false; init_waitqueue_head(&stor_device->waiting_to_drain); stor_device->device = device; stor_device->host = host; + spin_lock_init(&stor_device->lock); hv_set_drvdata(device, stor_device); + dma_set_min_align_mask(&device->device, HV_HYP_PAGE_SIZE - 1); stor_device->port_number = host->host_no; - ret = storvsc_connect_to_vsp(device, storvsc_ringbuffer_size); + ret = storvsc_connect_to_vsp(device, aligned_ringbuffer_size, is_fc); if (ret) goto err_out1; host_dev->path = stor_device->path_id; host_dev->target = stor_device->target_id; - /* max # of devices per target */ - host->max_lun = STORVSC_MAX_LUNS_PER_TARGET; - /* max # of targets per channel */ - host->max_id = STORVSC_MAX_TARGETS; - /* max # of channels */ - host->max_channel = STORVSC_MAX_CHANNELS - 1; + switch (dev_id->driver_data) { + case SFC_GUID: + host->max_lun = STORVSC_FC_MAX_LUNS_PER_TARGET; + host->max_id = STORVSC_FC_MAX_TARGETS; + host->max_channel = STORVSC_FC_MAX_CHANNELS - 1; +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + host->transportt = fc_transport_template; +#endif + break; + + case SCSI_GUID: + host->max_lun = STORVSC_MAX_LUNS_PER_TARGET; + host->max_id = STORVSC_MAX_TARGETS; + host->max_channel = STORVSC_MAX_CHANNELS - 1; + break; + + default: + host->max_lun = STORVSC_IDE_MAX_LUNS_PER_TARGET; + host->max_id = STORVSC_IDE_MAX_TARGETS; + host->max_channel = STORVSC_IDE_MAX_CHANNELS - 1; + break; + } /* max cmd length */ host->max_cmd_len = STORVSC_MAX_CMD_LEN; + /* + * Any reasonable Hyper-V configuration should provide + * max_transfer_bytes value aligning to HV_HYP_PAGE_SIZE, + * protecting it from any weird value. + */ + max_xfer_bytes = round_down(stor_device->max_transfer_bytes, HV_HYP_PAGE_SIZE); + if (is_fc) + max_xfer_bytes = min(max_xfer_bytes, STORVSC_FC_MAX_XFER_SIZE); + + /* max_hw_sectors_kb */ + host->max_sectors = max_xfer_bytes >> 9; + /* + * There are 2 requirements for Hyper-V storvsc sgl segments, + * based on which the below calculation for max segments is + * done: + * + * 1. Except for the first and last sgl segment, all sgl segments + * should be align to HV_HYP_PAGE_SIZE, that also means the + * maximum number of segments in a sgl can be calculated by + * dividing the total max transfer length by HV_HYP_PAGE_SIZE. + * + * 2. Except for the first and last, each entry in the SGL must + * have an offset that is a multiple of HV_HYP_PAGE_SIZE. + */ + host->sg_tablesize = (max_xfer_bytes >> HV_HYP_PAGE_SHIFT) + 1; + /* + * For non-IDE disks, the host supports multiple channels. + * Set the number of HW queues we are supporting. + */ + if (!dev_is_ide) { + if (storvsc_max_hw_queues > num_present_cpus) { + storvsc_max_hw_queues = 0; + storvsc_log(device, STORVSC_LOGGING_WARN, + "Resetting invalid storvsc_max_hw_queues value to default.\n"); + } + if (storvsc_max_hw_queues) + host->nr_hw_queues = storvsc_max_hw_queues; + else + host->nr_hw_queues = num_present_cpus; + } + /* + * Set the error handler work queue. + */ + host_dev->handle_error_wq = + alloc_ordered_workqueue("storvsc_error_wq_%d", + 0, + host->host_no); + if (!host_dev->handle_error_wq) { + ret = -ENOMEM; + goto err_out2; + } + INIT_WORK(&host_dev->host_scan_work, storvsc_host_scan); /* Register the HBA and start the scsi bus scan */ ret = scsi_add_host(host, &device->device); if (ret != 0) - goto err_out2; + goto err_out3; if (!dev_is_ide) { scsi_scan_host(host); @@ -1805,17 +2078,36 @@ static int storvsc_probe(struct hv_device *device, target = (device->dev_instance.b[5] << 8 | device->dev_instance.b[4]); ret = scsi_add_device(host, 0, target, 0); - if (ret) { - scsi_remove_host(host); - goto err_out2; + if (ret) + goto err_out4; + } +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + if (host->transportt == fc_transport_template) { + struct fc_rport_identifiers ids = { + .roles = FC_PORT_ROLE_FCP_DUMMY_INITIATOR, + }; + + fc_host_node_name(host) = stor_device->node_name; + fc_host_port_name(host) = stor_device->port_name; + stor_device->rport = fc_remote_port_add(host, 0, &ids); + if (!stor_device->rport) { + ret = -ENOMEM; + goto err_out4; } } +#endif return 0; +err_out4: + scsi_remove_host(host); + +err_out3: + destroy_workqueue(host_dev->handle_error_wq); + err_out2: /* * Once we have connected with the host, we would need to - * to invoke storvsc_dev_remove() to rollback this state and + * invoke storvsc_dev_remove() to rollback this state and * this call also frees up the stor_device; hence the jump around * err_out1 label. */ @@ -1823,6 +2115,7 @@ err_out2: goto err_out0; err_out1: + kfree(stor_device->stor_chns); kfree(stor_device); err_out0: @@ -1830,28 +2123,84 @@ err_out0: return ret; } -static int storvsc_remove(struct hv_device *dev) +/* Change a scsi target's queue depth */ +static int storvsc_change_queue_depth(struct scsi_device *sdev, int queue_depth) +{ + if (queue_depth > scsi_driver.can_queue) + queue_depth = scsi_driver.can_queue; + + return scsi_change_queue_depth(sdev, queue_depth); +} + +static void storvsc_remove(struct hv_device *dev) { struct storvsc_device *stor_device = hv_get_drvdata(dev); struct Scsi_Host *host = stor_device->host; + struct hv_host_device *host_dev = shost_priv(host); +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + if (host->transportt == fc_transport_template) { + fc_remote_port_delete(stor_device->rport); + fc_remove_host(host); + } +#endif + destroy_workqueue(host_dev->handle_error_wq); scsi_remove_host(host); storvsc_dev_remove(dev); scsi_host_put(host); +} + +static int storvsc_suspend(struct hv_device *hv_dev) +{ + struct storvsc_device *stor_device = hv_get_drvdata(hv_dev); + struct Scsi_Host *host = stor_device->host; + struct hv_host_device *host_dev = shost_priv(host); + + storvsc_wait_to_drain(stor_device); + + drain_workqueue(host_dev->handle_error_wq); + + vmbus_close(hv_dev->channel); + + kfree(stor_device->stor_chns); + stor_device->stor_chns = NULL; + + cpumask_clear(&stor_device->alloced_cpus); return 0; } +static int storvsc_resume(struct hv_device *hv_dev) +{ + int ret; + + ret = storvsc_connect_to_vsp(hv_dev, aligned_ringbuffer_size, + hv_dev_is_fc(hv_dev)); + return ret; +} + static struct hv_driver storvsc_drv = { .name = KBUILD_MODNAME, .id_table = id_table, .probe = storvsc_probe, .remove = storvsc_remove, + .suspend = storvsc_suspend, + .resume = storvsc_resume, + .driver = { + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, +}; + +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) +static struct fc_function_template fc_transport_functions = { + .show_host_node_name = 1, + .show_host_port_name = 1, }; +#endif static int __init storvsc_drv_init(void) { - u32 max_outstanding_req_per_channel; + int ret; /* * Divide the ring buffer data size (which is 1 page less @@ -1859,27 +2208,38 @@ static int __init storvsc_drv_init(void) * the ring buffer indices) by the max request size (which is * vmbus_channel_packet_multipage_buffer + struct vstor_packet + u64) */ + aligned_ringbuffer_size = VMBUS_RING_SIZE(storvsc_ringbuffer_size); max_outstanding_req_per_channel = - ((storvsc_ringbuffer_size - PAGE_SIZE) / + ((aligned_ringbuffer_size - PAGE_SIZE) / ALIGN(MAX_MULTIPAGE_BUFFER_PACKET + - sizeof(struct vstor_packet) + sizeof(u64) - - vmscsi_size_delta, + sizeof(struct vstor_packet) + sizeof(u64), sizeof(u64))); - if (max_outstanding_req_per_channel < - STORVSC_MAX_IO_REQUESTS) - return -EINVAL; +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + fc_transport_template = fc_attach_transport(&fc_transport_functions); + if (!fc_transport_template) + return -ENODEV; +#endif - return vmbus_driver_register(&storvsc_drv); + ret = vmbus_driver_register(&storvsc_drv); + +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + if (ret) + fc_release_transport(fc_transport_template); +#endif + + return ret; } static void __exit storvsc_drv_exit(void) { vmbus_driver_unregister(&storvsc_drv); +#if IS_ENABLED(CONFIG_SCSI_FC_ATTRS) + fc_release_transport(fc_transport_template); +#endif } MODULE_LICENSE("GPL"); -MODULE_VERSION(HV_DRV_VERSION); MODULE_DESCRIPTION("Microsoft Hyper-V virtual storage driver"); module_init(storvsc_drv_init); module_exit(storvsc_drv_exit); |
