Diffstat (limited to 'fs/cifs/smbdirect.c')
| -rw-r--r-- | fs/cifs/smbdirect.c | 2651 |
1 file changed, 0 insertions, 2651 deletions
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c deleted file mode 100644 index a568dac7b3a1..000000000000 --- a/fs/cifs/smbdirect.c +++ /dev/null @@ -1,2651 +0,0 @@ -/* - * Copyright (C) 2017, Microsoft Corporation. - * - * Author(s): Long Li <longli@microsoft.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. - */ -#include <linux/module.h> -#include <linux/highmem.h> -#include "smbdirect.h" -#include "cifs_debug.h" -#include "cifsproto.h" -#include "smb2proto.h" - -static struct smbd_response *get_empty_queue_buffer( - struct smbd_connection *info); -static struct smbd_response *get_receive_buffer( - struct smbd_connection *info); -static void put_receive_buffer( - struct smbd_connection *info, - struct smbd_response *response); -static int allocate_receive_buffers(struct smbd_connection *info, int num_buf); -static void destroy_receive_buffers(struct smbd_connection *info); - -static void put_empty_packet( - struct smbd_connection *info, struct smbd_response *response); -static void enqueue_reassembly( - struct smbd_connection *info, - struct smbd_response *response, int data_length); -static struct smbd_response *_get_first_reassembly( - struct smbd_connection *info); - -static int smbd_post_recv( - struct smbd_connection *info, - struct smbd_response *response); - -static int smbd_post_send_empty(struct smbd_connection *info); -static int smbd_post_send_data( - struct smbd_connection *info, - struct kvec *iov, int n_vec, int remaining_data_length); -static int smbd_post_send_page(struct smbd_connection *info, - struct page *page, unsigned long offset, - size_t size, int remaining_data_length); - -static void destroy_mr_list(struct smbd_connection *info); -static int allocate_mr_list(struct smbd_connection *info); - -/* SMBD version number */ -#define SMBD_V1 0x0100 - -/* Port numbers for SMBD transport */ -#define SMB_PORT 445 -#define SMBD_PORT 5445 - -/* Address lookup and resolve timeout in ms */ -#define RDMA_RESOLVE_TIMEOUT 5000 - -/* SMBD negotiation timeout in seconds */ -#define SMBD_NEGOTIATE_TIMEOUT 120 - -/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */ -#define SMBD_MIN_RECEIVE_SIZE 128 -#define SMBD_MIN_FRAGMENTED_SIZE 131072 - -/* - * Default maximum number of RDMA read/write outstanding on this connection - * This value is possibly decreased during QP creation on hardware limit - */ -#define SMBD_CM_RESPONDER_RESOURCES 32 - -/* Maximum number of retries on data transfer operations */ -#define SMBD_CM_RETRY 6 -/* No need to retry on Receiver Not Ready since SMBD manages credits */ -#define SMBD_CM_RNR_RETRY 0 - -/* - * User configurable initial values per SMBD transport connection - * as defined in [MS-SMBD] 3.1.1.1 - * Those may change after a SMBD negotiation - */ -/* The local peer's maximum number of credits to grant to the peer */ -int smbd_receive_credit_max = 255; - -/* The remote peer's credit request of local peer */ -int smbd_send_credit_target = 255; - -/* The maximum single message size can be sent to remote peer */ -int smbd_max_send_size = 1364; - -/* The maximum 
fragmented upper-layer payload receive size supported */ -int smbd_max_fragmented_recv_size = 1024 * 1024; - -/* The maximum single-message size which can be received */ -int smbd_max_receive_size = 8192; - -/* The timeout to initiate send of a keepalive message on idle */ -int smbd_keep_alive_interval = 120; - -/* - * User configurable initial values for RDMA transport - * The actual values used may be lower and are limited to hardware capabilities - */ -/* Default maximum number of SGEs in a RDMA write/read */ -int smbd_max_frmr_depth = 2048; - -/* If payload is less than this byte, use RDMA send/recv not read/write */ -int rdma_readwrite_threshold = 4096; - -/* Transport logging functions - * Logging are defined as classes. They can be OR'ed to define the actual - * logging level via module parameter smbd_logging_class - * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and - * log_rdma_event() - */ -#define LOG_OUTGOING 0x1 -#define LOG_INCOMING 0x2 -#define LOG_READ 0x4 -#define LOG_WRITE 0x8 -#define LOG_RDMA_SEND 0x10 -#define LOG_RDMA_RECV 0x20 -#define LOG_KEEP_ALIVE 0x40 -#define LOG_RDMA_EVENT 0x80 -#define LOG_RDMA_MR 0x100 -static unsigned int smbd_logging_class; -module_param(smbd_logging_class, uint, 0644); -MODULE_PARM_DESC(smbd_logging_class, - "Logging class for SMBD transport 0x0 to 0x100"); - -#define ERR 0x0 -#define INFO 0x1 -static unsigned int smbd_logging_level = ERR; -module_param(smbd_logging_level, uint, 0644); -MODULE_PARM_DESC(smbd_logging_level, - "Logging level for SMBD transport, 0 (default): error, 1: info"); - -#define log_rdma(level, class, fmt, args...) \ -do { \ - if (level <= smbd_logging_level || class & smbd_logging_class) \ - cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\ -} while (0) - -#define log_outgoing(level, fmt, args...) \ - log_rdma(level, LOG_OUTGOING, fmt, ##args) -#define log_incoming(level, fmt, args...) \ - log_rdma(level, LOG_INCOMING, fmt, ##args) -#define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args) -#define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args) -#define log_rdma_send(level, fmt, args...) \ - log_rdma(level, LOG_RDMA_SEND, fmt, ##args) -#define log_rdma_recv(level, fmt, args...) \ - log_rdma(level, LOG_RDMA_RECV, fmt, ##args) -#define log_keep_alive(level, fmt, args...) \ - log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args) -#define log_rdma_event(level, fmt, args...) \ - log_rdma(level, LOG_RDMA_EVENT, fmt, ##args) -#define log_rdma_mr(level, fmt, args...) 
\ - log_rdma(level, LOG_RDMA_MR, fmt, ##args) - -/* - * Destroy the transport and related RDMA and memory resources - * Need to go through all the pending counters and make sure on one is using - * the transport while it is destroyed - */ -static void smbd_destroy_rdma_work(struct work_struct *work) -{ - struct smbd_response *response; - struct smbd_connection *info = - container_of(work, struct smbd_connection, destroy_work); - unsigned long flags; - - log_rdma_event(INFO, "destroying qp\n"); - ib_drain_qp(info->id->qp); - rdma_destroy_qp(info->id); - - /* Unblock all I/O waiting on the send queue */ - wake_up_interruptible_all(&info->wait_send_queue); - - log_rdma_event(INFO, "cancelling idle timer\n"); - cancel_delayed_work_sync(&info->idle_timer_work); - log_rdma_event(INFO, "cancelling send immediate work\n"); - cancel_delayed_work_sync(&info->send_immediate_work); - - log_rdma_event(INFO, "wait for all send to finish\n"); - wait_event(info->wait_smbd_send_pending, - info->smbd_send_pending == 0); - - log_rdma_event(INFO, "wait for all recv to finish\n"); - wake_up_interruptible(&info->wait_reassembly_queue); - wait_event(info->wait_smbd_recv_pending, - info->smbd_recv_pending == 0); - - log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); - wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0); - wait_event(info->wait_send_payload_pending, - atomic_read(&info->send_payload_pending) == 0); - - log_rdma_event(INFO, "freeing mr list\n"); - wake_up_interruptible_all(&info->wait_mr); - wait_event(info->wait_for_mr_cleanup, - atomic_read(&info->mr_used_count) == 0); - destroy_mr_list(info); - - /* It's not posssible for upper layer to get to reassembly */ - log_rdma_event(INFO, "drain the reassembly queue\n"); - do { - spin_lock_irqsave(&info->reassembly_queue_lock, flags); - response = _get_first_reassembly(info); - if (response) { - list_del(&response->list); - spin_unlock_irqrestore( - &info->reassembly_queue_lock, flags); - put_receive_buffer(info, response); - } else - spin_unlock_irqrestore(&info->reassembly_queue_lock, flags); - } while (response); - - info->reassembly_data_length = 0; - - log_rdma_event(INFO, "free receive buffers\n"); - wait_event(info->wait_receive_queues, - info->count_receive_queue + info->count_empty_packet_queue - == info->receive_credit_max); - destroy_receive_buffers(info); - - ib_free_cq(info->send_cq); - ib_free_cq(info->recv_cq); - ib_dealloc_pd(info->pd); - rdma_destroy_id(info->id); - - /* free mempools */ - mempool_destroy(info->request_mempool); - kmem_cache_destroy(info->request_cache); - - mempool_destroy(info->response_mempool); - kmem_cache_destroy(info->response_cache); - - info->transport_status = SMBD_DESTROYED; - wake_up_all(&info->wait_destroy); -} - -static int smbd_process_disconnected(struct smbd_connection *info) -{ - schedule_work(&info->destroy_work); - return 0; -} - -static void smbd_disconnect_rdma_work(struct work_struct *work) -{ - struct smbd_connection *info = - container_of(work, struct smbd_connection, disconnect_work); - - if (info->transport_status == SMBD_CONNECTED) { - info->transport_status = SMBD_DISCONNECTING; - rdma_disconnect(info->id); - } -} - -static void smbd_disconnect_rdma_connection(struct smbd_connection *info) -{ - queue_work(info->workqueue, &info->disconnect_work); -} - -/* Upcall from RDMA CM */ -static int smbd_conn_upcall( - struct rdma_cm_id *id, struct rdma_cm_event *event) -{ - struct smbd_connection *info = id->context; - - log_rdma_event(INFO, "event=%d 
status=%d\n", - event->event, event->status); - - switch (event->event) { - case RDMA_CM_EVENT_ADDR_RESOLVED: - case RDMA_CM_EVENT_ROUTE_RESOLVED: - info->ri_rc = 0; - complete(&info->ri_done); - break; - - case RDMA_CM_EVENT_ADDR_ERROR: - info->ri_rc = -EHOSTUNREACH; - complete(&info->ri_done); - break; - - case RDMA_CM_EVENT_ROUTE_ERROR: - info->ri_rc = -ENETUNREACH; - complete(&info->ri_done); - break; - - case RDMA_CM_EVENT_ESTABLISHED: - log_rdma_event(INFO, "connected event=%d\n", event->event); - info->transport_status = SMBD_CONNECTED; - wake_up_interruptible(&info->conn_wait); - break; - - case RDMA_CM_EVENT_CONNECT_ERROR: - case RDMA_CM_EVENT_UNREACHABLE: - case RDMA_CM_EVENT_REJECTED: - log_rdma_event(INFO, "connecting failed event=%d\n", event->event); - info->transport_status = SMBD_DISCONNECTED; - wake_up_interruptible(&info->conn_wait); - break; - - case RDMA_CM_EVENT_DEVICE_REMOVAL: - case RDMA_CM_EVENT_DISCONNECTED: - /* This happenes when we fail the negotiation */ - if (info->transport_status == SMBD_NEGOTIATE_FAILED) { - info->transport_status = SMBD_DISCONNECTED; - wake_up(&info->conn_wait); - break; - } - - info->transport_status = SMBD_DISCONNECTED; - smbd_process_disconnected(info); - break; - - default: - break; - } - - return 0; -} - -/* Upcall from RDMA QP */ -static void -smbd_qp_async_error_upcall(struct ib_event *event, void *context) -{ - struct smbd_connection *info = context; - - log_rdma_event(ERR, "%s on device %s info %p\n", - ib_event_msg(event->event), event->device->name, info); - - switch (event->event) { - case IB_EVENT_CQ_ERR: - case IB_EVENT_QP_FATAL: - smbd_disconnect_rdma_connection(info); - - default: - break; - } -} - -static inline void *smbd_request_payload(struct smbd_request *request) -{ - return (void *)request->packet; -} - -static inline void *smbd_response_payload(struct smbd_response *response) -{ - return (void *)response->packet; -} - -/* Called when a RDMA send is done */ -static void send_done(struct ib_cq *cq, struct ib_wc *wc) -{ - int i; - struct smbd_request *request = - container_of(wc->wr_cqe, struct smbd_request, cqe); - - log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n", - request, wc->status); - - if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { - log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n", - wc->status, wc->opcode); - smbd_disconnect_rdma_connection(request->info); - } - - for (i = 0; i < request->num_sge; i++) - ib_dma_unmap_single(request->info->id->device, - request->sge[i].addr, - request->sge[i].length, - DMA_TO_DEVICE); - - if (request->has_payload) { - if (atomic_dec_and_test(&request->info->send_payload_pending)) - wake_up(&request->info->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&request->info->send_pending)) - wake_up(&request->info->wait_send_pending); - } - - mempool_free(request, request->info->request_mempool); -} - -static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp) -{ - log_rdma_event(INFO, "resp message min_version %u max_version %u " - "negotiated_version %u credits_requested %u " - "credits_granted %u status %u max_readwrite_size %u " - "preferred_send_size %u max_receive_size %u " - "max_fragmented_size %u\n", - resp->min_version, resp->max_version, resp->negotiated_version, - resp->credits_requested, resp->credits_granted, resp->status, - resp->max_readwrite_size, resp->preferred_send_size, - resp->max_receive_size, resp->max_fragmented_size); -} - -/* - * Process a negotiation response message, according to 
[MS-SMBD]3.1.5.7 - * response, packet_length: the negotiation response message - * return value: true if negotiation is a success, false if failed - */ -static bool process_negotiation_response( - struct smbd_response *response, int packet_length) -{ - struct smbd_connection *info = response->info; - struct smbd_negotiate_resp *packet = smbd_response_payload(response); - - if (packet_length < sizeof(struct smbd_negotiate_resp)) { - log_rdma_event(ERR, - "error: packet_length=%d\n", packet_length); - return false; - } - - if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) { - log_rdma_event(ERR, "error: negotiated_version=%x\n", - le16_to_cpu(packet->negotiated_version)); - return false; - } - info->protocol = le16_to_cpu(packet->negotiated_version); - - if (packet->credits_requested == 0) { - log_rdma_event(ERR, "error: credits_requested==0\n"); - return false; - } - info->receive_credit_target = le16_to_cpu(packet->credits_requested); - - if (packet->credits_granted == 0) { - log_rdma_event(ERR, "error: credits_granted==0\n"); - return false; - } - atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted)); - - atomic_set(&info->receive_credits, 0); - - if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) { - log_rdma_event(ERR, "error: preferred_send_size=%d\n", - le32_to_cpu(packet->preferred_send_size)); - return false; - } - info->max_receive_size = le32_to_cpu(packet->preferred_send_size); - - if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) { - log_rdma_event(ERR, "error: max_receive_size=%d\n", - le32_to_cpu(packet->max_receive_size)); - return false; - } - info->max_send_size = min_t(int, info->max_send_size, - le32_to_cpu(packet->max_receive_size)); - - if (le32_to_cpu(packet->max_fragmented_size) < - SMBD_MIN_FRAGMENTED_SIZE) { - log_rdma_event(ERR, "error: max_fragmented_size=%d\n", - le32_to_cpu(packet->max_fragmented_size)); - return false; - } - info->max_fragmented_send_size = - le32_to_cpu(packet->max_fragmented_size); - info->rdma_readwrite_threshold = - rdma_readwrite_threshold > info->max_fragmented_send_size ? 
- info->max_fragmented_send_size : - rdma_readwrite_threshold; - - - info->max_readwrite_size = min_t(u32, - le32_to_cpu(packet->max_readwrite_size), - info->max_frmr_depth * PAGE_SIZE); - info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE; - - return true; -} - -/* - * Check and schedule to send an immediate packet - * This is used to extend credtis to remote peer to keep the transport busy - */ -static void check_and_send_immediate(struct smbd_connection *info) -{ - if (info->transport_status != SMBD_CONNECTED) - return; - - info->send_immediate = true; - - /* - * Promptly send a packet if our peer is running low on receive - * credits - */ - if (atomic_read(&info->receive_credits) < - info->receive_credit_target - 1) - queue_delayed_work( - info->workqueue, &info->send_immediate_work, 0); -} - -static void smbd_post_send_credits(struct work_struct *work) -{ - int ret = 0; - int use_receive_queue = 1; - int rc; - struct smbd_response *response; - struct smbd_connection *info = - container_of(work, struct smbd_connection, - post_send_credits_work); - - if (info->transport_status != SMBD_CONNECTED) { - wake_up(&info->wait_receive_queues); - return; - } - - if (info->receive_credit_target > - atomic_read(&info->receive_credits)) { - while (true) { - if (use_receive_queue) - response = get_receive_buffer(info); - else - response = get_empty_queue_buffer(info); - if (!response) { - /* now switch to emtpy packet queue */ - if (use_receive_queue) { - use_receive_queue = 0; - continue; - } else - break; - } - - response->type = SMBD_TRANSFER_DATA; - response->first_segment = false; - rc = smbd_post_recv(info, response); - if (rc) { - log_rdma_recv(ERR, - "post_recv failed rc=%d\n", rc); - put_receive_buffer(info, response); - break; - } - - ret++; - } - } - - spin_lock(&info->lock_new_credits_offered); - info->new_credits_offered += ret; - spin_unlock(&info->lock_new_credits_offered); - - atomic_add(ret, &info->receive_credits); - - /* Check if we can post new receive and grant credits to peer */ - check_and_send_immediate(info); -} - -static void smbd_recv_done_work(struct work_struct *work) -{ - struct smbd_connection *info = - container_of(work, struct smbd_connection, recv_done_work); - - /* - * We may have new send credits granted from remote peer - * If any sender is blcoked on lack of credets, unblock it - */ - if (atomic_read(&info->send_credits)) - wake_up_interruptible(&info->wait_send_queue); - - /* - * Check if we need to send something to remote peer to - * grant more credits or respond to KEEP_ALIVE packet - */ - check_and_send_immediate(info); -} - -/* Called from softirq, when recv is done */ -static void recv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbd_data_transfer *data_transfer; - struct smbd_response *response = - container_of(wc->wr_cqe, struct smbd_response, cqe); - struct smbd_connection *info = response->info; - int data_length = 0; - - log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d " - "byte_len=%d pkey_index=%x\n", - response, response->type, wc->status, wc->opcode, - wc->byte_len, wc->pkey_index); - - if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", - wc->status, wc->opcode); - smbd_disconnect_rdma_connection(info); - goto error; - } - - ib_dma_sync_single_for_cpu( - wc->qp->device, - response->sge.addr, - response->sge.length, - DMA_FROM_DEVICE); - - switch (response->type) { - /* SMBD negotiation response */ - case SMBD_NEGOTIATE_RESP: - 
dump_smbd_negotiate_resp(smbd_response_payload(response)); - info->full_packet_received = true; - info->negotiate_done = - process_negotiation_response(response, wc->byte_len); - complete(&info->negotiate_completion); - break; - - /* SMBD data transfer packet */ - case SMBD_TRANSFER_DATA: - data_transfer = smbd_response_payload(response); - data_length = le32_to_cpu(data_transfer->data_length); - - /* - * If this is a packet with data playload place the data in - * reassembly queue and wake up the reading thread - */ - if (data_length) { - if (info->full_packet_received) - response->first_segment = true; - - if (le32_to_cpu(data_transfer->remaining_data_length)) - info->full_packet_received = false; - else - info->full_packet_received = true; - - enqueue_reassembly( - info, - response, - data_length); - } else - put_empty_packet(info, response); - - if (data_length) - wake_up_interruptible(&info->wait_reassembly_queue); - - atomic_dec(&info->receive_credits); - info->receive_credit_target = - le16_to_cpu(data_transfer->credits_requested); - atomic_add(le16_to_cpu(data_transfer->credits_granted), - &info->send_credits); - - log_incoming(INFO, "data flags %d data_offset %d " - "data_length %d remaining_data_length %d\n", - le16_to_cpu(data_transfer->flags), - le32_to_cpu(data_transfer->data_offset), - le32_to_cpu(data_transfer->data_length), - le32_to_cpu(data_transfer->remaining_data_length)); - - /* Send a KEEP_ALIVE response right away if requested */ - info->keep_alive_requested = KEEP_ALIVE_NONE; - if (le16_to_cpu(data_transfer->flags) & - SMB_DIRECT_RESPONSE_REQUESTED) { - info->keep_alive_requested = KEEP_ALIVE_PENDING; - } - - queue_work(info->workqueue, &info->recv_done_work); - return; - - default: - log_rdma_recv(ERR, - "unexpected response type=%d\n", response->type); - } - -error: - put_receive_buffer(info, response); -} - -static struct rdma_cm_id *smbd_create_id( - struct smbd_connection *info, - struct sockaddr *dstaddr, int port) -{ - struct rdma_cm_id *id; - int rc; - __be16 *sport; - - id = rdma_create_id(&init_net, smbd_conn_upcall, info, - RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(id)) { - rc = PTR_ERR(id); - log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc); - return id; - } - - if (dstaddr->sa_family == AF_INET6) - sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port; - else - sport = &((struct sockaddr_in *)dstaddr)->sin_port; - - *sport = htons(port); - - init_completion(&info->ri_done); - info->ri_rc = -ETIMEDOUT; - - rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, - RDMA_RESOLVE_TIMEOUT); - if (rc) { - log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); - goto out; - } - wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); - rc = info->ri_rc; - if (rc) { - log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); - goto out; - } - - info->ri_rc = -ETIMEDOUT; - rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); - if (rc) { - log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); - goto out; - } - wait_for_completion_interruptible_timeout( - &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); - rc = info->ri_rc; - if (rc) { - log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); - goto out; - } - - return id; - -out: - rdma_destroy_id(id); - return ERR_PTR(rc); -} - -/* - * Test if FRWR (Fast Registration Work Requests) is supported on the device - * This implementation requries FRWR on RDMA read/write - * return value: true if it is supported - */ -static bool 
frwr_is_supported(struct ib_device_attr *attrs) -{ - if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) - return false; - if (attrs->max_fast_reg_page_list_len == 0) - return false; - return true; -} - -static int smbd_ia_open( - struct smbd_connection *info, - struct sockaddr *dstaddr, int port) -{ - int rc; - - info->id = smbd_create_id(info, dstaddr, port); - if (IS_ERR(info->id)) { - rc = PTR_ERR(info->id); - goto out1; - } - - if (!frwr_is_supported(&info->id->device->attrs)) { - log_rdma_event(ERR, - "Fast Registration Work Requests " - "(FRWR) is not supported\n"); - log_rdma_event(ERR, - "Device capability flags = %llx " - "max_fast_reg_page_list_len = %u\n", - info->id->device->attrs.device_cap_flags, - info->id->device->attrs.max_fast_reg_page_list_len); - rc = -EPROTONOSUPPORT; - goto out2; - } - info->max_frmr_depth = min_t(int, - smbd_max_frmr_depth, - info->id->device->attrs.max_fast_reg_page_list_len); - info->mr_type = IB_MR_TYPE_MEM_REG; - if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) - info->mr_type = IB_MR_TYPE_SG_GAPS; - - info->pd = ib_alloc_pd(info->id->device, 0); - if (IS_ERR(info->pd)) { - rc = PTR_ERR(info->pd); - log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); - goto out2; - } - - return 0; - -out2: - rdma_destroy_id(info->id); - info->id = NULL; - -out1: - return rc; -} - -/* - * Send a negotiation request message to the peer - * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3 - * After negotiation, the transport is connected and ready for - * carrying upper layer SMB payload - */ -static int smbd_post_send_negotiate_req(struct smbd_connection *info) -{ - struct ib_send_wr send_wr; - int rc = -ENOMEM; - struct smbd_request *request; - struct smbd_negotiate_req *packet; - - request = mempool_alloc(info->request_mempool, GFP_KERNEL); - if (!request) - return rc; - - request->info = info; - - packet = smbd_request_payload(request); - packet->min_version = cpu_to_le16(SMBD_V1); - packet->max_version = cpu_to_le16(SMBD_V1); - packet->reserved = 0; - packet->credits_requested = cpu_to_le16(info->send_credit_target); - packet->preferred_send_size = cpu_to_le32(info->max_send_size); - packet->max_receive_size = cpu_to_le32(info->max_receive_size); - packet->max_fragmented_size = - cpu_to_le32(info->max_fragmented_recv_size); - - request->num_sge = 1; - request->sge[0].addr = ib_dma_map_single( - info->id->device, (void *)packet, - sizeof(*packet), DMA_TO_DEVICE); - if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { - rc = -EIO; - goto dma_mapping_failed; - } - - request->sge[0].length = sizeof(*packet); - request->sge[0].lkey = info->pd->local_dma_lkey; - - ib_dma_sync_single_for_device( - info->id->device, request->sge[0].addr, - request->sge[0].length, DMA_TO_DEVICE); - - request->cqe.done = send_done; - - send_wr.next = NULL; - send_wr.wr_cqe = &request->cqe; - send_wr.sg_list = request->sge; - send_wr.num_sge = request->num_sge; - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - - log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n", - request->sge[0].addr, - request->sge[0].length, request->sge[0].lkey); - - request->has_payload = false; - atomic_inc(&info->send_pending); - rc = ib_post_send(info->id->qp, &send_wr, NULL); - if (!rc) - return 0; - - /* if we reach here, post send failed */ - log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - atomic_dec(&info->send_pending); - ib_dma_unmap_single(info->id->device, request->sge[0].addr, - request->sge[0].length, 
DMA_TO_DEVICE); - - smbd_disconnect_rdma_connection(info); - -dma_mapping_failed: - mempool_free(request, info->request_mempool); - return rc; -} - -/* - * Extend the credits to remote peer - * This implements [MS-SMBD] 3.1.5.9 - * The idea is that we should extend credits to remote peer as quickly as - * it's allowed, to maintain data flow. We allocate as much receive - * buffer as possible, and extend the receive credits to remote peer - * return value: the new credtis being granted. - */ -static int manage_credits_prior_sending(struct smbd_connection *info) -{ - int new_credits; - - spin_lock(&info->lock_new_credits_offered); - new_credits = info->new_credits_offered; - info->new_credits_offered = 0; - spin_unlock(&info->lock_new_credits_offered); - - return new_credits; -} - -/* - * Check if we need to send a KEEP_ALIVE message - * The idle connection timer triggers a KEEP_ALIVE message when expires - * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send - * back a response. - * return value: - * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set - * 0: otherwise - */ -static int manage_keep_alive_before_sending(struct smbd_connection *info) -{ - if (info->keep_alive_requested == KEEP_ALIVE_PENDING) { - info->keep_alive_requested = KEEP_ALIVE_SENT; - return 1; - } - return 0; -} - -/* - * Build and prepare the SMBD packet header - * This function waits for avaialbe send credits and build a SMBD packet - * header. The caller then optional append payload to the packet after - * the header - * intput values - * size: the size of the payload - * remaining_data_length: remaining data to send if this is part of a - * fragmented packet - * output values - * request_out: the request allocated from this function - * return values: 0 on success, otherwise actual error code returned - */ -static int smbd_create_header(struct smbd_connection *info, - int size, int remaining_data_length, - struct smbd_request **request_out) -{ - struct smbd_request *request; - struct smbd_data_transfer *packet; - int header_length; - int rc; - - /* Wait for send credits. 
A SMBD packet needs one credit */ - rc = wait_event_interruptible(info->wait_send_queue, - atomic_read(&info->send_credits) > 0 || - info->transport_status != SMBD_CONNECTED); - if (rc) - return rc; - - if (info->transport_status != SMBD_CONNECTED) { - log_outgoing(ERR, "disconnected not sending\n"); - return -ENOENT; - } - atomic_dec(&info->send_credits); - - request = mempool_alloc(info->request_mempool, GFP_KERNEL); - if (!request) { - rc = -ENOMEM; - goto err; - } - - request->info = info; - - /* Fill in the packet header */ - packet = smbd_request_payload(request); - packet->credits_requested = cpu_to_le16(info->send_credit_target); - packet->credits_granted = - cpu_to_le16(manage_credits_prior_sending(info)); - info->send_immediate = false; - - packet->flags = 0; - if (manage_keep_alive_before_sending(info)) - packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED); - - packet->reserved = 0; - if (!size) - packet->data_offset = 0; - else - packet->data_offset = cpu_to_le32(24); - packet->data_length = cpu_to_le32(size); - packet->remaining_data_length = cpu_to_le32(remaining_data_length); - packet->padding = 0; - - log_outgoing(INFO, "credits_requested=%d credits_granted=%d " - "data_offset=%d data_length=%d remaining_data_length=%d\n", - le16_to_cpu(packet->credits_requested), - le16_to_cpu(packet->credits_granted), - le32_to_cpu(packet->data_offset), - le32_to_cpu(packet->data_length), - le32_to_cpu(packet->remaining_data_length)); - - /* Map the packet to DMA */ - header_length = sizeof(struct smbd_data_transfer); - /* If this is a packet without payload, don't send padding */ - if (!size) - header_length = offsetof(struct smbd_data_transfer, padding); - - request->num_sge = 1; - request->sge[0].addr = ib_dma_map_single(info->id->device, - (void *)packet, - header_length, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { - mempool_free(request, info->request_mempool); - rc = -EIO; - goto err; - } - - request->sge[0].length = header_length; - request->sge[0].lkey = info->pd->local_dma_lkey; - - *request_out = request; - return 0; - -err: - atomic_inc(&info->send_credits); - return rc; -} - -static void smbd_destroy_header(struct smbd_connection *info, - struct smbd_request *request) -{ - - ib_dma_unmap_single(info->id->device, - request->sge[0].addr, - request->sge[0].length, - DMA_TO_DEVICE); - mempool_free(request, info->request_mempool); - atomic_inc(&info->send_credits); -} - -/* Post the send request */ -static int smbd_post_send(struct smbd_connection *info, - struct smbd_request *request, bool has_payload) -{ - struct ib_send_wr send_wr; - int rc, i; - - for (i = 0; i < request->num_sge; i++) { - log_rdma_send(INFO, - "rdma_request sge[%d] addr=%llu length=%u\n", - i, request->sge[i].addr, request->sge[i].length); - ib_dma_sync_single_for_device( - info->id->device, - request->sge[i].addr, - request->sge[i].length, - DMA_TO_DEVICE); - } - - request->cqe.done = send_done; - - send_wr.next = NULL; - send_wr.wr_cqe = &request->cqe; - send_wr.sg_list = request->sge; - send_wr.num_sge = request->num_sge; - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - - if (has_payload) { - request->has_payload = true; - atomic_inc(&info->send_payload_pending); - } else { - request->has_payload = false; - atomic_inc(&info->send_pending); - } - - rc = ib_post_send(info->id->qp, &send_wr, NULL); - if (rc) { - log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); - if (has_payload) { - if 
(atomic_dec_and_test(&info->send_payload_pending)) - wake_up(&info->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&info->send_pending)) - wake_up(&info->wait_send_pending); - } - smbd_disconnect_rdma_connection(info); - } else - /* Reset timer for idle connection after packet is sent */ - mod_delayed_work(info->workqueue, &info->idle_timer_work, - info->keep_alive_interval*HZ); - - return rc; -} - -static int smbd_post_send_sgl(struct smbd_connection *info, - struct scatterlist *sgl, int data_length, int remaining_data_length) -{ - int num_sgs; - int i, rc; - struct smbd_request *request; - struct scatterlist *sg; - - rc = smbd_create_header( - info, data_length, remaining_data_length, &request); - if (rc) - return rc; - - num_sgs = sgl ? sg_nents(sgl) : 0; - for_each_sg(sgl, sg, num_sgs, i) { - request->sge[i+1].addr = - ib_dma_map_page(info->id->device, sg_page(sg), - sg->offset, sg->length, DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error( - info->id->device, request->sge[i+1].addr)) { - rc = -EIO; - request->sge[i+1].addr = 0; - goto dma_mapping_failure; - } - request->sge[i+1].length = sg->length; - request->sge[i+1].lkey = info->pd->local_dma_lkey; - request->num_sge++; - } - - rc = smbd_post_send(info, request, data_length); - if (!rc) - return 0; - -dma_mapping_failure: - for (i = 1; i < request->num_sge; i++) - if (request->sge[i].addr) - ib_dma_unmap_single(info->id->device, - request->sge[i].addr, - request->sge[i].length, - DMA_TO_DEVICE); - smbd_destroy_header(info, request); - return rc; -} - -/* - * Send a page - * page: the page to send - * offset: offset in the page to send - * size: length in the page to send - * remaining_data_length: remaining data to send in this payload - */ -static int smbd_post_send_page(struct smbd_connection *info, struct page *page, - unsigned long offset, size_t size, int remaining_data_length) -{ - struct scatterlist sgl; - - sg_init_table(&sgl, 1); - sg_set_page(&sgl, page, size, offset); - - return smbd_post_send_sgl(info, &sgl, size, remaining_data_length); -} - -/* - * Send an empty message - * Empty message is used to extend credits to peer to for keep live - * while there is no upper layer payload to send at the time - */ -static int smbd_post_send_empty(struct smbd_connection *info) -{ - info->count_send_empty++; - return smbd_post_send_sgl(info, NULL, 0, 0); -} - -/* - * Send a data buffer - * iov: the iov array describing the data buffers - * n_vec: number of iov array - * remaining_data_length: remaining data to send following this packet - * in segmented SMBD packet - */ -static int smbd_post_send_data( - struct smbd_connection *info, struct kvec *iov, int n_vec, - int remaining_data_length) -{ - int i; - u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SGE]; - - if (n_vec > SMBDIRECT_MAX_SGE) { - cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); - return -ENOMEM; - } - - sg_init_table(sgl, n_vec); - for (i = 0; i < n_vec; i++) { - data_length += iov[i].iov_len; - sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len); - } - - return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length); -} - -/* - * Post a receive request to the transport - * The remote peer can only send data when a receive request is posted - * The interaction is controlled by send/receive credit system - */ -static int smbd_post_recv( - struct smbd_connection *info, struct smbd_response *response) -{ - struct ib_recv_wr recv_wr; - int rc = -EIO; - - response->sge.addr = ib_dma_map_single( - info->id->device, 
response->packet, - info->max_receive_size, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(info->id->device, response->sge.addr)) - return rc; - - response->sge.length = info->max_receive_size; - response->sge.lkey = info->pd->local_dma_lkey; - - response->cqe.done = recv_done; - - recv_wr.wr_cqe = &response->cqe; - recv_wr.next = NULL; - recv_wr.sg_list = &response->sge; - recv_wr.num_sge = 1; - - rc = ib_post_recv(info->id->qp, &recv_wr, NULL); - if (rc) { - ib_dma_unmap_single(info->id->device, response->sge.addr, - response->sge.length, DMA_FROM_DEVICE); - smbd_disconnect_rdma_connection(info); - log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); - } - - return rc; -} - -/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */ -static int smbd_negotiate(struct smbd_connection *info) -{ - int rc; - struct smbd_response *response = get_receive_buffer(info); - - response->type = SMBD_NEGOTIATE_RESP; - rc = smbd_post_recv(info, response); - log_rdma_event(INFO, - "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x " - "iov.lkey=%x\n", - rc, response->sge.addr, - response->sge.length, response->sge.lkey); - if (rc) - return rc; - - init_completion(&info->negotiate_completion); - info->negotiate_done = false; - rc = smbd_post_send_negotiate_req(info); - if (rc) - return rc; - - rc = wait_for_completion_interruptible_timeout( - &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ); - log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc); - - if (info->negotiate_done) - return 0; - - if (rc == 0) - rc = -ETIMEDOUT; - else if (rc == -ERESTARTSYS) - rc = -EINTR; - else - rc = -ENOTCONN; - - return rc; -} - -static void put_empty_packet( - struct smbd_connection *info, struct smbd_response *response) -{ - spin_lock(&info->empty_packet_queue_lock); - list_add_tail(&response->list, &info->empty_packet_queue); - info->count_empty_packet_queue++; - spin_unlock(&info->empty_packet_queue_lock); - - queue_work(info->workqueue, &info->post_send_credits_work); -} - -/* - * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1 - * This is a queue for reassembling upper layer payload and present to upper - * layer. All the inncoming payload go to the reassembly queue, regardless of - * if reassembly is required. The uuper layer code reads from the queue for all - * incoming payloads. - * Put a received packet to the reassembly queue - * response: the packet received - * data_length: the size of payload in this packet - */ -static void enqueue_reassembly( - struct smbd_connection *info, - struct smbd_response *response, - int data_length) -{ - spin_lock(&info->reassembly_queue_lock); - list_add_tail(&response->list, &info->reassembly_queue); - info->reassembly_queue_length++; - /* - * Make sure reassembly_data_length is updated after list and - * reassembly_queue_length are updated. 
On the dequeue side - * reassembly_data_length is checked without a lock to determine - * if reassembly_queue_length and list is up to date - */ - virt_wmb(); - info->reassembly_data_length += data_length; - spin_unlock(&info->reassembly_queue_lock); - info->count_reassembly_queue++; - info->count_enqueue_reassembly_queue++; -} - -/* - * Get the first entry at the front of reassembly queue - * Caller is responsible for locking - * return value: the first entry if any, NULL if queue is empty - */ -static struct smbd_response *_get_first_reassembly(struct smbd_connection *info) -{ - struct smbd_response *ret = NULL; - - if (!list_empty(&info->reassembly_queue)) { - ret = list_first_entry( - &info->reassembly_queue, - struct smbd_response, list); - } - return ret; -} - -static struct smbd_response *get_empty_queue_buffer( - struct smbd_connection *info) -{ - struct smbd_response *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&info->empty_packet_queue_lock, flags); - if (!list_empty(&info->empty_packet_queue)) { - ret = list_first_entry( - &info->empty_packet_queue, - struct smbd_response, list); - list_del(&ret->list); - info->count_empty_packet_queue--; - } - spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags); - - return ret; -} - -/* - * Get a receive buffer - * For each remote send, we need to post a receive. The receive buffers are - * pre-allocated in advance. - * return value: the receive buffer, NULL if none is available - */ -static struct smbd_response *get_receive_buffer(struct smbd_connection *info) -{ - struct smbd_response *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&info->receive_queue_lock, flags); - if (!list_empty(&info->receive_queue)) { - ret = list_first_entry( - &info->receive_queue, - struct smbd_response, list); - list_del(&ret->list); - info->count_receive_queue--; - info->count_get_receive_buffer++; - } - spin_unlock_irqrestore(&info->receive_queue_lock, flags); - - return ret; -} - -/* - * Return a receive buffer - * Upon returning of a receive buffer, we can post new receive and extend - * more receive credits to remote peer. This is done immediately after a - * receive buffer is returned. 
- */ -static void put_receive_buffer( - struct smbd_connection *info, struct smbd_response *response) -{ - unsigned long flags; - - ib_dma_unmap_single(info->id->device, response->sge.addr, - response->sge.length, DMA_FROM_DEVICE); - - spin_lock_irqsave(&info->receive_queue_lock, flags); - list_add_tail(&response->list, &info->receive_queue); - info->count_receive_queue++; - info->count_put_receive_buffer++; - spin_unlock_irqrestore(&info->receive_queue_lock, flags); - - queue_work(info->workqueue, &info->post_send_credits_work); -} - -/* Preallocate all receive buffer on transport establishment */ -static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) -{ - int i; - struct smbd_response *response; - - INIT_LIST_HEAD(&info->reassembly_queue); - spin_lock_init(&info->reassembly_queue_lock); - info->reassembly_data_length = 0; - info->reassembly_queue_length = 0; - - INIT_LIST_HEAD(&info->receive_queue); - spin_lock_init(&info->receive_queue_lock); - info->count_receive_queue = 0; - - INIT_LIST_HEAD(&info->empty_packet_queue); - spin_lock_init(&info->empty_packet_queue_lock); - info->count_empty_packet_queue = 0; - - init_waitqueue_head(&info->wait_receive_queues); - - for (i = 0; i < num_buf; i++) { - response = mempool_alloc(info->response_mempool, GFP_KERNEL); - if (!response) - goto allocate_failed; - - response->info = info; - list_add_tail(&response->list, &info->receive_queue); - info->count_receive_queue++; - } - - return 0; - -allocate_failed: - while (!list_empty(&info->receive_queue)) { - response = list_first_entry( - &info->receive_queue, - struct smbd_response, list); - list_del(&response->list); - info->count_receive_queue--; - - mempool_free(response, info->response_mempool); - } - return -ENOMEM; -} - -static void destroy_receive_buffers(struct smbd_connection *info) -{ - struct smbd_response *response; - - while ((response = get_receive_buffer(info))) - mempool_free(response, info->response_mempool); - - while ((response = get_empty_queue_buffer(info))) - mempool_free(response, info->response_mempool); -} - -/* - * Check and send an immediate or keep alive packet - * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1 - * Connection.KeepaliveRequested and Connection.SendImmediate - * The idea is to extend credits to server as soon as it becomes available - */ -static void send_immediate_work(struct work_struct *work) -{ - struct smbd_connection *info = container_of( - work, struct smbd_connection, - send_immediate_work.work); - - if (info->keep_alive_requested == KEEP_ALIVE_PENDING || - info->send_immediate) { - log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(info); - } -} - -/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ -static void idle_connection_timer(struct work_struct *work) -{ - struct smbd_connection *info = container_of( - work, struct smbd_connection, - idle_timer_work.work); - - if (info->keep_alive_requested != KEEP_ALIVE_NONE) { - log_keep_alive(ERR, - "error status info->keep_alive_requested=%d\n", - info->keep_alive_requested); - smbd_disconnect_rdma_connection(info); - return; - } - - log_keep_alive(INFO, "about to send an empty idle message\n"); - smbd_post_send_empty(info); - - /* Setup the next idle timeout work */ - queue_delayed_work(info->workqueue, &info->idle_timer_work, - info->keep_alive_interval*HZ); -} - -/* Destroy this SMBD connection, called from upper layer */ -void smbd_destroy(struct smbd_connection *info) -{ - log_rdma_event(INFO, "destroying rdma session\n"); - - 
/* Kick off the disconnection process */ - smbd_disconnect_rdma_connection(info); - - log_rdma_event(INFO, "wait for transport being destroyed\n"); - wait_event(info->wait_destroy, - info->transport_status == SMBD_DESTROYED); - - destroy_workqueue(info->workqueue); - kfree(info); -} - -/* - * Reconnect this SMBD connection, called from upper layer - * return value: 0 on success, or actual error code - */ -int smbd_reconnect(struct TCP_Server_Info *server) -{ - log_rdma_event(INFO, "reconnecting rdma session\n"); - - if (!server->smbd_conn) { - log_rdma_event(INFO, "rdma session already destroyed\n"); - goto create_conn; - } - - /* - * This is possible if transport is disconnected and we haven't received - * notification from RDMA, but upper layer has detected timeout - */ - if (server->smbd_conn->transport_status == SMBD_CONNECTED) { - log_rdma_event(INFO, "disconnecting transport\n"); - smbd_disconnect_rdma_connection(server->smbd_conn); - } - - /* wait until the transport is destroyed */ - if (!wait_event_timeout(server->smbd_conn->wait_destroy, - server->smbd_conn->transport_status == SMBD_DESTROYED, 5*HZ)) - return -EAGAIN; - - destroy_workqueue(server->smbd_conn->workqueue); - kfree(server->smbd_conn); - -create_conn: - log_rdma_event(INFO, "creating rdma session\n"); - server->smbd_conn = smbd_get_connection( - server, (struct sockaddr *) &server->dstaddr); - log_rdma_event(INFO, "created rdma session info=%p\n", - server->smbd_conn); - - return server->smbd_conn ? 0 : -ENOENT; -} - -static void destroy_caches_and_workqueue(struct smbd_connection *info) -{ - destroy_receive_buffers(info); - destroy_workqueue(info->workqueue); - mempool_destroy(info->response_mempool); - kmem_cache_destroy(info->response_cache); - mempool_destroy(info->request_mempool); - kmem_cache_destroy(info->request_cache); -} - -#define MAX_NAME_LEN 80 -static int allocate_caches_and_workqueue(struct smbd_connection *info) -{ - char name[MAX_NAME_LEN]; - int rc; - - snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); - info->request_cache = - kmem_cache_create( - name, - sizeof(struct smbd_request) + - sizeof(struct smbd_data_transfer), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!info->request_cache) - return -ENOMEM; - - info->request_mempool = - mempool_create(info->send_credit_target, mempool_alloc_slab, - mempool_free_slab, info->request_cache); - if (!info->request_mempool) - goto out1; - - snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); - info->response_cache = - kmem_cache_create( - name, - sizeof(struct smbd_response) + - info->max_receive_size, - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!info->response_cache) - goto out2; - - info->response_mempool = - mempool_create(info->receive_credit_max, mempool_alloc_slab, - mempool_free_slab, info->response_cache); - if (!info->response_mempool) - goto out3; - - snprintf(name, MAX_NAME_LEN, "smbd_%p", info); - info->workqueue = create_workqueue(name); - if (!info->workqueue) - goto out4; - - rc = allocate_receive_buffers(info, info->receive_credit_max); - if (rc) { - log_rdma_event(ERR, "failed to allocate receive buffers\n"); - goto out5; - } - - return 0; - -out5: - destroy_workqueue(info->workqueue); -out4: - mempool_destroy(info->response_mempool); -out3: - kmem_cache_destroy(info->response_cache); -out2: - mempool_destroy(info->request_mempool); -out1: - kmem_cache_destroy(info->request_cache); - return -ENOMEM; -} - -/* Create a SMBD connection, called by upper layer */ -static struct smbd_connection *_smbd_get_connection( - struct TCP_Server_Info *server, 
struct sockaddr *dstaddr, int port) -{ - int rc; - struct smbd_connection *info; - struct rdma_conn_param conn_param; - struct ib_qp_init_attr qp_attr; - struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; - struct ib_port_immutable port_immutable; - u32 ird_ord_hdr[2]; - - info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); - if (!info) - return NULL; - - info->transport_status = SMBD_CONNECTING; - rc = smbd_ia_open(info, dstaddr, port); - if (rc) { - log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); - goto create_id_failed; - } - - if (smbd_send_credit_target > info->id->device->attrs.max_cqe || - smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, - "consider lowering send_credit_target = %d. " - "Possible CQE overrun, device " - "reporting max_cpe %d max_qp_wr %d\n", - smbd_send_credit_target, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); - goto config_failed; - } - - if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || - smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { - log_rdma_event(ERR, - "consider lowering receive_credit_max = %d. " - "Possible CQE overrun, device " - "reporting max_cpe %d max_qp_wr %d\n", - smbd_receive_credit_max, - info->id->device->attrs.max_cqe, - info->id->device->attrs.max_qp_wr); - goto config_failed; - } - - info->receive_credit_max = smbd_receive_credit_max; - info->send_credit_target = smbd_send_credit_target; - info->max_send_size = smbd_max_send_size; - info->max_fragmented_recv_size = smbd_max_fragmented_recv_size; - info->max_receive_size = smbd_max_receive_size; - info->keep_alive_interval = smbd_keep_alive_interval; - - if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) { - log_rdma_event(ERR, - "warning: device max_send_sge = %d too small\n", - info->id->device->attrs.max_send_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); - } - if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) { - log_rdma_event(ERR, - "warning: device max_recv_sge = %d too small\n", - info->id->device->attrs.max_recv_sge); - log_rdma_event(ERR, "Queue Pair creation may fail\n"); - } - - info->send_cq = NULL; - info->recv_cq = NULL; - info->send_cq = ib_alloc_cq(info->id->device, info, - info->send_credit_target, 0, IB_POLL_SOFTIRQ); - if (IS_ERR(info->send_cq)) { - info->send_cq = NULL; - goto alloc_cq_failed; - } - - info->recv_cq = ib_alloc_cq(info->id->device, info, - info->receive_credit_max, 0, IB_POLL_SOFTIRQ); - if (IS_ERR(info->recv_cq)) { - info->recv_cq = NULL; - goto alloc_cq_failed; - } - - memset(&qp_attr, 0, sizeof(qp_attr)); - qp_attr.event_handler = smbd_qp_async_error_upcall; - qp_attr.qp_context = info; - qp_attr.cap.max_send_wr = info->send_credit_target; - qp_attr.cap.max_recv_wr = info->receive_credit_max; - qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE; - qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE; - qp_attr.cap.max_inline_data = 0; - qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; - qp_attr.qp_type = IB_QPT_RC; - qp_attr.send_cq = info->send_cq; - qp_attr.recv_cq = info->recv_cq; - qp_attr.port_num = ~0; - - rc = rdma_create_qp(info->id, info->pd, &qp_attr); - if (rc) { - log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc); - goto create_qp_failed; - } - - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = 0; - - conn_param.responder_resources = - info->id->device->attrs.max_qp_rd_atom - < SMBD_CM_RESPONDER_RESOURCES ? 
- info->id->device->attrs.max_qp_rd_atom : - SMBD_CM_RESPONDER_RESOURCES; - info->responder_resources = conn_param.responder_resources; - log_rdma_mr(INFO, "responder_resources=%d\n", - info->responder_resources); - - /* Need to send IRD/ORD in private data for iWARP */ - info->id->device->ops.get_port_immutable( - info->id->device, info->id->port_num, &port_immutable); - if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = info->responder_resources; - ird_ord_hdr[1] = 1; - conn_param.private_data = ird_ord_hdr; - conn_param.private_data_len = sizeof(ird_ord_hdr); - } else { - conn_param.private_data = NULL; - conn_param.private_data_len = 0; - } - - conn_param.retry_count = SMBD_CM_RETRY; - conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY; - conn_param.flow_control = 0; - init_waitqueue_head(&info->wait_destroy); - - log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", - &addr_in->sin_addr, port); - - init_waitqueue_head(&info->conn_wait); - rc = rdma_connect(info->id, &conn_param); - if (rc) { - log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); - goto rdma_connect_failed; - } - - wait_event_interruptible( - info->conn_wait, info->transport_status != SMBD_CONNECTING); - - if (info->transport_status != SMBD_CONNECTED) { - log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); - goto rdma_connect_failed; - } - - log_rdma_event(INFO, "rdma_connect connected\n"); - - rc = allocate_caches_and_workqueue(info); - if (rc) { - log_rdma_event(ERR, "cache allocation failed\n"); - goto allocate_cache_failed; - } - - init_waitqueue_head(&info->wait_send_queue); - init_waitqueue_head(&info->wait_reassembly_queue); - - INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); - INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work); - queue_delayed_work(info->workqueue, &info->idle_timer_work, - info->keep_alive_interval*HZ); - - init_waitqueue_head(&info->wait_smbd_send_pending); - info->smbd_send_pending = 0; - - init_waitqueue_head(&info->wait_smbd_recv_pending); - info->smbd_recv_pending = 0; - - init_waitqueue_head(&info->wait_send_pending); - atomic_set(&info->send_pending, 0); - - init_waitqueue_head(&info->wait_send_payload_pending); - atomic_set(&info->send_payload_pending, 0); - - INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); - INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work); - INIT_WORK(&info->recv_done_work, smbd_recv_done_work); - INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); - info->new_credits_offered = 0; - spin_lock_init(&info->lock_new_credits_offered); - - rc = smbd_negotiate(info); - if (rc) { - log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc); - goto negotiation_failed; - } - - rc = allocate_mr_list(info); - if (rc) { - log_rdma_mr(ERR, "memory registration allocation failed\n"); - goto allocate_mr_failed; - } - - return info; - -allocate_mr_failed: - /* At this point, need to a full transport shutdown */ - smbd_destroy(info); - return NULL; - -negotiation_failed: - cancel_delayed_work_sync(&info->idle_timer_work); - destroy_caches_and_workqueue(info); - info->transport_status = SMBD_NEGOTIATE_FAILED; - init_waitqueue_head(&info->conn_wait); - rdma_disconnect(info->id); - wait_event(info->conn_wait, - info->transport_status == SMBD_DISCONNECTED); - -allocate_cache_failed: -rdma_connect_failed: - rdma_destroy_qp(info->id); - -create_qp_failed: -alloc_cq_failed: - if (info->send_cq) - ib_free_cq(info->send_cq); - if (info->recv_cq) - ib_free_cq(info->recv_cq); - -config_failed: - 
ib_dealloc_pd(info->pd); - rdma_destroy_id(info->id); - -create_id_failed: - kfree(info); - return NULL; -} - -struct smbd_connection *smbd_get_connection( - struct TCP_Server_Info *server, struct sockaddr *dstaddr) -{ - struct smbd_connection *ret; - int port = SMBD_PORT; - -try_again: - ret = _smbd_get_connection(server, dstaddr, port); - - /* Try SMB_PORT if SMBD_PORT doesn't work */ - if (!ret && port == SMBD_PORT) { - port = SMB_PORT; - goto try_again; - } - return ret; -} - -/* - * Receive data from receive reassembly queue - * All the incoming data packets are placed in reassembly queue - * buf: the buffer to read data into - * size: the length of data to read - * return value: actual data read - * Note: this implementation copies the data from reassebmly queue to receive - * buffers used by upper layer. This is not the optimal code path. A better way - * to do it is to not have upper layer allocate its receive buffers but rather - * borrow the buffer from reassembly queue, and return it after data is - * consumed. But this will require more changes to upper layer code, and also - * need to consider packet boundaries while they still being reassembled. - */ -static int smbd_recv_buf(struct smbd_connection *info, char *buf, - unsigned int size) -{ - struct smbd_response *response; - struct smbd_data_transfer *data_transfer; - int to_copy, to_read, data_read, offset; - u32 data_length, remaining_data_length, data_offset; - int rc; - -again: - if (info->transport_status != SMBD_CONNECTED) { - log_read(ERR, "disconnected\n"); - return -ENODEV; - } - - /* - * No need to hold the reassembly queue lock all the time as we are - * the only one reading from the front of the queue. The transport - * may add more entries to the back of the queue at the same time - */ - log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size, - info->reassembly_data_length); - if (info->reassembly_data_length >= size) { - int queue_length; - int queue_removed = 0; - - /* - * Need to make sure reassembly_data_length is read before - * reading reassembly_queue_length and calling - * _get_first_reassembly. This call is lock free - * as we never read at the end of the queue which are being - * updated in SOFTIRQ as more data is received - */ - virt_rmb(); - queue_length = info->reassembly_queue_length; - data_read = 0; - to_read = size; - offset = info->first_entry_offset; - while (data_read < size) { - response = _get_first_reassembly(info); - data_transfer = smbd_response_payload(response); - data_length = le32_to_cpu(data_transfer->data_length); - remaining_data_length = - le32_to_cpu( - data_transfer->remaining_data_length); - data_offset = le32_to_cpu(data_transfer->data_offset); - - /* - * The upper layer expects RFC1002 length at the - * beginning of the payload. Return it to indicate - * the total length of the packet. This minimize the - * change to upper layer packet processing logic. This - * will be eventually remove when an intermediate - * transport layer is added - */ - if (response->first_segment && size == 4) { - unsigned int rfc1002_len = - data_length + remaining_data_length; - *((__be32 *)buf) = cpu_to_be32(rfc1002_len); - data_read = 4; - response->first_segment = false; - log_read(INFO, "returning rfc1002 length %d\n", - rfc1002_len); - goto read_rfc1002_done; - } - - to_copy = min_t(int, data_length - offset, to_read); - memcpy( - buf + data_read, - (char *)data_transfer + data_offset + offset, - to_copy); - - /* move on to the next buffer? 
*/ - if (to_copy == data_length - offset) { - queue_length--; - /* - * No need to lock if we are not at the - * end of the queue - */ - if (queue_length) - list_del(&response->list); - else { - spin_lock_irq( - &info->reassembly_queue_lock); - list_del(&response->list); - spin_unlock_irq( - &info->reassembly_queue_lock); - } - queue_removed++; - info->count_reassembly_queue--; - info->count_dequeue_reassembly_queue++; - put_receive_buffer(info, response); - offset = 0; - log_read(INFO, "put_receive_buffer offset=0\n"); - } else - offset += to_copy; - - to_read -= to_copy; - data_read += to_copy; - - log_read(INFO, "_get_first_reassembly memcpy %d bytes " - "data_transfer_length-offset=%d after that " - "to_read=%d data_read=%d offset=%d\n", - to_copy, data_length - offset, - to_read, data_read, offset); - } - - spin_lock_irq(&info->reassembly_queue_lock); - info->reassembly_data_length -= data_read; - info->reassembly_queue_length -= queue_removed; - spin_unlock_irq(&info->reassembly_queue_lock); - - info->first_entry_offset = offset; - log_read(INFO, "returning to thread data_read=%d " - "reassembly_data_length=%d first_entry_offset=%d\n", - data_read, info->reassembly_data_length, - info->first_entry_offset); -read_rfc1002_done: - return data_read; - } - - log_read(INFO, "wait_event on more data\n"); - rc = wait_event_interruptible( - info->wait_reassembly_queue, - info->reassembly_data_length >= size || - info->transport_status != SMBD_CONNECTED); - /* Don't return any data if interrupted */ - if (rc) - return -ENODEV; - - goto again; -} - -/* - * Receive a page from receive reassembly queue - * page: the page to read data into - * to_read: the length of data to read - * return value: actual data read - */ -static int smbd_recv_page(struct smbd_connection *info, - struct page *page, unsigned int page_offset, - unsigned int to_read) -{ - int ret; - char *to_address; - void *page_address; - - /* make sure we have the page ready for read */ - ret = wait_event_interruptible( - info->wait_reassembly_queue, - info->reassembly_data_length >= to_read || - info->transport_status != SMBD_CONNECTED); - if (ret) - return ret; - - /* now we can read from reassembly queue and not sleep */ - page_address = kmap_atomic(page); - to_address = (char *) page_address + page_offset; - - log_read(INFO, "reading from page=%p address=%p to_read=%d\n", - page, to_address, to_read); - - ret = smbd_recv_buf(info, to_address, to_read); - kunmap_atomic(page_address); - - return ret; -} - -/* - * Receive data from transport - * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC - * return: total bytes read, or 0. SMB Direct will not do partial read. 
- */ -int smbd_recv(struct smbd_connection *info, struct msghdr *msg) -{ - char *buf; - struct page *page; - unsigned int to_read, page_offset; - int rc; - - info->smbd_recv_pending++; - - if (iov_iter_rw(&msg->msg_iter) == WRITE) { - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "CIFS: invalid msg iter dir %u\n", - iov_iter_rw(&msg->msg_iter)); - rc = -EINVAL; - goto out; - } - - switch (iov_iter_type(&msg->msg_iter)) { - case ITER_KVEC: - buf = msg->msg_iter.kvec->iov_base; - to_read = msg->msg_iter.kvec->iov_len; - rc = smbd_recv_buf(info, buf, to_read); - break; - - case ITER_BVEC: - page = msg->msg_iter.bvec->bv_page; - page_offset = msg->msg_iter.bvec->bv_offset; - to_read = msg->msg_iter.bvec->bv_len; - rc = smbd_recv_page(info, page, page_offset, to_read); - break; - - default: - /* It's a bug in upper layer to get there */ - cifs_dbg(VFS, "CIFS: invalid msg type %d\n", - iov_iter_type(&msg->msg_iter)); - rc = -EINVAL; - } - -out: - info->smbd_recv_pending--; - wake_up(&info->wait_smbd_recv_pending); - - /* SMBDirect will read it all or nothing */ - if (rc > 0) - msg->msg_iter.count = 0; - return rc; -} - -/* - * Send data to transport - * Each rqst is transported as a SMBDirect payload - * rqst: the data to write - * return value: 0 if successfully write, otherwise error code - */ -int smbd_send(struct TCP_Server_Info *server, struct smb_rqst *rqst) -{ - struct smbd_connection *info = server->smbd_conn; - struct kvec vec; - int nvecs; - int size; - unsigned int buflen, remaining_data_length; - int start, i, j; - int max_iov_size = - info->max_send_size - sizeof(struct smbd_data_transfer); - struct kvec *iov; - int rc; - - info->smbd_send_pending++; - if (info->transport_status != SMBD_CONNECTED) { - rc = -ENODEV; - goto done; - } - - /* - * Skip the RFC1002 length defined in MS-SMB2 section 2.1 - * It is used only for TCP transport in the iov[0] - * In future we may want to add a transport layer under protocol - * layer so this will only be issued to TCP transport - */ - - if (rqst->rq_iov[0].iov_len != 4) { - log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len); - return -EINVAL; - } - - /* - * Add in the page array if there is one. The caller needs to set - * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and - * ends at page boundary - */ - buflen = smb_rqst_len(server, rqst); - - if (buflen + sizeof(struct smbd_data_transfer) > - info->max_fragmented_send_size) { - log_write(ERR, "payload size %d > max size %d\n", - buflen, info->max_fragmented_send_size); - rc = -EINVAL; - goto done; - } - - iov = &rqst->rq_iov[1]; - - cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen); - for (i = 0; i < rqst->rq_nvec-1; i++) - dump_smb(iov[i].iov_base, iov[i].iov_len); - - remaining_data_length = buflen; - - log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d " - "rq_tailsz=%d buflen=%d\n", - rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, buflen); - - start = i = iov[0].iov_len ? 
0 : 1; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen-iov[i].iov_len); - log_write(INFO, "sending iov[] from start=%d " - "i=%d nvecs=%d " - "remaining_data_length=%d\n", - start, i, i-start, - remaining_data_length); - rc = smbd_post_send_data( - info, &iov[start], i-start, - remaining_data_length); - if (rc) - goto done; - } else { - /* iov[start] is too big, break it */ - nvecs = (buflen+max_iov_size-1)/max_iov_size; - log_write(INFO, "iov[%d] iov_base=%p buflen=%d" - " break to %d vectors\n", - start, iov[start].iov_base, - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j*max_iov_size; - vec.iov_len = max_iov_size; - if (j == nvecs-1) - vec.iov_len = - buflen - - max_iov_size*(nvecs-1); - remaining_data_length -= vec.iov_len; - log_write(INFO, - "sending vec j=%d iov_base=%p" - " iov_len=%zu " - "remaining_data_length=%d\n", - j, vec.iov_base, vec.iov_len, - remaining_data_length); - rc = smbd_post_send_data( - info, &vec, 1, - remaining_data_length); - if (rc) - goto done; - } - i++; - if (i == rqst->rq_nvec-1) - break; - } - start = i; - buflen = 0; - } else { - i++; - if (i == rqst->rq_nvec-1) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - log_write(INFO, - "sending iov[] from start=%d i=%d " - "nvecs=%d remaining_data_length=%d\n", - start, i, i-start, - remaining_data_length); - rc = smbd_post_send_data(info, &iov[start], - i-start, remaining_data_length); - if (rc) - goto done; - break; - } - } - log_write(INFO, "looping i=%d buflen=%d\n", i, buflen); - } - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - unsigned int offset; - - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = max_iov_size; - if (j == nvecs-1) - size = buflen - j*max_iov_size; - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d" - " remaining_data_length=%d\n", - i, j*max_iov_size+offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - if (rc) - goto done; - } - } - -done: - /* - * As an optimization, we don't wait for individual I/O to finish - * before sending the next one. - * Send them all and wait for pending send count to get to 0 - * that means all the I/Os have been out and we are good to return - */ - - wait_event(info->wait_send_payload_pending, - atomic_read(&info->send_payload_pending) == 0); - - info->smbd_send_pending--; - wake_up(&info->wait_smbd_send_pending); - - return rc; -} - -static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbd_mr *mr; - struct ib_cqe *cqe; - - if (wc->status) { - log_rdma_mr(ERR, "status=%d\n", wc->status); - cqe = wc->wr_cqe; - mr = container_of(cqe, struct smbd_mr, cqe); - smbd_disconnect_rdma_connection(mr->conn); - } -} - -/* - * The work queue function that recovers MRs - * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used - * again. Both calls are slow, so finish them in a workqueue. This will not - * block I/O path. - * There is one workqueue that recovers MRs, there is no need to lock as the - * I/O requests calling smbd_register_mr will never update the links in the - * mr_list. 
- */ -static void smbd_mr_recovery_work(struct work_struct *work) -{ - struct smbd_connection *info = - container_of(work, struct smbd_connection, mr_recovery_work); - struct smbd_mr *smbdirect_mr; - int rc; - - list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_INVALIDATED) - ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, - smbdirect_mr->dir); - else if (smbdirect_mr->state == MR_ERROR) { - - /* recover this MR entry */ - rc = ib_dereg_mr(smbdirect_mr->mr); - if (rc) { - log_rdma_mr(ERR, - "ib_dereg_mr failed rc=%x\n", - rc); - smbd_disconnect_rdma_connection(info); - continue; - } - - smbdirect_mr->mr = ib_alloc_mr( - info->pd, info->mr_type, - info->max_frmr_depth); - if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, - "ib_alloc_mr failed mr_type=%x " - "max_frmr_depth=%x\n", - info->mr_type, - info->max_frmr_depth); - smbd_disconnect_rdma_connection(info); - continue; - } - } else - /* This MR is being used, don't recover it */ - continue; - - smbdirect_mr->state = MR_READY; - - /* smbdirect_mr->state is updated by this function - * and is read and updated by I/O issuing CPUs trying - * to get a MR, the call to atomic_inc_return - * implicates a memory barrier and guarantees this - * value is updated before waking up any calls to - * get_mr() from the I/O issuing CPUs - */ - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); - } -} - -static void destroy_mr_list(struct smbd_connection *info) -{ - struct smbd_mr *mr, *tmp; - - cancel_work_sync(&info->mr_recovery_work); - list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { - if (mr->state == MR_INVALIDATED) - ib_dma_unmap_sg(info->id->device, mr->sgl, - mr->sgl_count, mr->dir); - ib_dereg_mr(mr->mr); - kfree(mr->sgl); - kfree(mr); - } -} - -/* - * Allocate MRs used for RDMA read/write - * The number of MRs will not exceed hardware capability in responder_resources - * All MRs are kept in mr_list. The MR can be recovered after it's used - * Recovery is done in smbd_mr_recovery_work. 
The content of list entry changes - * as MRs are used and recovered for I/O, but the list links will not change - */ -static int allocate_mr_list(struct smbd_connection *info) -{ - int i; - struct smbd_mr *smbdirect_mr, *tmp; - - INIT_LIST_HEAD(&info->mr_list); - init_waitqueue_head(&info->wait_mr); - spin_lock_init(&info->mr_list_lock); - atomic_set(&info->mr_ready_count, 0); - atomic_set(&info->mr_used_count, 0); - init_waitqueue_head(&info->wait_for_mr_cleanup); - /* Allocate more MRs (2x) than hardware responder_resources */ - for (i = 0; i < info->responder_resources * 2; i++) { - smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); - if (!smbdirect_mr) - goto out; - smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, - info->max_frmr_depth); - if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x " - "max_frmr_depth=%x\n", - info->mr_type, info->max_frmr_depth); - goto out; - } - smbdirect_mr->sgl = kcalloc( - info->max_frmr_depth, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!smbdirect_mr->sgl) { - log_rdma_mr(ERR, "failed to allocate sgl\n"); - ib_dereg_mr(smbdirect_mr->mr); - goto out; - } - smbdirect_mr->state = MR_READY; - smbdirect_mr->conn = info; - - list_add_tail(&smbdirect_mr->list, &info->mr_list); - atomic_inc(&info->mr_ready_count); - } - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); - return 0; - -out: - kfree(smbdirect_mr); - - list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { - ib_dereg_mr(smbdirect_mr->mr); - kfree(smbdirect_mr->sgl); - kfree(smbdirect_mr); - } - return -ENOMEM; -} - -/* - * Get a MR from mr_list. This function waits until there is at least one - * MR available in the list. It may access the list while the - * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock - * as they never modify the same places. However, there may be several CPUs - * issueing I/O trying to get MR at the same time, mr_list_lock is used to - * protect this situation. - */ -static struct smbd_mr *get_mr(struct smbd_connection *info) -{ - struct smbd_mr *ret; - int rc; -again: - rc = wait_event_interruptible(info->wait_mr, - atomic_read(&info->mr_ready_count) || - info->transport_status != SMBD_CONNECTED); - if (rc) { - log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); - return NULL; - } - - if (info->transport_status != SMBD_CONNECTED) { - log_rdma_mr(ERR, "info->transport_status=%x\n", - info->transport_status); - return NULL; - } - - spin_lock(&info->mr_list_lock); - list_for_each_entry(ret, &info->mr_list, list) { - if (ret->state == MR_READY) { - ret->state = MR_REGISTERED; - spin_unlock(&info->mr_list_lock); - atomic_dec(&info->mr_ready_count); - atomic_inc(&info->mr_used_count); - return ret; - } - } - - spin_unlock(&info->mr_list_lock); - /* - * It is possible that we could fail to get MR because other processes may - * try to acquire a MR at the same time. If this is the case, retry it. - */ - goto again; -} - -/* - * Register memory for RDMA read/write - * pages[]: the list of pages to register memory with - * num_pages: the number of pages to register - * tailsz: if non-zero, the bytes to register in the last page - * writing: true if this is a RDMA write (SMB read), false for RDMA read - * need_invalidate: true if this MR needs to be locally invalidated after I/O - * return value: the MR registered, NULL if failed. 
- */ -struct smbd_mr *smbd_register_mr( - struct smbd_connection *info, struct page *pages[], int num_pages, - int offset, int tailsz, bool writing, bool need_invalidate) -{ - struct smbd_mr *smbdirect_mr; - int rc, i; - enum dma_data_direction dir; - struct ib_reg_wr *reg_wr; - - if (num_pages > info->max_frmr_depth) { - log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", - num_pages, info->max_frmr_depth); - return NULL; - } - - smbdirect_mr = get_mr(info); - if (!smbdirect_mr) { - log_rdma_mr(ERR, "get_mr returning NULL\n"); - return NULL; - } - smbdirect_mr->need_invalidate = need_invalidate; - smbdirect_mr->sgl_count = num_pages; - sg_init_table(smbdirect_mr->sgl, num_pages); - - log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n", - num_pages, offset, tailsz); - - if (num_pages == 1) { - sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset); - goto skip_multiple_pages; - } - - /* We have at least two pages to register */ - sg_set_page( - &smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset); - i = 1; - while (i < num_pages - 1) { - sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); - i++; - } - sg_set_page(&smbdirect_mr->sgl[i], pages[i], - tailsz ? tailsz : PAGE_SIZE, 0); - -skip_multiple_pages: - dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - smbdirect_mr->dir = dir; - rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir); - if (!rc) { - log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", - num_pages, dir, rc); - goto dma_map_error; - } - - rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages, - NULL, PAGE_SIZE); - if (rc != num_pages) { - log_rdma_mr(ERR, - "ib_map_mr_sg failed rc = %d num_pages = %x\n", - rc, num_pages); - goto map_mr_error; - } - - ib_update_fast_reg_key(smbdirect_mr->mr, - ib_inc_rkey(smbdirect_mr->mr->rkey)); - reg_wr = &smbdirect_mr->wr; - reg_wr->wr.opcode = IB_WR_REG_MR; - smbdirect_mr->cqe.done = register_mr_done; - reg_wr->wr.wr_cqe = &smbdirect_mr->cqe; - reg_wr->wr.num_sge = 0; - reg_wr->wr.send_flags = IB_SEND_SIGNALED; - reg_wr->mr = smbdirect_mr->mr; - reg_wr->key = smbdirect_mr->mr->rkey; - reg_wr->access = writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; - - /* - * There is no need to wait for completion of ib_post_send - * on IB_WR_REG_MR.
Hardware enforces a barrier and order of execution - * on the next ib_post_send when we actually send I/O to the remote peer - */ - rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL); - if (!rc) - return smbdirect_mr; - - log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", - rc, reg_wr->key); - - /* If all failed, attempt to recover this MR by setting it to MR_ERROR */ -map_mr_error: - ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, smbdirect_mr->dir); - -dma_map_error: - smbdirect_mr->state = MR_ERROR; - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); - - smbd_disconnect_rdma_connection(info); - - return NULL; -} - -static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbd_mr *smbdirect_mr; - struct ib_cqe *cqe; - - cqe = wc->wr_cqe; - smbdirect_mr = container_of(cqe, struct smbd_mr, cqe); - smbdirect_mr->state = MR_INVALIDATED; - if (wc->status != IB_WC_SUCCESS) { - log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status); - smbdirect_mr->state = MR_ERROR; - } - complete(&smbdirect_mr->invalidate_done); -} - -/* - * Deregister an MR after I/O is done - * This function may wait if remote invalidation is not used - * and we have to locally invalidate the buffer to prevent data from being - * modified by the remote peer after the upper layer consumes it - */ -int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) -{ - struct ib_send_wr *wr; - struct smbd_connection *info = smbdirect_mr->conn; - int rc = 0; - - if (smbdirect_mr->need_invalidate) { - /* Need to finish local invalidation before returning */ - wr = &smbdirect_mr->inv_wr; - wr->opcode = IB_WR_LOCAL_INV; - smbdirect_mr->cqe.done = local_inv_done; - wr->wr_cqe = &smbdirect_mr->cqe; - wr->num_sge = 0; - wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey; - wr->send_flags = IB_SEND_SIGNALED; - - init_completion(&smbdirect_mr->invalidate_done); - rc = ib_post_send(info->id->qp, wr, NULL); - if (rc) { - log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); - smbd_disconnect_rdma_connection(info); - goto done; - } - wait_for_completion(&smbdirect_mr->invalidate_done); - smbdirect_mr->need_invalidate = false; - } else - /* - * For remote invalidation, just set it to MR_INVALIDATED - * and defer to mr_recovery_work to recover the MR for next use - */ - smbdirect_mr->state = MR_INVALIDATED; - - /* - * Schedule the work to do MR recovery for future I/Os - * MR recovery is slow and we don't want it to block the current I/O - */ - queue_work(info->workqueue, &info->mr_recovery_work); - -done: - if (atomic_dec_and_test(&info->mr_used_count)) - wake_up(&info->wait_for_mr_cleanup); - - return rc; -}
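Editor's note on the scatterlist layout built by smbd_register_mr() above: only the first and the last sg entries can be partial pages; the first entry starts at offset and runs to the end of its page, and the last entry holds tailsz bytes (or a whole page when tailsz is 0). The standalone C sketch below only prints that layout; it is illustrative and not part of smbdirect.c, and show_sgl_layout() and EXAMPLE_PAGE_SIZE are made-up names (a 4096-byte page is assumed).

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096

static void show_sgl_layout(int num_pages, int offset, int tailsz)
{
	int i;

	if (num_pages == 1) {
		/* single page: tailsz bytes starting at offset */
		printf("sgl[0]: offset=%d length=%d\n", offset, tailsz);
		return;
	}

	/* first page: from offset to the end of that page */
	printf("sgl[0]: offset=%d length=%d\n", offset,
	       EXAMPLE_PAGE_SIZE - offset);

	/* middle pages: whole pages */
	for (i = 1; i < num_pages - 1; i++)
		printf("sgl[%d]: offset=0 length=%d\n", i, EXAMPLE_PAGE_SIZE);

	/* last page: tailsz bytes, or a whole page if tailsz is 0 */
	printf("sgl[%d]: offset=0 length=%d\n", i,
	       tailsz ? tailsz : EXAMPLE_PAGE_SIZE);
}

int main(void)
{
	/* e.g. a 3-page buffer starting 512 bytes into the first page,
	 * with 100 valid bytes in the last page */
	show_sgl_layout(3, 512, 100);
	return 0;
}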

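A second illustrative sketch, for smbd_send(): when a single iov entry or page is larger than one SMBD packet payload, it is sent as several packets, each at most max_send_size minus the size of the smbd_data_transfer header, and only the last packet can be short. The standalone program below reproduces only that splitting arithmetic; split_buf() and the 1340-byte EXAMPLE_MAX_IOV_SIZE are made-up stand-ins for the real values.

#include <stdio.h>

#define EXAMPLE_MAX_IOV_SIZE 1340

static void split_buf(unsigned int buflen)
{
	/* number of packets needed, rounding up */
	unsigned int nvecs =
		(buflen + EXAMPLE_MAX_IOV_SIZE - 1) / EXAMPLE_MAX_IOV_SIZE;
	unsigned int j, len;

	for (j = 0; j < nvecs; j++) {
		/* every packet is full sized except possibly the last one */
		if (j == nvecs - 1)
			len = buflen - EXAMPLE_MAX_IOV_SIZE * (nvecs - 1);
		else
			len = EXAMPLE_MAX_IOV_SIZE;
		printf("packet %u: %u bytes\n", j, len);
	}
}

int main(void)
{
	split_buf(4000);	/* prints 1340, 1340 and 1320 bytes */
	return 0;
}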