author    | Kip Macy <kmacy@FreeBSD.org> | 2008-05-05 18:35:55 +0000
committer | Kip Macy <kmacy@FreeBSD.org> | 2008-05-05 18:35:55 +0000
commit    | e68ff398875b17e19f8a55cd0f2d3d502c45d821 (patch)
tree      | 5b86b050e1ca019fffc7e10f98517cb5f92d4686
parent    | 143b946188bf3f298f12bee29e7adcbca3215f33 (diff)
download  | src-test2-e68ff398875b17e19f8a55cd0f2d3d502c45d821.tar.gz, src-test2-e68ff398875b17e19f8a55cd0f2d3d502c45d821.zip
32 files changed, 15395 insertions, 0 deletions
diff --git a/sys/contrib/rdma/core_priv.h b/sys/contrib/rdma/core_priv.h new file mode 100644 index 000000000000..5d6c9d805e34 --- /dev/null +++ b/sys/contrib/rdma/core_priv.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $ + * + * $FreeBSD$ + */ + +#ifndef _CORE_PRIV_H +#define _CORE_PRIV_H +#include <sys/mutex.h> +#include <sys/lock.h> + +#include <contrib/rdma/ib_verbs.h> + +#ifdef notyet +int ib_device_register_sysfs(struct ib_device *device); +void ib_device_unregister_sysfs(struct ib_device *device); + +int ib_sysfs_setup(void); +void ib_sysfs_cleanup(void); +#endif + +int ib_cache_setup(void); +void ib_cache_cleanup(void); + +#endif /* _CORE_PRIV_H */ diff --git a/sys/contrib/rdma/ib_addr.h b/sys/contrib/rdma/ib_addr.h new file mode 100644 index 000000000000..3df9949ad28d --- /dev/null +++ b/sys/contrib/rdma/ib_addr.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. 
+ * + * $FreeBSD$ + * + */ + +#if !defined(IB_ADDR_H) +#define IB_ADDR_H + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/condvar.h> + +#include <net/if.h> +#include <net/ethernet.h> + +#include <contrib/rdma/ib_verbs.h> + + +#define MAX_ADDR_LEN ETHER_ADDR_LEN /* XXX doesn't support IB! */ + +struct rdma_addr_client { + int refcount; + struct cv comp; + struct mtx lock; +}; + +/** + * rdma_addr_register_client - Register an address client. + */ +void rdma_addr_register_client(struct rdma_addr_client *client); + +/** + * rdma_addr_unregister_client - Deregister an address client. + * @client: Client object to deregister. + */ +void rdma_addr_unregister_client(struct rdma_addr_client *client); + +struct rdma_dev_addr { + unsigned char src_dev_addr[MAX_ADDR_LEN]; + unsigned char dst_dev_addr[MAX_ADDR_LEN]; + unsigned char broadcast[MAX_ADDR_LEN]; + enum rdma_node_type dev_type; +}; + +/** + * rdma_translate_ip - Translate a local IP address to an RDMA hardware + * address. + */ +int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr); + +/** + * rdma_resolve_ip - Resolve source and destination IP addresses to + * RDMA hardware addresses. + * @client: Address client associated with request. + * @src_addr: An optional source address to use in the resolution. If a + * source address is not provided, a usable address will be returned via + * the callback. + * @dst_addr: The destination address to resolve. + * @addr: A reference to a data location that will receive the resolved + * addresses. The data location must remain valid until the callback has + * been invoked. + * @timeout_ms: Amount of time to wait for the address resolution to complete. + * @callback: Call invoked once address resolution has completed, timed out, + * or been canceled. A status of 0 indicates success. + * @context: User-specified context associated with the call. + */ +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context); + +void rdma_addr_cancel(struct rdma_dev_addr *addr); + +int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, + const unsigned char *dst_dev_addr); + +static inline int ip_addr_size(struct sockaddr *addr) +{ + return addr->sa_family == AF_INET6 ? 
+ sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); +} + +static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) +{ + return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; +} + +static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) +{ + dev_addr->broadcast[8] = pkey >> 8; + dev_addr->broadcast[9] = (unsigned char) pkey; +} + +static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); +} + +static inline void ib_addr_get_sgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(gid, dev_addr->src_dev_addr + 4, sizeof *gid); +} + +static inline void ib_addr_set_sgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(dev_addr->src_dev_addr + 4, gid, sizeof *gid); +} + +static inline void ib_addr_get_dgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(gid, dev_addr->dst_dev_addr + 4, sizeof *gid); +} + +static inline void ib_addr_set_dgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(dev_addr->dst_dev_addr + 4, gid, sizeof *gid); +} + +static inline void iw_addr_get_sgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(gid, dev_addr->src_dev_addr, sizeof *gid); +} + +static inline void iw_addr_get_dgid(struct rdma_dev_addr *dev_addr, + union ib_gid *gid) +{ + memcpy(gid, dev_addr->dst_dev_addr, sizeof *gid); +} + +#endif /* IB_ADDR_H */ diff --git a/sys/contrib/rdma/ib_cache.h b/sys/contrib/rdma/ib_cache.h new file mode 100644 index 000000000000..419bea2f252b --- /dev/null +++ b/sys/contrib/rdma/ib_cache.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_cache.h 1349 2004-12-16 21:09:43Z roland $ + * + * $FreeBSD$ + */ + +#ifndef _IB_CACHE_H +#define _IB_CACHE_H + +#include <contrib/rdma/ib_verbs.h> + +/** + * ib_get_cached_gid - Returns a cached GID table entry + * @device: The device to query. + * @port_num: The port number of the device to query. 
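The rdma_resolve_ip() interface above is callback driven. A minimal consumer sketch, not part of this commit: the my_* names and the 2000 ms timeout are illustrative assumptions.

#include <contrib/rdma/ib_addr.h>

static struct rdma_addr_client my_client;
static struct rdma_dev_addr my_dev_addr;

/* Invoked once resolution completes, times out, or is canceled. */
static void
my_resolve_done(int status, struct sockaddr *src_addr,
    struct rdma_dev_addr *addr, void *context)
{
	/* status == 0: addr->dst_dev_addr now holds the hardware address. */
}

static int
my_resolve(struct sockaddr *src, struct sockaddr *dst)
{
	rdma_addr_register_client(&my_client);
	return (rdma_resolve_ip(&my_client, src, dst, &my_dev_addr,
	    2000, my_resolve_done, NULL));
}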
+ * @index: The index into the cached GID table to query. + * @gid: The GID value found at the specified index. + * + * ib_get_cached_gid() fetches the specified GID table entry stored in + * the local software cache. + */ +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid); + +/** + * ib_find_cached_gid - Returns the port number and GID table index where + * a specified GID value occurs. + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the cached GID table where the GID was found. This + * parameter may be NULL. + * + * ib_find_cached_gid() searches for the specified GID value in + * the local software cache. + */ +int ib_find_cached_gid(struct ib_device *device, + union ib_gid *gid, + u8 *port_num, + u16 *index); + +/** + * ib_get_cached_pkey - Returns a cached PKey table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @index: The index into the cached PKey table to query. + * @pkey: The PKey value found at the specified index. + * + * ib_get_cached_pkey() fetches the specified PKey table entry stored in + * the local software cache. + */ +int ib_get_cached_pkey(struct ib_device *device_handle, + u8 port_num, + int index, + u16 *pkey); + +/** + * ib_find_cached_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the cached PKey table where the PKey was found. + * + * ib_find_cached_pkey() searches the specified PKey table in + * the local software cache. + */ +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index); + +/** + * ib_get_cached_lmc - Returns a cached lmc table entry + * @device: The device to query. + * @port_num: The port number of the device to query. + * @lmc: The lmc value for the specified port for that device. + * + * ib_get_cached_lmc() fetches the specified lmc table entry stored in + * the local software cache. + */ +int ib_get_cached_lmc(struct ib_device *device, + u8 port_num, + u8 *lmc); + +#endif /* _IB_CACHE_H */ diff --git a/sys/contrib/rdma/ib_cm.h b/sys/contrib/rdma/ib_cm.h new file mode 100644 index 000000000000..5fa918a949ec --- /dev/null +++ b/sys/contrib/rdma/ib_cm.h @@ -0,0 +1,593 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_cm.h 4311 2005-12-05 18:42:01Z sean.hefty $ + * + * $FreeBSD$ + */ + + +#if !defined(IB_CM_H) +#define IB_CM_H + +#include <contrib/rdma/ib_mad.h> +#include <contrib/rdma/ib_sa.h> + +enum ib_cm_state { + IB_CM_IDLE, + IB_CM_LISTEN, + IB_CM_REQ_SENT, + IB_CM_REQ_RCVD, + IB_CM_MRA_REQ_SENT, + IB_CM_MRA_REQ_RCVD, + IB_CM_REP_SENT, + IB_CM_REP_RCVD, + IB_CM_MRA_REP_SENT, + IB_CM_MRA_REP_RCVD, + IB_CM_ESTABLISHED, + IB_CM_DREQ_SENT, + IB_CM_DREQ_RCVD, + IB_CM_TIMEWAIT, + IB_CM_SIDR_REQ_SENT, + IB_CM_SIDR_REQ_RCVD +}; + +enum ib_cm_lap_state { + IB_CM_LAP_UNINIT, + IB_CM_LAP_IDLE, + IB_CM_LAP_SENT, + IB_CM_LAP_RCVD, + IB_CM_MRA_LAP_SENT, + IB_CM_MRA_LAP_RCVD, +}; + +enum ib_cm_event_type { + IB_CM_REQ_ERROR, + IB_CM_REQ_RECEIVED, + IB_CM_REP_ERROR, + IB_CM_REP_RECEIVED, + IB_CM_RTU_RECEIVED, + IB_CM_USER_ESTABLISHED, + IB_CM_DREQ_ERROR, + IB_CM_DREQ_RECEIVED, + IB_CM_DREP_RECEIVED, + IB_CM_TIMEWAIT_EXIT, + IB_CM_MRA_RECEIVED, + IB_CM_REJ_RECEIVED, + IB_CM_LAP_ERROR, + IB_CM_LAP_RECEIVED, + IB_CM_APR_RECEIVED, + IB_CM_SIDR_REQ_ERROR, + IB_CM_SIDR_REQ_RECEIVED, + IB_CM_SIDR_REP_RECEIVED +}; + +enum ib_cm_data_size { + IB_CM_REQ_PRIVATE_DATA_SIZE = 92, + IB_CM_MRA_PRIVATE_DATA_SIZE = 222, + IB_CM_REJ_PRIVATE_DATA_SIZE = 148, + IB_CM_REP_PRIVATE_DATA_SIZE = 196, + IB_CM_RTU_PRIVATE_DATA_SIZE = 224, + IB_CM_DREQ_PRIVATE_DATA_SIZE = 220, + IB_CM_DREP_PRIVATE_DATA_SIZE = 224, + IB_CM_REJ_ARI_LENGTH = 72, + IB_CM_LAP_PRIVATE_DATA_SIZE = 168, + IB_CM_APR_PRIVATE_DATA_SIZE = 148, + IB_CM_APR_INFO_LENGTH = 72, + IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216, + IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136, + IB_CM_SIDR_REP_INFO_LENGTH = 72, + IB_CM_COMPARE_SIZE = 64 +}; + +struct ib_cm_id; + +struct ib_cm_req_event_param { + struct ib_cm_id *listen_id; + u8 port; + + struct ib_sa_path_rec *primary_path; + struct ib_sa_path_rec *alternate_path; + + __be64 remote_ca_guid; + u32 remote_qkey; + u32 remote_qpn; + enum ib_qp_type qp_type; + + u32 starting_psn; + u8 responder_resources; + u8 initiator_depth; + unsigned int local_cm_response_timeout:5; + unsigned int flow_control:1; + unsigned int remote_cm_response_timeout:5; + unsigned int retry_count:3; + unsigned int rnr_retry_count:3; + unsigned int srq:1; +}; + +struct ib_cm_rep_event_param { + __be64 remote_ca_guid; + u32 remote_qkey; + u32 remote_qpn; + u32 starting_psn; + u8 responder_resources; + u8 initiator_depth; + unsigned int target_ack_delay:5; + unsigned int failover_accepted:2; + unsigned int flow_control:1; + unsigned int rnr_retry_count:3; + unsigned int srq:1; +}; + +enum ib_cm_rej_reason { + IB_CM_REJ_NO_QP = 1, + IB_CM_REJ_NO_EEC = 2, + IB_CM_REJ_NO_RESOURCES = 3, + IB_CM_REJ_TIMEOUT = 4, + IB_CM_REJ_UNSUPPORTED = 5, + IB_CM_REJ_INVALID_COMM_ID = 6, + IB_CM_REJ_INVALID_COMM_INSTANCE = 7, + IB_CM_REJ_INVALID_SERVICE_ID = 8, + 
IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9, + IB_CM_REJ_STALE_CONN = 10, + IB_CM_REJ_RDC_NOT_EXIST = 11, + IB_CM_REJ_INVALID_GID = 12, + IB_CM_REJ_INVALID_LID = 13, + IB_CM_REJ_INVALID_SL = 14, + IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15, + IB_CM_REJ_INVALID_HOP_LIMIT = 16, + IB_CM_REJ_INVALID_PACKET_RATE = 17, + IB_CM_REJ_INVALID_ALT_GID = 18, + IB_CM_REJ_INVALID_ALT_LID = 19, + IB_CM_REJ_INVALID_ALT_SL = 20, + IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21, + IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22, + IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23, + IB_CM_REJ_PORT_CM_REDIRECT = 24, + IB_CM_REJ_PORT_REDIRECT = 25, + IB_CM_REJ_INVALID_MTU = 26, + IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27, + IB_CM_REJ_CONSUMER_DEFINED = 28, + IB_CM_REJ_INVALID_RNR_RETRY = 29, + IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30, + IB_CM_REJ_INVALID_CLASS_VERSION = 31, + IB_CM_REJ_INVALID_FLOW_LABEL = 32, + IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33 +}; + +struct ib_cm_rej_event_param { + enum ib_cm_rej_reason reason; + void *ari; + u8 ari_length; +}; + +struct ib_cm_mra_event_param { + u8 service_timeout; +}; + +struct ib_cm_lap_event_param { + struct ib_sa_path_rec *alternate_path; +}; + +enum ib_cm_apr_status { + IB_CM_APR_SUCCESS, + IB_CM_APR_INVALID_COMM_ID, + IB_CM_APR_UNSUPPORTED, + IB_CM_APR_REJECT, + IB_CM_APR_REDIRECT, + IB_CM_APR_IS_CURRENT, + IB_CM_APR_INVALID_QPN_EECN, + IB_CM_APR_INVALID_LID, + IB_CM_APR_INVALID_GID, + IB_CM_APR_INVALID_FLOW_LABEL, + IB_CM_APR_INVALID_TCLASS, + IB_CM_APR_INVALID_HOP_LIMIT, + IB_CM_APR_INVALID_PACKET_RATE, + IB_CM_APR_INVALID_SL +}; + +struct ib_cm_apr_event_param { + enum ib_cm_apr_status ap_status; + void *apr_info; + u8 info_len; +}; + +struct ib_cm_sidr_req_event_param { + struct ib_cm_id *listen_id; + u8 port; + u16 pkey; +}; + +enum ib_cm_sidr_status { + IB_SIDR_SUCCESS, + IB_SIDR_UNSUPPORTED, + IB_SIDR_REJECT, + IB_SIDR_NO_QP, + IB_SIDR_REDIRECT, + IB_SIDR_UNSUPPORTED_VERSION +}; + +struct ib_cm_sidr_rep_event_param { + enum ib_cm_sidr_status status; + u32 qkey; + u32 qpn; + void *info; + u8 info_len; +}; + +struct ib_cm_event { + enum ib_cm_event_type event; + union { + struct ib_cm_req_event_param req_rcvd; + struct ib_cm_rep_event_param rep_rcvd; + /* No data for RTU received events. */ + struct ib_cm_rej_event_param rej_rcvd; + struct ib_cm_mra_event_param mra_rcvd; + struct ib_cm_lap_event_param lap_rcvd; + struct ib_cm_apr_event_param apr_rcvd; + /* No data for DREQ/DREP received events. */ + struct ib_cm_sidr_req_event_param sidr_req_rcvd; + struct ib_cm_sidr_rep_event_param sidr_rep_rcvd; + enum ib_wc_status send_status; + } param; + + void *private_data; +}; + +/** + * ib_cm_handler - User-defined callback to process communication events. + * @cm_id: Communication identifier associated with the reported event. + * @event: Information about the communication event. + * + * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events + * generated as a result of listen requests result in the allocation of a + * new @cm_id. The new @cm_id is returned to the user through this callback. + * Clients are responsible for destroying the new @cm_id. For peer-to-peer + * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds + * to a user's existing communication identifier. + * + * Users may not call ib_destroy_cm_id while in the context of this callback; + * however, returning a non-zero value instructs the communication manager to + * destroy the @cm_id after the callback completes. 
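For illustration, a minimal handler of the shape the ib_cm_handler typedef (declared next) describes; the event cases shown are a hypothetical subset.

static int
my_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	switch (event->event) {
	case IB_CM_REQ_RECEIVED:
		/* cm_id is a newly allocated identifier we now own. */
		return (0);
	case IB_CM_REJ_RECEIVED:
		/* Non-zero tells the CM to destroy cm_id afterwards. */
		return (1);
	default:
		return (0);
	}
}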
+ */ +typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +struct ib_cm_id { + ib_cm_handler cm_handler; + void *context; + struct ib_device *device; + __be64 service_id; + __be64 service_mask; + enum ib_cm_state state; /* internal CM/debug use */ + enum ib_cm_lap_state lap_state; /* internal CM/debug use */ + __be32 local_id; + __be32 remote_id; + u32 remote_cm_qpn; /* 1 unless redirected */ +}; + +/** + * ib_create_cm_id - Allocate a communication identifier. + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. + * @context: User specified context associated with the communication + * identifier. + * + * Communication identifiers are used to track connection states, service + * ID resolution requests, and listen requests. + */ +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, + void *context); + +/** + * ib_destroy_cm_id - Destroy a connection identifier. + * @cm_id: Connection identifier to destroy. + * + * This call blocks until the connection identifier is destroyed. + */ +void ib_destroy_cm_id(struct ib_cm_id *cm_id); + +#define IB_SERVICE_ID_AGN_MASK __constant_cpu_to_be64(0xFF00000000000000ULL) +#define IB_CM_ASSIGN_SERVICE_ID __constant_cpu_to_be64(0x0200000000000000ULL) +#define IB_CMA_SERVICE_ID __constant_cpu_to_be64(0x0000000001000000ULL) +#define IB_CMA_SERVICE_ID_MASK __constant_cpu_to_be64(0xFFFFFFFFFF000000ULL) +#define IB_SDP_SERVICE_ID __constant_cpu_to_be64(0x0000000000010000ULL) +#define IB_SDP_SERVICE_ID_MASK __constant_cpu_to_be64(0xFFFFFFFFFFFF0000ULL) + +struct ib_cm_compare_data { + u8 data[IB_CM_COMPARE_SIZE]; + u8 mask[IB_CM_COMPARE_SIZE]; +}; + +/** + * ib_cm_listen - Initiates listening on the specified service ID for + * connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + * and service ID resolution requests. The service ID should be specified + * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + * assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + * range of service IDs. If set to 0, the service ID is matched + * exactly. This parameter is ignored if %service_id is set to + * IB_CM_ASSIGN_SERVICE_ID. + * @compare_data: This parameter is optional. It specifies data that must + * appear in the private data of a connection request for the specified + * listen request. + */ +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, + struct ib_cm_compare_data *compare_data); + +struct ib_cm_req_param { + struct ib_sa_path_rec *primary_path; + struct ib_sa_path_rec *alternate_path; + __be64 service_id; + u32 qp_num; + enum ib_qp_type qp_type; + u32 starting_psn; + const void *private_data; + u8 private_data_len; + u8 peer_to_peer; + u8 responder_resources; + u8 initiator_depth; + u8 remote_cm_response_timeout; + u8 flow_control; + u8 local_cm_response_timeout; + u8 retry_count; + u8 rnr_retry_count; + u8 max_cm_retries; + u8 srq; +}; + +/** + * ib_send_cm_req - Sends a connection request to the remote node. + * @cm_id: Connection identifier that will be associated with the + * connection request. + * @param: Connection request information needed to establish the + * connection. 
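A hedged active-side sketch using the request parameters above; the helper name and the timeout/retry values are illustrative only, not recommendations.

static int
my_connect(struct ib_cm_id *cm_id, struct ib_sa_path_rec *path,
    __be64 service_id, u32 qpn)
{
	struct ib_cm_req_param req;

	bzero(&req, sizeof(req));
	req.primary_path = path;
	req.service_id = service_id;
	req.qp_num = qpn;
	req.qp_type = IB_QPT_RC;	/* reliable connected */
	req.responder_resources = 1;
	req.initiator_depth = 1;
	req.retry_count = 7;		/* example values only */
	req.rnr_retry_count = 7;
	req.max_cm_retries = 15;
	return (ib_send_cm_req(cm_id, &req));
}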
+ */ +int ib_send_cm_req(struct ib_cm_id *cm_id, + struct ib_cm_req_param *param); + +struct ib_cm_rep_param { + u32 qp_num; + u32 starting_psn; + const void *private_data; + u8 private_data_len; + u8 responder_resources; + u8 initiator_depth; + u8 target_ack_delay; + u8 failover_accepted; + u8 flow_control; + u8 rnr_retry_count; + u8 srq; +}; + +/** + * ib_send_cm_rep - Sends a connection reply in response to a connection + * request. + * @cm_id: Connection identifier that will be associated with the + * connection request. + * @param: Connection reply information needed to establish the + * connection. + */ +int ib_send_cm_rep(struct ib_cm_id *cm_id, + struct ib_cm_rep_param *param); + +/** + * ib_send_cm_rtu - Sends a connection ready to use message in response + * to a connection reply message. + * @cm_id: Connection identifier associated with the connection request. + * @private_data: Optional user-defined private data sent with the + * ready to use message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_rtu(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_dreq - Sends a disconnection request for an existing + * connection. + * @cm_id: Connection identifier associated with the connection being + * released. + * @private_data: Optional user-defined private data sent with the + * disconnection request message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_dreq(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_drep - Sends a disconnection reply to a disconnection request. + * @cm_id: Connection identifier associated with the connection being + * released. + * @private_data: Optional user-defined private data sent with the + * disconnection reply message. + * @private_data_len: Size of the private data buffer, in bytes. + * + * If the cm_id is in the correct state, the CM will transition the connection + * to the timewait state, even if an error occurs sending the DREP message. + */ +int ib_send_cm_drep(struct ib_cm_id *cm_id, + const void *private_data, + u8 private_data_len); + +/** + * ib_cm_notify - Notifies the CM of an event reported to the consumer. + * @cm_id: Connection identifier to transition to established. + * @event: Type of event. + * + * This routine should be invoked by users to notify the CM of relevant + * communication events. Events that should be reported to the CM and + * when to report them are: + * + * IB_EVENT_COMM_EST - Used when a message is received on a connected + * QP before an RTU has been received. + * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over + * to the alternate path. + */ +int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event); + +/** + * ib_send_cm_rej - Sends a connection rejection message to the + * remote node. + * @cm_id: Connection identifier associated with the connection being + * rejected. + * @reason: Reason for the connection request rejection. + * @ari: Optional additional rejection information. + * @ari_length: Size of the additional rejection information, in bytes. + * @private_data: Optional user-defined private data sent with the + * rejection message. + * @private_data_len: Size of the private data buffer, in bytes. 
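A corresponding passive-side sketch (hypothetical helper): accepting from within the IB_CM_REQ_RECEIVED callback by sending a REP; the peer then answers with an RTU, reported back as IB_CM_RTU_RECEIVED.

static int
my_accept(struct ib_cm_id *cm_id, u32 qpn)
{
	struct ib_cm_rep_param rep;

	bzero(&rep, sizeof(rep));
	rep.qp_num = qpn;
	rep.responder_resources = 1;
	rep.initiator_depth = 1;
	rep.rnr_retry_count = 7;	/* example value */
	return (ib_send_cm_rep(cm_id, &rep));
}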
+ */ +int ib_send_cm_rej(struct ib_cm_id *cm_id, + enum ib_cm_rej_reason reason, + void *ari, + u8 ari_length, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection + * message. + * @cm_id: Connection identifier associated with the connection message. + * @service_timeout: The maximum time required for the sender to reply to + * to the connection message. + * @private_data: Optional user-defined private data sent with the + * message receipt acknowledgement. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_mra(struct ib_cm_id *cm_id, + u8 service_timeout, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_lap - Sends a load alternate path request. + * @cm_id: Connection identifier associated with the load alternate path + * message. + * @alternate_path: A path record that identifies the alternate path to + * load. + * @private_data: Optional user-defined private data sent with the + * load alternate path message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_lap(struct ib_cm_id *cm_id, + struct ib_sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len); + +/** + * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning + * to a specified QP state. + * @cm_id: Communication identifier associated with the QP attributes to + * initialize. + * @qp_attr: On input, specifies the desired QP state. On output, the + * mandatory and desired optional attributes will be set in order to + * modify the QP to the specified state. + * @qp_attr_mask: The QP attribute mask that may be used to transition the + * QP to the specified state. + * + * Users must set the @qp_attr->qp_state to the desired QP state. This call + * will set all required attributes for the given transition, along with + * known optional attributes. Users may override the attributes returned from + * this call before calling ib_modify_qp. + */ +int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +/** + * ib_send_cm_apr - Sends an alternate path response message in response to + * a load alternate path request. + * @cm_id: Connection identifier associated with the alternate path response. + * @status: Reply status sent with the alternate path response. + * @info: Optional additional information sent with the alternate path + * response. + * @info_length: Size of the additional information, in bytes. + * @private_data: Optional user-defined private data sent with the + * alternate path response message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_apr(struct ib_cm_id *cm_id, + enum ib_cm_apr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len); + +struct ib_cm_sidr_req_param { + struct ib_sa_path_rec *path; + __be64 service_id; + int timeout_ms; + const void *private_data; + u8 private_data_len; + u8 max_cm_retries; +}; + +/** + * ib_send_cm_sidr_req - Sends a service ID resolution request to the + * remote node. + * @cm_id: Communication identifier that will be associated with the + * service ID resolution request. + * @param: Service ID resolution request information. 
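The documented ib_cm_init_qp_attr() pattern in sketch form, assuming ib_modify_qp() from ib_verbs.h; error handling is trimmed.

static int
my_qp_to_rtr(struct ib_cm_id *cm_id, struct ib_qp *qp)
{
	struct ib_qp_attr attr;
	int mask, err;

	attr.qp_state = IB_QPS_RTR;	/* desired state on input */
	err = ib_cm_init_qp_attr(cm_id, &attr, &mask);
	if (err)
		return (err);
	return (ib_modify_qp(qp, &attr, mask));
}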
+ */ +int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, + struct ib_cm_sidr_req_param *param); + +struct ib_cm_sidr_rep_param { + u32 qp_num; + u32 qkey; + enum ib_cm_sidr_status status; + const void *info; + u8 info_length; + const void *private_data; + u8 private_data_len; +}; + +/** + * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the + * remote node. + * @cm_id: Communication identifier associated with the received service ID + * resolution request. + * @param: Service ID resolution reply information. + */ +int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, + struct ib_cm_sidr_rep_param *param); + +#endif /* IB_CM_H */ diff --git a/sys/contrib/rdma/ib_fmr_pool.h b/sys/contrib/rdma/ib_fmr_pool.h new file mode 100644 index 000000000000..55996b8228f5 --- /dev/null +++ b/sys/contrib/rdma/ib_fmr_pool.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_fmr_pool.h 2730 2005-06-28 16:43:03Z sean.hefty $ + * + * $FreeBSD$ + */ + +#if !defined(IB_FMR_POOL_H) +#define IB_FMR_POOL_H + +#include <rdma/ib_verbs.h> + +struct ib_fmr_pool; + +/** + * struct ib_fmr_pool_param - Parameters for creating FMR pool + * @max_pages_per_fmr:Maximum number of pages per map request. + * @page_shift: Log2 of sizeof "pages" mapped by this fmr + * @access:Access flags for FMRs in pool. + * @pool_size:Number of FMRs to allocate for pool. + * @dirty_watermark:Flush is triggered when @dirty_watermark dirty + * FMRs are present. + * @flush_function:Callback called when unmapped FMRs are flushed and + * more FMRs are possibly available for mapping + * @flush_arg:Context passed to user's flush function. + * @cache:If set, FMRs may be reused after unmapping for identical map + * requests. 
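A usage sketch for the FMR pool parameters documented above (the structure itself is defined just below); the sizes and access flags are illustrative assumptions.

static struct ib_fmr_pool *
my_fmr_pool_create(struct ib_pd *pd)
{
	struct ib_fmr_pool_param param;

	bzero(&param, sizeof(param));
	param.max_pages_per_fmr = 64;
	param.page_shift = 12;		/* 4KB pages */
	param.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;
	param.pool_size = 32;
	param.dirty_watermark = 8;	/* flush after 8 dirty FMRs */
	param.cache = 1;		/* reuse identical mappings */
	return (ib_create_fmr_pool(pd, &param));
}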
+ */ +struct ib_fmr_pool_param { + int max_pages_per_fmr; + int page_shift; + enum ib_access_flags access; + int pool_size; + int dirty_watermark; + void (*flush_function)(struct ib_fmr_pool *pool, + void * arg); + void *flush_arg; + unsigned cache:1; +}; + +struct ib_pool_fmr { + struct ib_fmr *fmr; + struct ib_fmr_pool *pool; + struct list_head list; + struct hlist_node cache_node; + int ref_count; + int remap_count; + u64 io_virtual_address; + int page_list_len; + u64 page_list[0]; +}; + +struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, + struct ib_fmr_pool_param *params); + +void ib_destroy_fmr_pool(struct ib_fmr_pool *pool); + +int ib_flush_fmr_pool(struct ib_fmr_pool *pool); + +struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, + u64 *page_list, + int list_len, + u64 io_virtual_address); + +int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr); + +#endif /* IB_FMR_POOL_H */ diff --git a/sys/contrib/rdma/ib_mad.h b/sys/contrib/rdma/ib_mad.h new file mode 100644 index 000000000000..7fabc97bc083 --- /dev/null +++ b/sys/contrib/rdma/ib_mad.h @@ -0,0 +1,656 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id: ib_mad.h 5596 2006-03-03 01:00:07Z sean.hefty $ + * + * $FreeBSD$ + */ + +#if !defined( IB_MAD_H ) +#define IB_MAD_H + + +#include <contrib/rdma/ib_verbs.h> + +/* Management base version */ +#define IB_MGMT_BASE_VERSION 1 + +/* Management classes */ +#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01 +#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81 +#define IB_MGMT_CLASS_SUBN_ADM 0x03 +#define IB_MGMT_CLASS_PERF_MGMT 0x04 +#define IB_MGMT_CLASS_BM 0x05 +#define IB_MGMT_CLASS_DEVICE_MGMT 0x06 +#define IB_MGMT_CLASS_CM 0x07 +#define IB_MGMT_CLASS_SNMP 0x08 +#define IB_MGMT_CLASS_DEVICE_ADM 0x10 +#define IB_MGMT_CLASS_BOOT_MGMT 0x11 +#define IB_MGMT_CLASS_BIS 0x12 +#define IB_MGMT_CLASS_CONG_MGMT 0x21 +#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30 +#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F + +#define IB_OPENIB_OUI (0x001405) + +/* Management methods */ +#define IB_MGMT_METHOD_GET 0x01 +#define IB_MGMT_METHOD_SET 0x02 +#define IB_MGMT_METHOD_GET_RESP 0x81 +#define IB_MGMT_METHOD_SEND 0x03 +#define IB_MGMT_METHOD_TRAP 0x05 +#define IB_MGMT_METHOD_REPORT 0x06 +#define IB_MGMT_METHOD_REPORT_RESP 0x86 +#define IB_MGMT_METHOD_TRAP_REPRESS 0x07 + +#define IB_MGMT_METHOD_RESP 0x80 +#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) + +#define IB_MGMT_MAX_METHODS 128 + +/* RMPP information */ +#define IB_MGMT_RMPP_VERSION 1 + +#define IB_MGMT_RMPP_TYPE_DATA 1 +#define IB_MGMT_RMPP_TYPE_ACK 2 +#define IB_MGMT_RMPP_TYPE_STOP 3 +#define IB_MGMT_RMPP_TYPE_ABORT 4 + +#define IB_MGMT_RMPP_FLAG_ACTIVE 1 +#define IB_MGMT_RMPP_FLAG_FIRST (1<<1) +#define IB_MGMT_RMPP_FLAG_LAST (1<<2) + +#define IB_MGMT_RMPP_NO_RESPTIME 0x1F + +#define IB_MGMT_RMPP_STATUS_SUCCESS 0 +#define IB_MGMT_RMPP_STATUS_RESX 1 +#define IB_MGMT_RMPP_STATUS_ABORT_MIN 118 +#define IB_MGMT_RMPP_STATUS_T2L 118 +#define IB_MGMT_RMPP_STATUS_BAD_LEN 119 +#define IB_MGMT_RMPP_STATUS_BAD_SEG 120 +#define IB_MGMT_RMPP_STATUS_BADT 121 +#define IB_MGMT_RMPP_STATUS_W2S 122 +#define IB_MGMT_RMPP_STATUS_S2B 123 +#define IB_MGMT_RMPP_STATUS_BAD_STATUS 124 +#define IB_MGMT_RMPP_STATUS_UNV 125 +#define IB_MGMT_RMPP_STATUS_TMR 126 +#define IB_MGMT_RMPP_STATUS_UNSPEC 127 +#define IB_MGMT_RMPP_STATUS_ABORT_MAX 127 + +#define IB_QP0 0 +#define IB_QP1 __constant_htonl(1) +#define IB_QP1_QKEY 0x80010000 +#define IB_QP_SET_QKEY 0x80000000 + +enum { + IB_MGMT_MAD_HDR = 24, + IB_MGMT_MAD_DATA = 232, + IB_MGMT_RMPP_HDR = 36, + IB_MGMT_RMPP_DATA = 220, + IB_MGMT_VENDOR_HDR = 40, + IB_MGMT_VENDOR_DATA = 216, + IB_MGMT_SA_HDR = 56, + IB_MGMT_SA_DATA = 200, + IB_MGMT_DEVICE_HDR = 64, + IB_MGMT_DEVICE_DATA = 192, +}; + +struct ib_mad_hdr { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + __be16 class_specific; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; +}; + +struct ib_rmpp_hdr { + u8 rmpp_version; + u8 rmpp_type; + u8 rmpp_rtime_flags; + u8 rmpp_status; + __be32 seg_num; + __be32 paylen_newwin; +}; + +typedef u64 ib_sa_comp_mask; + +#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n)) + +/* + * ib_sa_hdr and ib_sa_mad structures must be packed because they have + * 64-bit fields that are only 32-bit aligned. 64-bit architectures will + * lay them out wrong otherwise. 
(And unfortunately they are sent on + * the wire so we can't change the layout) + */ +struct ib_sa_hdr { + __be64 sm_key; + __be16 attr_offset; + __be16 reserved; + ib_sa_comp_mask comp_mask; +} __attribute__ ((packed)); + +struct ib_mad { + struct ib_mad_hdr mad_hdr; + u8 data[IB_MGMT_MAD_DATA]; +}; + +struct ib_rmpp_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 data[IB_MGMT_RMPP_DATA]; +}; + +struct ib_sa_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + struct ib_sa_hdr sa_hdr; + u8 data[IB_MGMT_SA_DATA]; +} __attribute__ ((packed)); + +struct ib_vendor_mad { + struct ib_mad_hdr mad_hdr; + struct ib_rmpp_hdr rmpp_hdr; + u8 reserved; + u8 oui[3]; + u8 data[IB_MGMT_VENDOR_DATA]; +}; + +struct ib_class_port_info +{ + u8 base_version; + u8 class_version; + __be16 capability_mask; + u8 reserved[3]; + u8 resp_time_value; + u8 redirect_gid[16]; + __be32 redirect_tcslfl; + __be16 redirect_lid; + __be16 redirect_pkey; + __be32 redirect_qp; + __be32 redirect_qkey; + u8 trap_gid[16]; + __be32 trap_tcslfl; + __be16 trap_lid; + __be16 trap_pkey; + __be32 trap_hlqp; + __be32 trap_qkey; +}; + +/** + * ib_mad_send_buf - MAD data buffer and work request for sends. + * @next: A pointer used to chain together MADs for posting. + * @mad: References an allocated MAD data buffer for MADs that do not have + * RMPP active. For MADs using RMPP, references the common and management + * class specific headers. + * @mad_agent: MAD agent that allocated the buffer. + * @ah: The address handle to use when sending the MAD. + * @context: User-controlled context fields. + * @hdr_len: Indicates the size of the data header of the MAD. This length + * includes the common MAD, RMPP, and class specific headers. + * @data_len: Indicates the total size of user-transferred data. + * @seg_count: The number of RMPP segments allocated for this send. + * @seg_size: Size of each RMPP segment. + * @timeout_ms: Time to wait for a response. + * @retries: Number of times to retry a request for a response. + * + * Users are responsible for initializing the MAD buffer itself, with the + * exception of any RMPP header. Additional segment buffer space allocated + * beyond data_len is padding. + */ +struct ib_mad_send_buf { + struct ib_mad_send_buf *next; + void *mad; + struct ib_mad_agent *mad_agent; + struct ib_ah *ah; + void *context[2]; + int hdr_len; + int data_len; + int seg_count; + int seg_size; + int timeout_ms; + int retries; +}; + +/** + * ib_response_mad - Returns if the specified MAD has been generated in + * response to a sent request or trap. + */ +int ib_response_mad(struct ib_mad *mad); + +/** + * ib_get_rmpp_resptime - Returns the RMPP response time. + * @rmpp_hdr: An RMPP header. + */ +static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr) +{ + return rmpp_hdr->rmpp_rtime_flags >> 3; +} + +/** + * ib_get_rmpp_flags - Returns the RMPP flags. + * @rmpp_hdr: An RMPP header. + */ +static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr) +{ + return rmpp_hdr->rmpp_rtime_flags & 0x7; +} + +/** + * ib_set_rmpp_resptime - Sets the response time in an RMPP header. + * @rmpp_hdr: An RMPP header. + * @rtime: The response time to set. + */ +static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime) +{ + rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3); +} + +/** + * ib_set_rmpp_flags - Sets the flags in an RMPP header. + * @rmpp_hdr: An RMPP header. + * @flags: The flags to set. 
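A worked example of the accessors above, assuming a header zeroed by the allocator: rmpp_rtime_flags packs the 5-bit response time in the upper bits and the 3-bit flags in the lower bits.

static void
my_rmpp_first_seg(struct ib_rmpp_hdr *hdr)
{
	ib_set_rmpp_flags(hdr, IB_MGMT_RMPP_FLAG_ACTIVE |
	    IB_MGMT_RMPP_FLAG_FIRST);
	ib_set_rmpp_resptime(hdr, IB_MGMT_RMPP_NO_RESPTIME);
	/* flags 0x3 with rtime 0x1f give rmpp_rtime_flags == 0xfb */
}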
+ */ +static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags) +{ + rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF1) | + (flags & 0x7); +} + +struct ib_mad_agent; +struct ib_mad_send_wc; +struct ib_mad_recv_wc; + +/** + * ib_mad_send_handler - callback handler for a sent MAD. + * @mad_agent: MAD agent that sent the MAD. + * @mad_send_wc: Send work completion information on the sent MAD. + */ +typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_wc *mad_send_wc); + +/** + * ib_mad_snoop_handler - Callback handler for snooping sent MADs. + * @mad_agent: MAD agent that snooped the MAD. + * @send_wr: Work request information on the sent MAD. + * @mad_send_wc: Work completion information on the sent MAD. Valid + * only for snooping that occurs on a send completion. + * + * Clients snooping MADs should not modify data referenced by the @send_wr + * or @mad_send_wc. + */ +typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, + struct ib_mad_send_wc *mad_send_wc); + +/** + * ib_mad_recv_handler - callback handler for a received MAD. + * @mad_agent: MAD agent requesting the received MAD. + * @mad_recv_wc: Received work completion information on the received MAD. + * + * MADs received in response to a send request operation will be handed to + * the user before the send operation completes. All data buffers given + * to registered agents through this routine are owned by the receiving + * client, except for snooping agents. Clients snooping MADs should not + * modify the data referenced by @mad_recv_wc. + */ +typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_recv_wc); + +/** + * ib_mad_agent - Used to track MAD registration with the access layer. + * @device: Reference to device registration is on. + * @qp: Reference to QP used for sending and receiving MADs. + * @mr: Memory region for system memory usable for DMA. + * @recv_handler: Callback handler for a received MAD. + * @send_handler: Callback handler for a sent MAD. + * @snoop_handler: Callback handler for snooped sent MADs. + * @context: User-specified context associated with this registration. + * @hi_tid: Access layer assigned transaction ID for this client. + * Unsolicited MADs sent by this client will have the upper 32-bits + * of their TID set to this value. + * @port_num: Port number on which QP is registered + * @rmpp_version: If set, indicates the RMPP version used by this agent. + */ +struct ib_mad_agent { + struct ib_device *device; + struct ib_qp *qp; + struct ib_mr *mr; + ib_mad_recv_handler recv_handler; + ib_mad_send_handler send_handler; + ib_mad_snoop_handler snoop_handler; + void *context; + u32 hi_tid; + u8 port_num; + u8 rmpp_version; +}; + +/** + * ib_mad_send_wc - MAD send completion information. + * @send_buf: Send MAD data buffer associated with the send MAD request. + * @status: Completion status. + * @vendor_err: Optional vendor error information returned with a failed + * request. + */ +struct ib_mad_send_wc { + struct ib_mad_send_buf *send_buf; + enum ib_wc_status status; + u32 vendor_err; +}; + +/** + * ib_mad_recv_buf - received MAD buffer information. + * @list: Reference to next data buffer for a received RMPP MAD. + * @grh: References a data buffer containing the global route header. + * The data refereced by this buffer is only valid if the GRH is + * valid. + * @mad: References the start of the received MAD. 
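An illustrative receive handler of the shape the ib_mad_recv_handler typedef above describes; per the ownership rule there, the buffers are returned with ib_free_recv_mad(), declared later in this header.

static void
my_recv_handler(struct ib_mad_agent *mad_agent,
    struct ib_mad_recv_wc *mad_recv_wc)
{
	struct ib_mad *mad = mad_recv_wc->recv_buf.mad;

	if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
		/* ... consume the response ... */
	}
	/* The receiving client owns the buffers and must free them. */
	ib_free_recv_mad(mad_recv_wc);
}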
+ */ +struct ib_mad_recv_buf { + TAILQ_ENTRY(ib_mad_recv_buf) entry; + struct ib_grh *grh; + struct ib_mad *mad; +}; + +/** + * ib_mad_recv_wc - received MAD information. + * @wc: Completion information for the received data. + * @recv_buf: Specifies the location of the received data buffer(s). + * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers. + * @mad_len: The length of the received MAD, without duplicated headers. + * + * For received response, the wr_id contains a pointer to the ib_mad_send_buf + * for the corresponding send request. + */ +struct ib_mad_recv_wc { + struct ib_wc *wc; + struct ib_mad_recv_buf recv_buf; + TAILQ_ENTRY(ib_mad_recv_wc) entry; + int mad_len; +}; + +/** + * ib_mad_reg_req - MAD registration request + * @mgmt_class: Indicates which management class of MADs should be receive + * by the caller. This field is only required if the user wishes to + * receive unsolicited MADs, otherwise it should be 0. + * @mgmt_class_version: Indicates which version of MADs for the given + * management class to receive. + * @oui: Indicates IEEE OUI when mgmt_class is a vendor class + * in the range from 0x30 to 0x4f. Otherwise not used. + * @method_mask: The caller will receive unsolicited MADs for any method + * where @method_mask = 1. + */ +struct ib_mad_reg_req { + u8 mgmt_class; + u8 mgmt_class_version; + u8 oui[3]; +#ifdef notyet + DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS); +#endif +}; + +/** + * ib_register_mad_agent - Register to send/receive MADs. + * @device: The device to register with. + * @port_num: The port on the specified device to use. + * @qp_type: Specifies which QP to access. Must be either + * IB_QPT_SMI or IB_QPT_GSI. + * @mad_reg_req: Specifies which unsolicited MADs should be received + * by the caller. This parameter may be NULL if the caller only + * wishes to receive solicited responses. + * @rmpp_version: If set, indicates that the client will send + * and receive MADs that contain the RMPP header for the given version. + * If set to 0, indicates that RMPP is not used by this client. + * @send_handler: The completion callback routine invoked after a send + * request has completed. + * @recv_handler: The completion callback routine invoked for a received + * MAD. + * @context: User specified context associated with the registration. + */ +struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + struct ib_mad_reg_req *mad_reg_req, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context); + +enum ib_mad_snoop_flags { + /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ + /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/ + IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2), + /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/ + IB_MAD_SNOOP_RECVS = (1<<4) + /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/ + /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/ +}; + +/** + * ib_register_mad_snoop - Register to snoop sent and received MADs. + * @device: The device to register with. + * @port_num: The port on the specified device to use. + * @qp_type: Specifies which QP traffic to snoop. Must be either + * IB_QPT_SMI or IB_QPT_GSI. + * @mad_snoop_flags: Specifies information where snooping occurs. + * @send_handler: The callback routine invoked for a snooped send. + * @recv_handler: The callback routine invoked for a snooped receive. + * @context: User specified context associated with the registration. 
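A hypothetical registration sketch: attach an agent to the GSI (QP1) on port 1, receiving only solicited responses (NULL registration request, no RMPP). my_send_handler and my_recv_handler are assumed callbacks of the handler types above.

static struct ib_mad_agent *
my_agent_create(struct ib_device *device)
{
	return (ib_register_mad_agent(device, 1 /* port */, IB_QPT_GSI,
	    NULL /* solicited responses only */, 0 /* no RMPP */,
	    my_send_handler, my_recv_handler, NULL));
}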
+ */ +struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, + u8 port_num, + enum ib_qp_type qp_type, + int mad_snoop_flags, + ib_mad_snoop_handler snoop_handler, + ib_mad_recv_handler recv_handler, + void *context); + +/** + * ib_unregister_mad_agent - Unregisters a client from using MAD services. + * @mad_agent: Corresponding MAD registration request to deregister. + * + * After invoking this routine, MAD services are no longer usable by the + * client on the associated QP. + */ +int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); + +/** + * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated + * with the registered client. + * @send_buf: Specifies the information needed to send the MAD(s). + * @bad_send_buf: Specifies the MAD on which an error was encountered. This + * parameter is optional if only a single MAD is posted. + * + * Sent MADs are not guaranteed to complete in the order that they were posted. + * + * If the MAD requires RMPP, the data buffer should contain a single copy + * of the common MAD, RMPP, and class specific headers, followed by the class + * defined data. If the class defined data would not divide evenly into + * RMPP segments, then space must be allocated at the end of the referenced + * buffer for any required padding. To indicate the amount of class defined + * data being transferred, the paylen_newwin field in the RMPP header should + * be set to the size of the class specific header plus the amount of class + * defined data being transferred. The paylen_newwin field should be + * specified in network-byte order. + */ +int ib_post_send_mad(struct ib_mad_send_buf *send_buf, + struct ib_mad_send_buf **bad_send_buf); + + +/** + * ib_free_recv_mad - Returns data buffers used to receive a MAD. + * @mad_recv_wc: Work completion information for a received MAD. + * + * Clients receiving MADs through their ib_mad_recv_handler must call this + * routine to return the work completion buffers to the access layer. + */ +void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc); + +/** + * ib_cancel_mad - Cancels an outstanding send MAD operation. + * @mad_agent: Specifies the registration associated with sent MAD. + * @send_buf: Indicates the MAD to cancel. + * + * MADs will be returned to the user through the corresponding + * ib_mad_send_handler. + */ +void ib_cancel_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf); + +/** + * ib_modify_mad - Modifies an outstanding send MAD operation. + * @mad_agent: Specifies the registration associated with sent MAD. + * @send_buf: Indicates the MAD to modify. + * @timeout_ms: New timeout value for sent MAD. + * + * This call will reset the timeout value for a sent MAD to the specified + * value. + */ +int ib_modify_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, u32 timeout_ms); + +/** + * ib_redirect_mad_qp - Registers a QP for MAD services. + * @qp: Reference to a QP that requires MAD services. + * @rmpp_version: If set, indicates that the client will send + * and receive MADs that contain the RMPP header for the given version. + * If set to 0, indicates that RMPP is not used by this client. + * @send_handler: The completion callback routine invoked after a send + * request has completed. + * @recv_handler: The completion callback routine invoked for a received + * MAD. + * @context: User specified context associated with the registration. + * + * Use of this call allows clients to use MAD services, such as RMPP, + * on user-owned QPs. 
After calling this routine, users may send + * MADs on the specified QP by calling ib_mad_post_send. + */ +struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp, + u8 rmpp_version, + ib_mad_send_handler send_handler, + ib_mad_recv_handler recv_handler, + void *context); + +/** + * ib_process_mad_wc - Processes a work completion associated with a + * MAD sent or received on a redirected QP. + * @mad_agent: Specifies the registered MAD service using the redirected QP. + * @wc: References a work completion associated with a sent or received + * MAD segment. + * + * This routine is used to complete or continue processing on a MAD request. + * If the work completion is associated with a send operation, calling + * this routine is required to continue an RMPP transfer or to wait for a + * corresponding response, if it is a request. If the work completion is + * associated with a receive operation, calling this routine is required to + * process an inbound or outbound RMPP transfer, or to match a response MAD + * with its corresponding request. + */ +int ib_process_mad_wc(struct ib_mad_agent *mad_agent, + struct ib_wc *wc); + +/** + * ib_create_send_mad - Allocate and initialize a data buffer and work request + * for sending a MAD. + * @mad_agent: Specifies the registered MAD service to associate with the MAD. + * @remote_qpn: Specifies the QPN of the receiving node. + * @pkey_index: Specifies which PKey the MAD will be sent using. This field + * is valid only if the remote_qpn is QP 1. + * @rmpp_active: Indicates if the send will enable RMPP. + * @hdr_len: Indicates the size of the data header of the MAD. This length + * should include the common MAD header, RMPP header, plus any class + * specific header. + * @data_len: Indicates the size of any user-transferred data. The call will + * automatically adjust the allocated buffer size to account for any + * additional padding that may be necessary. + * @gfp_mask: GFP mask used for the memory allocation. + * + * This routine allocates a MAD for sending. The returned MAD send buffer + * will reference a data buffer usable for sending a MAD, along + * with an initialized work request structure. Users may modify the returned + * MAD data buffer before posting the send. + * + * The returned MAD header, class specific headers, and any padding will be + * cleared. Users are responsible for initializing the common MAD header, + * any class specific header, and MAD data area. + * If @rmpp_active is set, the RMPP header will be initialized for sending. + */ +struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, + u32 remote_qpn, u16 pkey_index, + int rmpp_active, + int hdr_len, int data_len, + int gfp_mask); + +/** + * ib_is_mad_class_rmpp - returns whether given management class + * supports RMPP. + * @mgmt_class: management class + * + * This routine returns whether the management class supports RMPP. + */ +int ib_is_mad_class_rmpp(u8 mgmt_class); + +/** + * ib_get_mad_data_offset - returns the data offset for a given + * management class. + * @mgmt_class: management class + * + * This routine returns the data offset in the MAD for the management + * class requested. + */ +int ib_get_mad_data_offset(u8 mgmt_class); + +/** + * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment. + * @send_buf: Previously allocated send data buffer. + * @seg_num: number of segment to return + * + * This routine returns a pointer to the data buffer of an RMPP MAD. 
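A hedged send-side sketch combining ib_create_send_mad() above with ib_post_send_mad(): the M_NOWAIT allocation flag and the NULL-on-failure convention are assumptions about this port, and ah is an address handle created elsewhere.

static int
my_send_get(struct ib_mad_agent *agent, struct ib_ah *ah)
{
	struct ib_mad_send_buf *msg;
	struct ib_mad *mad;

	msg = ib_create_send_mad(agent, 1 /* QP1 */, 0 /* pkey index */,
	    0 /* no RMPP */, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, M_NOWAIT);
	if (msg == NULL)
		return (ENOMEM);
	msg->ah = ah;
	msg->timeout_ms = 1000;		/* example values */
	msg->retries = 3;
	mad = msg->mad;
	mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
	mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
	mad->mad_hdr.class_version = 1;
	mad->mad_hdr.method = IB_MGMT_METHOD_GET;
	return (ib_post_send_mad(msg, NULL));
}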
+ * Users must provide synchronization to @send_buf around this call. + */ +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num); + +/** + * ib_free_send_mad - Returns data buffers used to send a MAD. + * @send_buf: Previously allocated send data buffer. + */ +void ib_free_send_mad(struct ib_mad_send_buf *send_buf); + +#endif /* IB_MAD_H */ diff --git a/sys/contrib/rdma/ib_marshall.h b/sys/contrib/rdma/ib_marshall.h new file mode 100644 index 000000000000..60cd219efd04 --- /dev/null +++ b/sys/contrib/rdma/ib_marshall.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $FreeBSD$ + */ + +#if !defined(IB_USER_MARSHALL_H) +#define IB_USER_MARSHALL_H + +#include <rdma/ib_verbs.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_user_sa.h> + +void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src); + +void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, + struct ib_ah_attr *src); + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct ib_sa_path_rec *src); + +void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, + struct ib_user_path_rec *src); + +#endif /* IB_USER_MARSHALL_H */ diff --git a/sys/contrib/rdma/ib_pack.h b/sys/contrib/rdma/ib_pack.h new file mode 100644 index 000000000000..206d1f1c0a63 --- /dev/null +++ b/sys/contrib/rdma/ib_pack.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_pack.h 1349 2004-12-16 21:09:43Z roland $ + * + * $FreeBSD$ + */ + +#ifndef IB_PACK_H +#define IB_PACK_H + +#include <rdma/ib_verbs.h> + +enum { + IB_LRH_BYTES = 8, + IB_GRH_BYTES = 40, + IB_BTH_BYTES = 12, + IB_DETH_BYTES = 8 +}; + +struct ib_field { + size_t struct_offset_bytes; + size_t struct_size_bytes; + int offset_words; + int offset_bits; + int size_bits; + char *field_name; +}; + +#define RESERVED \ + .field_name = "reserved" + +/* + * This macro cleans up the definitions of constants for BTH opcodes. + * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY, + * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives + * the correct value. + * + * In short, user code should use the constants defined using the + * macro rather than worrying about adding together other constants. +*/ +#define IB_OPCODE(transport, op) \ + IB_OPCODE_ ## transport ## _ ## op = \ + IB_OPCODE_ ## transport + IB_OPCODE_ ## op + +enum { + /* transport types -- just used to define real constants */ + IB_OPCODE_RC = 0x00, + IB_OPCODE_UC = 0x20, + IB_OPCODE_RD = 0x40, + IB_OPCODE_UD = 0x60, + + /* operations -- just used to define real constants */ + IB_OPCODE_SEND_FIRST = 0x00, + IB_OPCODE_SEND_MIDDLE = 0x01, + IB_OPCODE_SEND_LAST = 0x02, + IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, + IB_OPCODE_SEND_ONLY = 0x04, + IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, + IB_OPCODE_RDMA_WRITE_FIRST = 0x06, + IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07, + IB_OPCODE_RDMA_WRITE_LAST = 0x08, + IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, + IB_OPCODE_RDMA_WRITE_ONLY = 0x0a, + IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, + IB_OPCODE_RDMA_READ_REQUEST = 0x0c, + IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, + IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, + IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, + IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, + IB_OPCODE_ACKNOWLEDGE = 0x11, + IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, + IB_OPCODE_COMPARE_SWAP = 0x13, + IB_OPCODE_FETCH_ADD = 0x14, + + /* real constants follow -- see comment about above IB_OPCODE() + macro for more details */ + + /* RC */ + IB_OPCODE(RC, SEND_FIRST), + IB_OPCODE(RC, SEND_MIDDLE), + IB_OPCODE(RC, SEND_LAST), + IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, SEND_ONLY), + IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_FIRST), + IB_OPCODE(RC, RDMA_WRITE_MIDDLE), + IB_OPCODE(RC, RDMA_WRITE_LAST), + IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_ONLY), + IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_READ_REQUEST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RC, ACKNOWLEDGE), + IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), + IB_OPCODE(RC, 
COMPARE_SWAP), + IB_OPCODE(RC, FETCH_ADD), + + /* UC */ + IB_OPCODE(UC, SEND_FIRST), + IB_OPCODE(UC, SEND_MIDDLE), + IB_OPCODE(UC, SEND_LAST), + IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, SEND_ONLY), + IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_FIRST), + IB_OPCODE(UC, RDMA_WRITE_MIDDLE), + IB_OPCODE(UC, RDMA_WRITE_LAST), + IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_ONLY), + IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + + /* RD */ + IB_OPCODE(RD, SEND_FIRST), + IB_OPCODE(RD, SEND_MIDDLE), + IB_OPCODE(RD, SEND_LAST), + IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, SEND_ONLY), + IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_FIRST), + IB_OPCODE(RD, RDMA_WRITE_MIDDLE), + IB_OPCODE(RD, RDMA_WRITE_LAST), + IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_ONLY), + IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_READ_REQUEST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RD, ACKNOWLEDGE), + IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), + IB_OPCODE(RD, COMPARE_SWAP), + IB_OPCODE(RD, FETCH_ADD), + + /* UD */ + IB_OPCODE(UD, SEND_ONLY), + IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) +}; + +enum { + IB_LNH_RAW = 0, + IB_LNH_IP = 1, + IB_LNH_IBA_LOCAL = 2, + IB_LNH_IBA_GLOBAL = 3 +}; + +struct ib_unpacked_lrh { + u8 virtual_lane; + u8 link_version; + u8 service_level; + u8 link_next_header; + __be16 destination_lid; + __be16 packet_length; + __be16 source_lid; +}; + +struct ib_unpacked_grh { + u8 ip_version; + u8 traffic_class; + __be32 flow_label; + __be16 payload_length; + u8 next_header; + u8 hop_limit; + union ib_gid source_gid; + union ib_gid destination_gid; +}; + +struct ib_unpacked_bth { + u8 opcode; + u8 solicited_event; + u8 mig_req; + u8 pad_count; + u8 transport_header_version; + __be16 pkey; + __be32 destination_qpn; + u8 ack_req; + __be32 psn; +}; + +struct ib_unpacked_deth { + __be32 qkey; + __be32 source_qpn; +}; + +struct ib_ud_header { + struct ib_unpacked_lrh lrh; + int grh_present; + struct ib_unpacked_grh grh; + struct ib_unpacked_bth bth; + struct ib_unpacked_deth deth; + int immediate_present; + __be32 immediate_data; +}; + +void ib_pack(const struct ib_field *desc, + int desc_len, + void *structure, + void *buf); + +void ib_unpack(const struct ib_field *desc, + int desc_len, + void *buf, + void *structure); + +void ib_ud_header_init(int payload_bytes, + int grh_present, + struct ib_ud_header *header); + +int ib_ud_header_pack(struct ib_ud_header *header, + void *buf); + +int ib_ud_header_unpack(void *buf, + struct ib_ud_header *header); + +#endif /* IB_PACK_H */ diff --git a/sys/contrib/rdma/ib_sa.h b/sys/contrib/rdma/ib_sa.h new file mode 100644 index 000000000000..bf6a28a1c616 --- /dev/null +++ b/sys/contrib/rdma/ib_sa.h @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_sa.h 2811 2005-07-06 18:11:43Z halr $ + * + * $FreeBSD$ + */ + +#ifndef IB_SA_H +#define IB_SA_H + +#include <machine/atomic.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_mad.h> + +enum { + IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ + + IB_SA_METHOD_GET_TABLE = 0x12, + IB_SA_METHOD_GET_TABLE_RESP = 0x92, + IB_SA_METHOD_DELETE = 0x15, + IB_SA_METHOD_DELETE_RESP = 0x95, + IB_SA_METHOD_GET_MULTI = 0x14, + IB_SA_METHOD_GET_MULTI_RESP = 0x94, + IB_SA_METHOD_GET_TRACE_TBL = 0x13 +}; + +enum { + IB_SA_ATTR_CLASS_PORTINFO = 0x01, + IB_SA_ATTR_NOTICE = 0x02, + IB_SA_ATTR_INFORM_INFO = 0x03, + IB_SA_ATTR_NODE_REC = 0x11, + IB_SA_ATTR_PORT_INFO_REC = 0x12, + IB_SA_ATTR_SL2VL_REC = 0x13, + IB_SA_ATTR_SWITCH_REC = 0x14, + IB_SA_ATTR_LINEAR_FDB_REC = 0x15, + IB_SA_ATTR_RANDOM_FDB_REC = 0x16, + IB_SA_ATTR_MCAST_FDB_REC = 0x17, + IB_SA_ATTR_SM_INFO_REC = 0x18, + IB_SA_ATTR_LINK_REC = 0x20, + IB_SA_ATTR_GUID_INFO_REC = 0x30, + IB_SA_ATTR_SERVICE_REC = 0x31, + IB_SA_ATTR_PARTITION_REC = 0x33, + IB_SA_ATTR_PATH_REC = 0x35, + IB_SA_ATTR_VL_ARB_REC = 0x36, + IB_SA_ATTR_MC_MEMBER_REC = 0x38, + IB_SA_ATTR_TRACE_REC = 0x39, + IB_SA_ATTR_MULTI_PATH_REC = 0x3a, + IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b, + IB_SA_ATTR_INFORM_INFO_REC = 0xf3 +}; + +enum ib_sa_selector { + IB_SA_GT = 0, + IB_SA_LT = 1, + IB_SA_EQ = 2, + /* + * The meaning of "best" depends on the attribute: for + * example, for MTU best will return the largest available + * MTU, while for packet life time, best will return the + * smallest available life time. + */ + IB_SA_BEST = 3 +}; + +/* + * Structures for SA records are named "struct ib_sa_xxx_rec." No + * attempt is made to pack structures to match the physical layout of + * SA records in SA MADs; all packing and unpacking is handled by the + * SA query code. + * + * For a record with structure ib_sa_xxx_rec, the naming convention + * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we + * never use different abbreviations or otherwise change the spelling + * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY). + * + * Reserved rows are indicated with comments to help maintainability. 
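+ *
+ * For example, a path record query that matches on the source and
+ * destination GIDs would fill in those fields and pass the
+ * corresponding mask bits (a sketch; the GID variables are
+ * illustrative):
+ *
+ *	struct ib_sa_path_rec rec;
+ *	ib_sa_comp_mask mask;
+ *
+ *	memset(&rec, 0, sizeof rec);
+ *	rec.dgid = remote_gid;
+ *	rec.sgid = local_gid;
+ *	mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID;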
+ */ + +/* reserved: 0 */ +/* reserved: 1 */ +#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2) +#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3) +#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4) +#define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5) +#define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6) +/* reserved: 7 */ +#define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8) +#define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9) +#define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10) +#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11) +#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12) +#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13) +/* reserved: 14 */ +#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15) +#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16) +#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17) +#define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18) +#define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19) +#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20) +#define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21) +#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) + +struct ib_sa_path_rec { + /* reserved */ + /* reserved */ + union ib_gid dgid; + union ib_gid sgid; + __be16 dlid; + __be16 slid; + int raw_traffic; + /* reserved */ + __be32 flow_label; + u8 hop_limit; + u8 traffic_class; + int reversible; + u8 numb_path; + __be16 pkey; + /* reserved */ + u8 sl; + u8 mtu_selector; + u8 mtu; + u8 rate_selector; + u8 rate; + u8 packet_life_time_selector; + u8 packet_life_time; + u8 preference; +}; + +#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) +#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) +#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) +#define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3) +#define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4) +#define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5) +#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6) +#define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7) +#define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8) +#define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9) +#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10) +#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11) +#define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12) +#define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13) +#define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14) +#define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15) +#define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16) +#define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17) + +struct ib_sa_mcmember_rec { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtu_selector; + u8 mtu; + u8 traffic_class; + __be16 pkey; + u8 rate_selector; + u8 rate; + u8 packet_life_time_selector; + u8 packet_life_time; + u8 sl; + __be32 flow_label; + u8 hop_limit; + u8 scope; + u8 join_state; + int proxy_join; +}; + +/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */ +#define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0) +#define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1) +#define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2) +/* reserved: 3 */ +#define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4) +#define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5) +#define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_2 
IB_SA_COMP_MASK( 9) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21) +#define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29) +#define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33) +#define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34) +#define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35) +#define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36) + +#define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF + +struct ib_sa_service_rec { + u64 id; + union ib_gid gid; + __be16 pkey; + /* reserved */ + u32 lease; + u8 key[16]; + u8 name[64]; + u8 data8[16]; + u16 data16[8]; + u32 data32[4]; + u64 data64[2]; +}; + +struct ib_sa_client { + volatile int users; +#ifdef notyet + struct completion comp; +#endif +}; + +/** + * ib_sa_register_client - Register an SA client. + */ +void ib_sa_register_client(struct ib_sa_client *client); + +/** + * ib_sa_unregister_client - Deregister an SA client. + * @client: Client object to deregister. + */ +void ib_sa_unregister_client(struct ib_sa_client *client); + +struct ib_sa_query; + +void ib_sa_cancel_query(int id, struct ib_sa_query *query); + +int ib_sa_path_rec_get(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, int gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **query); + +int ib_sa_service_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, int gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +struct ib_sa_multicast { + struct ib_sa_mcmember_rec rec; + ib_sa_comp_mask comp_mask; + int (*callback)(int status, + struct ib_sa_multicast *multicast); + void *context; +}; + +/** + * ib_sa_join_multicast - Initiates a join request to the specified multicast + * group. + * @client: SA client + * @device: Device associated with the multicast group. 
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @rec: SA multicast member record specifying group attributes.
+ * @comp_mask: Component mask indicating which group attributes of %rec are
+ * valid.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the join operation completes.
+ * @context: User specified context stored with the ib_sa_multicast structure.
+ *
+ * This call initiates a multicast join request with the SA for the specified
+ * multicast group. If the join operation is started successfully, it returns
+ * an ib_sa_multicast structure that is used to track the multicast operation.
+ * Users must free this structure by calling ib_sa_free_multicast, even if the
+ * join operation later fails (that is, if the callback status is non-zero).
+ *
+ * If the join operation fails, status will be non-zero, with the following
+ * failures possible:
+ * ETIMEDOUT: The request timed out.
+ * EIO: An error occurred sending the query.
+ * EINVAL: The MCMemberRecord values differed from the existing group's.
+ * ENETRESET: Indicates that a fatal error has occurred on the multicast
+ * group, and the user must rejoin the group to continue using it.
+ */
+struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
+					     struct ib_device *device, u8 port_num,
+					     struct ib_sa_mcmember_rec *rec,
+					     ib_sa_comp_mask comp_mask, int gfp_mask,
+					     int (*callback)(int status,
+							     struct ib_sa_multicast
+							     *multicast),
+					     void *context);
+
+/**
+ * ib_sa_free_multicast - Frees the multicast tracking structure, and releases
+ * any reference on the multicast group.
+ * @multicast: Multicast tracking structure allocated by ib_sa_join_multicast.
+ *
+ * This call blocks until the multicast identifier is destroyed. It may
+ * not be called from within the multicast callback; however, returning a
+ * non-zero value from the callback will result in destroying the multicast
+ * tracking structure.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
+
+/**
+ * ib_sa_get_mcmember_rec - Looks up a multicast member record by its MGID and
+ * returns it if found.
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @mgid: MGID of multicast group.
+ * @rec: Location to copy SA multicast member record.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize address handle attributes based on
+ * an SA multicast member record.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+			     struct ib_sa_mcmember_rec *rec,
+			     struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_path - Initialize address handle attributes based on an SA
+ * path record.
+ */
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec,
+			 struct ib_ah_attr *ah_attr);
+
+#endif /* IB_SA_H */
diff --git a/sys/contrib/rdma/ib_smi.h b/sys/contrib/rdma/ib_smi.h
new file mode 100644
index 000000000000..0e4b1e940d8e
--- /dev/null
+++ b/sys/contrib/rdma/ib_smi.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_smi.h 1389 2004-12-27 22:56:47Z roland $ + * + * $FreeBSD$ + */ + +#if !defined( IB_SMI_H ) +#define IB_SMI_H + +#include <rdma/ib_mad.h> + +#define IB_SMP_DATA_SIZE 64 +#define IB_SMP_MAX_PATH_HOPS 64 + +struct ib_smp { + u8 base_version; + u8 mgmt_class; + u8 class_version; + u8 method; + __be16 status; + u8 hop_ptr; + u8 hop_cnt; + __be64 tid; + __be16 attr_id; + __be16 resv; + __be32 attr_mod; + __be64 mkey; + __be16 dr_slid; + __be16 dr_dlid; + u8 reserved[28]; + u8 data[IB_SMP_DATA_SIZE]; + u8 initial_path[IB_SMP_MAX_PATH_HOPS]; + u8 return_path[IB_SMP_MAX_PATH_HOPS]; +} __attribute__ ((packed)); + +#define IB_SMP_DIRECTION __constant_htons(0x8000) + +/* Subnet management attributes */ +#define IB_SMP_ATTR_NOTICE __constant_htons(0x0002) +#define IB_SMP_ATTR_NODE_DESC __constant_htons(0x0010) +#define IB_SMP_ATTR_NODE_INFO __constant_htons(0x0011) +#define IB_SMP_ATTR_SWITCH_INFO __constant_htons(0x0012) +#define IB_SMP_ATTR_GUID_INFO __constant_htons(0x0014) +#define IB_SMP_ATTR_PORT_INFO __constant_htons(0x0015) +#define IB_SMP_ATTR_PKEY_TABLE __constant_htons(0x0016) +#define IB_SMP_ATTR_SL_TO_VL_TABLE __constant_htons(0x0017) +#define IB_SMP_ATTR_VL_ARB_TABLE __constant_htons(0x0018) +#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE __constant_htons(0x0019) +#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE __constant_htons(0x001A) +#define IB_SMP_ATTR_MCAST_FORWARD_TABLE __constant_htons(0x001B) +#define IB_SMP_ATTR_SM_INFO __constant_htons(0x0020) +#define IB_SMP_ATTR_VENDOR_DIAG __constant_htons(0x0030) +#define IB_SMP_ATTR_LED_INFO __constant_htons(0x0031) +#define IB_SMP_ATTR_VENDOR_MASK __constant_htons(0xFF00) + +struct ib_port_info { + __be64 mkey; + __be64 gid_prefix; + __be16 lid; + __be16 sm_lid; + __be32 cap_mask; + __be16 diag_code; + __be16 mkey_lease_period; + u8 local_port_num; + u8 link_width_enabled; + u8 link_width_supported; + u8 link_width_active; + u8 linkspeed_portstate; /* 4 bits, 4 bits */ + u8 portphysstate_linkdown; /* 4 bits, 4 bits */ + u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */ + u8 linkspeedactive_enabled; /* 4 bits, 4 bits */ + u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */ + u8 vlcap_inittype; /* 4 bits, 4 bits */ + u8 vl_high_limit; + u8 
vl_arb_high_cap; + u8 vl_arb_low_cap; + u8 inittypereply_mtucap; /* 4 bits, 4 bits */ + u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */ + u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */ + __be16 mkey_violations; + __be16 pkey_violations; + __be16 qkey_violations; + u8 guid_cap; + u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */ + u8 resv_resptimevalue; /* 3 bits, 5 bits */ + u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */ + __be16 max_credit_hint; + u8 resv; + u8 link_roundtrip_latency[3]; +}; + +static inline u8 +ib_get_smp_direction(struct ib_smp *smp) +{ + return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION); +} + +#endif /* IB_SMI_H */ diff --git a/sys/contrib/rdma/ib_umem.h b/sys/contrib/rdma/ib_umem.h new file mode 100644 index 000000000000..50dd5dac867d --- /dev/null +++ b/sys/contrib/rdma/ib_umem.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $FreeBSD$ + */ + +#ifndef IB_UMEM_H +#define IB_UMEM_H + +struct ib_ucontext; + +struct ib_umem_chunk { + TAILQ_ENTRY(ib_umem_chunk) entry; + int nents; + int nmap; + struct rdma_scatterlist page_list[0]; +}; + +struct ib_umem { + struct ib_ucontext *context; + size_t length; + int offset; + int page_size; + int writable; + TAILQ_HEAD(, ib_umem_chunk) chunk_list; +#ifdef notyet + struct work_struct work; + struct mm_struct *mm; +#endif + unsigned long diff; +}; + +#ifdef CONFIG_INFINIBAND_USER_MEM + +struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, + size_t size, int access); +void ib_umem_release(struct ib_umem *umem); +int ib_umem_page_count(struct ib_umem *umem); + +#else /* CONFIG_INFINIBAND_USER_MEM */ + + +static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, + unsigned long addr, size_t size, + int access) { + return ERR_PTR(EINVAL); +} +static inline void ib_umem_release(struct ib_umem *umem) { } +static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } + +#endif /* CONFIG_INFINIBAND_USER_MEM */ + +#endif /* IB_UMEM_H */ diff --git a/sys/contrib/rdma/ib_user_cm.h b/sys/contrib/rdma/ib_user_cm.h new file mode 100644 index 000000000000..e8815d51cf5b --- /dev/null +++ b/sys/contrib/rdma/ib_user_cm.h @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_user_cm.h 4019 2005-11-11 00:33:09Z sean.hefty $ + * + * $FreeBSD$ + */ + +#ifndef IB_USER_CM_H +#define IB_USER_CM_H + +#include <rdma/ib_user_sa.h> + +#define IB_USER_CM_ABI_VERSION 5 + +enum { + IB_USER_CM_CMD_CREATE_ID, + IB_USER_CM_CMD_DESTROY_ID, + IB_USER_CM_CMD_ATTR_ID, + + IB_USER_CM_CMD_LISTEN, + IB_USER_CM_CMD_NOTIFY, + + IB_USER_CM_CMD_SEND_REQ, + IB_USER_CM_CMD_SEND_REP, + IB_USER_CM_CMD_SEND_RTU, + IB_USER_CM_CMD_SEND_DREQ, + IB_USER_CM_CMD_SEND_DREP, + IB_USER_CM_CMD_SEND_REJ, + IB_USER_CM_CMD_SEND_MRA, + IB_USER_CM_CMD_SEND_LAP, + IB_USER_CM_CMD_SEND_APR, + IB_USER_CM_CMD_SEND_SIDR_REQ, + IB_USER_CM_CMD_SEND_SIDR_REP, + + IB_USER_CM_CMD_EVENT, + IB_USER_CM_CMD_INIT_QP_ATTR, +}; +/* + * command ABI structures. 
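+ *
+ * Userspace issues a command as a single write() of an ib_ucm_cmd_hdr
+ * followed immediately by the command body (a sketch; treating @in and
+ * @out as the sizes of the command body and response buffer is an
+ * assumption, as is the response-pointer convention):
+ *
+ *	struct {
+ *		struct ib_ucm_cmd_hdr	hdr;
+ *		struct ib_ucm_create_id	cmd;
+ *	} msg;
+ *	struct ib_ucm_create_id_resp resp;
+ *
+ *	msg.hdr.cmd = IB_USER_CM_CMD_CREATE_ID;
+ *	msg.hdr.in = sizeof msg.cmd;
+ *	msg.hdr.out = sizeof resp;
+ *	msg.cmd.response = (__u64)(uintptr_t)&resp;
+ *	write(fd, &msg, sizeof msg);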
+ */ +struct ib_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct ib_ucm_create_id { + __u64 uid; + __u64 response; +}; + +struct ib_ucm_create_id_resp { + __u32 id; +}; + +struct ib_ucm_destroy_id { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct ib_ucm_attr_id { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_attr_id_resp { + __be64 service_id; + __be64 service_mask; + __be32 local_id; + __be32 remote_id; +}; + +struct ib_ucm_init_qp_attr { + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct ib_ucm_listen { + __be64 service_id; + __be64 service_mask; + __u32 id; + __u32 reserved; +}; + +struct ib_ucm_notify { + __u32 id; + __u32 event; +}; + +struct ib_ucm_private_data { + __u64 data; + __u32 id; + __u8 len; + __u8 reserved[3]; +}; + +struct ib_ucm_req { + __u32 id; + __u32 qpn; + __u32 qp_type; + __u32 psn; + __be64 sid; + __u64 data; + __u64 primary_path; + __u64 alternate_path; + __u8 len; + __u8 peer_to_peer; + __u8 responder_resources; + __u8 initiator_depth; + __u8 remote_cm_response_timeout; + __u8 flow_control; + __u8 local_cm_response_timeout; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 max_cm_retries; + __u8 srq; + __u8 reserved[5]; +}; + +struct ib_ucm_rep { + __u64 uid; + __u64 data; + __u32 id; + __u32 qpn; + __u32 psn; + __u8 len; + __u8 responder_resources; + __u8 initiator_depth; + __u8 target_ack_delay; + __u8 failover_accepted; + __u8 flow_control; + __u8 rnr_retry_count; + __u8 srq; + __u8 reserved[4]; +}; + +struct ib_ucm_info { + __u32 id; + __u32 status; + __u64 info; + __u64 data; + __u8 info_len; + __u8 data_len; + __u8 reserved[6]; +}; + +struct ib_ucm_mra { + __u64 data; + __u32 id; + __u8 len; + __u8 timeout; + __u8 reserved[2]; +}; + +struct ib_ucm_lap { + __u64 path; + __u64 data; + __u32 id; + __u8 len; + __u8 reserved[3]; +}; + +struct ib_ucm_sidr_req { + __u32 id; + __u32 timeout; + __be64 sid; + __u64 data; + __u64 path; + __u16 reserved_pkey; + __u8 len; + __u8 max_cm_retries; + __u8 reserved[4]; +}; + +struct ib_ucm_sidr_rep { + __u32 id; + __u32 qpn; + __u32 qkey; + __u32 status; + __u64 info; + __u64 data; + __u8 info_len; + __u8 data_len; + __u8 reserved[6]; +}; +/* + * event notification ABI structures. + */ +struct ib_ucm_event_get { + __u64 response; + __u64 data; + __u64 info; + __u8 data_len; + __u8 info_len; + __u8 reserved[6]; +}; + +struct ib_ucm_req_event_resp { + struct ib_user_path_rec primary_path; + struct ib_user_path_rec alternate_path; + __be64 remote_ca_guid; + __u32 remote_qkey; + __u32 remote_qpn; + __u32 qp_type; + __u32 starting_psn; + __u8 responder_resources; + __u8 initiator_depth; + __u8 local_cm_response_timeout; + __u8 flow_control; + __u8 remote_cm_response_timeout; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 srq; + __u8 port; + __u8 reserved[7]; +}; + +struct ib_ucm_rep_event_resp { + __be64 remote_ca_guid; + __u32 remote_qkey; + __u32 remote_qpn; + __u32 starting_psn; + __u8 responder_resources; + __u8 initiator_depth; + __u8 target_ack_delay; + __u8 failover_accepted; + __u8 flow_control; + __u8 rnr_retry_count; + __u8 srq; + __u8 reserved[5]; +}; + +struct ib_ucm_rej_event_resp { + __u32 reason; + /* ari in ib_ucm_event_get info field. */ +}; + +struct ib_ucm_mra_event_resp { + __u8 timeout; + __u8 reserved[3]; +}; + +struct ib_ucm_lap_event_resp { + struct ib_user_path_rec path; +}; + +struct ib_ucm_apr_event_resp { + __u32 status; + /* apr info in ib_ucm_event_get info field. 
*/ +}; + +struct ib_ucm_sidr_req_event_resp { + __u16 pkey; + __u8 port; + __u8 reserved; +}; + +struct ib_ucm_sidr_rep_event_resp { + __u32 status; + __u32 qkey; + __u32 qpn; + /* info in ib_ucm_event_get info field. */ +}; + +#define IB_UCM_PRES_DATA 0x01 +#define IB_UCM_PRES_INFO 0x02 +#define IB_UCM_PRES_PRIMARY 0x04 +#define IB_UCM_PRES_ALTERNATE 0x08 + +struct ib_ucm_event_resp { + __u64 uid; + __u32 id; + __u32 event; + __u32 present; + __u32 reserved; + union { + struct ib_ucm_req_event_resp req_resp; + struct ib_ucm_rep_event_resp rep_resp; + struct ib_ucm_rej_event_resp rej_resp; + struct ib_ucm_mra_event_resp mra_resp; + struct ib_ucm_lap_event_resp lap_resp; + struct ib_ucm_apr_event_resp apr_resp; + + struct ib_ucm_sidr_req_event_resp sidr_req_resp; + struct ib_ucm_sidr_rep_event_resp sidr_rep_resp; + + __u32 send_status; + } u; +}; + +#endif /* IB_USER_CM_H */ diff --git a/sys/contrib/rdma/ib_user_mad.h b/sys/contrib/rdma/ib_user_mad.h new file mode 100644 index 000000000000..ec1d50987c88 --- /dev/null +++ b/sys/contrib/rdma/ib_user_mad.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_user_mad.h 2814 2005-07-06 19:14:09Z halr $ + * + * $FreeBSD$ + */ + +#ifndef IB_USER_MAD_H +#define IB_USER_MAD_H + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IB_USER_MAD_ABI_VERSION 5 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). 
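+ *
+ * Concretely, that means no pointer types (pointers are passed as
+ * __u64) and explicit padding of any struct larger than 4 bytes to a
+ * multiple of 8 bytes, e.g. (illustrative only):
+ *
+ *	struct example {
+ *		__u64	buf;		(a pointer, passed as __u64)
+ *		__u32	len;
+ *		__u32	reserved;	(explicit pad to an 8-byte multiple)
+ *	};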
+ */
+
+/**
+ * ib_user_mad_hdr - MAD packet header
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ * received (transaction ID in data[] will be set to TID of original
+ * request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ */
+struct ib_user_mad_hdr {
+	__u32	id;
+	__u32	status;
+	__u32	timeout_ms;
+	__u32	retries;
+	__u32	length;
+	__be32	qpn;
+	__be32	qkey;
+	__be16	lid;
+	__u8	sl;
+	__u8	path_bits;
+	__u8	grh_present;
+	__u8	gid_index;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	gid[16];
+	__be32	flow_label;
+};
+
+/**
+ * ib_user_mad - MAD packet
+ * @hdr - MAD packet header
+ * @data - Contents of MAD
+ *
+ */
+struct ib_user_mad {
+	struct ib_user_mad_hdr hdr;
+	__u64	data[0];
+};
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ * where @method_mask = 1.
+ * @mgmt_class - Indicates which management class of MADs should be received
+ * by the caller. This field is only required if the user wishes to
+ * receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ * @rmpp_version: If set, indicates the RMPP version used.
+ *
+ */
+struct ib_user_mad_reg_req {
+	__u32	id;
+	__u32	method_mask[4];
+	__u8	qpn;
+	__u8	mgmt_class;
+	__u8	mgmt_class_version;
+	__u8	oui[3];
+	__u8	rmpp_version;
+};
+
+#define IB_IOCTL_MAGIC		0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT	_IOWR(IB_IOCTL_MAGIC, 1, \
+					      struct ib_user_mad_reg_req)
+
+#define IB_USER_MAD_UNREGISTER_AGENT	_IOW(IB_IOCTL_MAGIC, 2, __u32)
+
+#endif /* IB_USER_MAD_H */
diff --git a/sys/contrib/rdma/ib_user_sa.h b/sys/contrib/rdma/ib_user_sa.h
new file mode 100644
index 000000000000..ddb76ed5c9c2
--- /dev/null
+++ b/sys/contrib/rdma/ib_user_sa.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $FreeBSD$ + */ + +#ifndef IB_USER_SA_H +#define IB_USER_SA_H + +struct ib_user_path_rec { + __u8 dgid[16]; + __u8 sgid[16]; + __be16 dlid; + __be16 slid; + __u32 raw_traffic; + __be32 flow_label; + __u32 reversible; + __u32 mtu; + __be16 pkey; + __u8 hop_limit; + __u8 traffic_class; + __u8 numb_path; + __u8 sl; + __u8 mtu_selector; + __u8 rate_selector; + __u8 rate; + __u8 packet_life_time_selector; + __u8 packet_life_time; + __u8 preference; +}; + +#endif /* IB_USER_SA_H */ diff --git a/sys/contrib/rdma/ib_user_verbs.h b/sys/contrib/rdma/ib_user_verbs.h new file mode 100644 index 000000000000..faa966c20920 --- /dev/null +++ b/sys/contrib/rdma/ib_user_verbs.h @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_user_verbs.h 4019 2005-11-11 00:33:09Z sean.hefty $ + * + * $FreeBSD$ + */ + +#ifndef IB_USER_VERBS_H +#define IB_USER_VERBS_H + +#include <contrib/rdma/types.h> + + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. 
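+ *
+ * A userspace consumer is expected to compare its compiled-in value
+ * with the version the kernel reports and refuse to run on a mismatch
+ * (a sketch; how kernel_abi_version is obtained is left open here):
+ *
+ *	if (kernel_abi_version != IB_USER_VERBS_ABI_VERSION)
+ *		errx(1, "uverbs ABI version mismatch");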
+ */
+#define IB_USER_VERBS_ABI_VERSION	6
+
+enum {
+	IB_USER_VERBS_CMD_GET_CONTEXT,
+	IB_USER_VERBS_CMD_QUERY_DEVICE,
+	IB_USER_VERBS_CMD_QUERY_PORT,
+	IB_USER_VERBS_CMD_ALLOC_PD,
+	IB_USER_VERBS_CMD_DEALLOC_PD,
+	IB_USER_VERBS_CMD_CREATE_AH,
+	IB_USER_VERBS_CMD_MODIFY_AH,
+	IB_USER_VERBS_CMD_QUERY_AH,
+	IB_USER_VERBS_CMD_DESTROY_AH,
+	IB_USER_VERBS_CMD_REG_MR,
+	IB_USER_VERBS_CMD_REG_SMR,
+	IB_USER_VERBS_CMD_REREG_MR,
+	IB_USER_VERBS_CMD_QUERY_MR,
+	IB_USER_VERBS_CMD_DEREG_MR,
+	IB_USER_VERBS_CMD_ALLOC_MW,
+	IB_USER_VERBS_CMD_BIND_MW,
+	IB_USER_VERBS_CMD_DEALLOC_MW,
+	IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+	IB_USER_VERBS_CMD_CREATE_CQ,
+	IB_USER_VERBS_CMD_RESIZE_CQ,
+	IB_USER_VERBS_CMD_DESTROY_CQ,
+	IB_USER_VERBS_CMD_POLL_CQ,
+	IB_USER_VERBS_CMD_PEEK_CQ,
+	IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+	IB_USER_VERBS_CMD_CREATE_QP,
+	IB_USER_VERBS_CMD_QUERY_QP,
+	IB_USER_VERBS_CMD_MODIFY_QP,
+	IB_USER_VERBS_CMD_DESTROY_QP,
+	IB_USER_VERBS_CMD_POST_SEND,
+	IB_USER_VERBS_CMD_POST_RECV,
+	IB_USER_VERBS_CMD_ATTACH_MCAST,
+	IB_USER_VERBS_CMD_DETACH_MCAST,
+	IB_USER_VERBS_CMD_CREATE_SRQ,
+	IB_USER_VERBS_CMD_MODIFY_SRQ,
+	IB_USER_VERBS_CMD_QUERY_SRQ,
+	IB_USER_VERBS_CMD_DESTROY_SRQ,
+	IB_USER_VERBS_CMD_POST_SRQ_RECV
+};
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * Specifically:
+ * - Do not use pointer types -- pass pointers in __u64 instead.
+ * - Make sure that any structure larger than 4 bytes is padded to a
+ * multiple of 8 bytes. Otherwise the structure size will be
+ * different between 32-bit and 64-bit architectures.
+ */
+
+struct ib_uverbs_async_event_desc {
+	__u64 element;
+	__u32 event_type;	/* enum ib_event_type */
+	__u32 reserved;
+};
+
+struct ib_uverbs_comp_event_desc {
+	__u64 cq_handle;
+};
+
+/*
+ * All commands from userspace should start with a __u32 command field
+ * followed by __u16 in_words and out_words fields (which give the
+ * length of the command block and response buffer if any in 32-bit
+ * words). The kernel driver will read these fields first and read
+ * the rest of the command struct based on these values.
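+ *
+ * For example, a GET_CONTEXT request might be framed like this (a
+ * sketch; handling of the response buffer itself is elided):
+ *
+ *	struct {
+ *		struct ib_uverbs_cmd_hdr	hdr;
+ *		struct ib_uverbs_get_context	cmd;
+ *	} msg;
+ *
+ *	msg.hdr.command = IB_USER_VERBS_CMD_GET_CONTEXT;
+ *	msg.hdr.in_words = sizeof msg / 4;
+ *	msg.hdr.out_words = sizeof (struct ib_uverbs_get_context_resp) / 4;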
+ */ + +struct ib_uverbs_cmd_hdr { + __u32 command; + __u16 in_words; + __u16 out_words; +}; + +struct ib_uverbs_get_context { + __u64 response; + __u64 driver_data[0]; +}; + +struct ib_uverbs_get_context_resp { + __u32 async_fd; + __u32 num_comp_vectors; +}; + +struct ib_uverbs_query_device { + __u64 response; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_device_resp { + __u64 fw_ver; + __be64 node_guid; + __be64 sys_image_guid; + __u64 max_mr_size; + __u64 page_size_cap; + __u32 vendor_id; + __u32 vendor_part_id; + __u32 hw_ver; + __u32 max_qp; + __u32 max_qp_wr; + __u32 device_cap_flags; + __u32 max_sge; + __u32 max_sge_rd; + __u32 max_cq; + __u32 max_cqe; + __u32 max_mr; + __u32 max_pd; + __u32 max_qp_rd_atom; + __u32 max_ee_rd_atom; + __u32 max_res_rd_atom; + __u32 max_qp_init_rd_atom; + __u32 max_ee_init_rd_atom; + __u32 atomic_cap; + __u32 max_ee; + __u32 max_rdd; + __u32 max_mw; + __u32 max_raw_ipv6_qp; + __u32 max_raw_ethy_qp; + __u32 max_mcast_grp; + __u32 max_mcast_qp_attach; + __u32 max_total_mcast_qp_attach; + __u32 max_ah; + __u32 max_fmr; + __u32 max_map_per_fmr; + __u32 max_srq; + __u32 max_srq_wr; + __u32 max_srq_sge; + __u16 max_pkeys; + __u8 local_ca_ack_delay; + __u8 phys_port_cnt; + __u8 reserved[4]; +}; + +struct ib_uverbs_query_port { + __u64 response; + __u8 port_num; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_port_resp { + __u32 port_cap_flags; + __u32 max_msg_sz; + __u32 bad_pkey_cntr; + __u32 qkey_viol_cntr; + __u32 gid_tbl_len; + __u16 pkey_tbl_len; + __u16 lid; + __u16 sm_lid; + __u8 state; + __u8 max_mtu; + __u8 active_mtu; + __u8 lmc; + __u8 max_vl_num; + __u8 sm_sl; + __u8 subnet_timeout; + __u8 init_type_reply; + __u8 active_width; + __u8 active_speed; + __u8 phys_state; + __u8 reserved[3]; +}; + +struct ib_uverbs_alloc_pd { + __u64 response; + __u64 driver_data[0]; +}; + +struct ib_uverbs_alloc_pd_resp { + __u32 pd_handle; +}; + +struct ib_uverbs_dealloc_pd { + __u32 pd_handle; +}; + +struct ib_uverbs_reg_mr { + __u64 response; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_reg_mr_resp { + __u32 mr_handle; + __u32 lkey; + __u32 rkey; +}; + +struct ib_uverbs_dereg_mr { + __u32 mr_handle; +}; + +struct ib_uverbs_create_comp_channel { + __u64 response; +}; + +struct ib_uverbs_create_comp_channel_resp { + __u32 fd; +}; + +struct ib_uverbs_create_cq { + __u64 response; + __u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_cq_resp { + __u32 cq_handle; + __u32 cqe; +}; + +struct ib_uverbs_resize_cq { + __u64 response; + __u32 cq_handle; + __u32 cqe; + __u64 driver_data[0]; +}; + +struct ib_uverbs_resize_cq_resp { + __u32 cqe; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_poll_cq { + __u64 response; + __u32 cq_handle; + __u32 ne; +}; + +struct ib_uverbs_wc { + __u64 wr_id; + __u32 status; + __u32 opcode; + __u32 vendor_err; + __u32 byte_len; + __u32 imm_data; + __u32 qp_num; + __u32 src_qp; + __u32 wc_flags; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_poll_cq_resp { + __u32 count; + __u32 reserved; + struct ib_uverbs_wc wc[0]; +}; + +struct ib_uverbs_req_notify_cq { + __u32 cq_handle; + __u32 solicited_only; +}; + +struct ib_uverbs_destroy_cq { + __u64 response; + __u32 cq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_cq_resp { + 
__u32 comp_events_reported; + __u32 async_events_reported; +}; + +struct ib_uverbs_global_route { + __u8 dgid[16]; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 reserved; +}; + +struct ib_uverbs_ah_attr { + struct ib_uverbs_global_route grh; + __u16 dlid; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_qp_attr { + __u32 qp_attr_mask; + __u32 qp_state; + __u32 cur_qp_state; + __u32 path_mtu; + __u32 path_mig_state; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + + struct ib_uverbs_ah_attr ah_attr; + struct ib_uverbs_ah_attr alt_ah_attr; + + /* ib_qp_cap */ + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 en_sqd_async_notify; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[5]; +}; + +struct ib_uverbs_create_qp { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_qp_resp { + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 reserved; +}; + +/* + * This struct needs to remain a multiple of 8 bytes to keep the + * alignment of the modify QP parameters. 
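+ *
+ * A build-time check along these lines (illustrative; CTASSERT is the
+ * FreeBSD compile-time assertion macro) would catch an accidental
+ * size regression:
+ *
+ *	CTASSERT(sizeof(struct ib_uverbs_qp_dest) % 8 == 0);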
+ */ +struct ib_uverbs_qp_dest { + __u8 dgid[16]; + __u32 flow_label; + __u16 dlid; + __u16 reserved; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; +}; + +struct ib_uverbs_query_qp { + __u64 response; + __u32 qp_handle; + __u32 attr_mask; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_qp_resp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 sq_sig_all; + __u8 reserved[5]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_modify_qp { + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[2]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_modify_qp_resp { +}; + +struct ib_uverbs_destroy_qp { + __u64 response; + __u32 qp_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_qp_resp { + __u32 events_reported; +}; + +/* + * The ib_uverbs_sge structure isn't used anywhere, since we assume + * the ib_sge structure is packed the same way on 32-bit and 64-bit + * architectures in both kernel and user space. It's just here to + * document the ABI. 
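+ *
+ * A consumer wanting to verify that assumption at build time could,
+ * for instance, check the sizes against each other:
+ *
+ *	CTASSERT(sizeof(struct ib_uverbs_sge) == sizeof(struct ib_sge));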
+ */ +struct ib_uverbs_sge { + __u64 addr; + __u32 length; + __u32 lkey; +}; + +struct ib_uverbs_send_wr { + __u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + __u32 imm_data; + union { + struct { + __u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 ah; + __u32 remote_qpn; + __u32 remote_qkey; + __u32 reserved; + } ud; + } wr; +}; + +struct ib_uverbs_post_send { + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_send_wr send_wr[0]; +}; + +struct ib_uverbs_post_send_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_recv_wr { + __u64 wr_id; + __u32 num_sge; + __u32 reserved; +}; + +struct ib_uverbs_post_recv { + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv_wr[0]; +}; + +struct ib_uverbs_post_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_post_srq_recv { + __u64 response; + __u32 srq_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv[0]; +}; + +struct ib_uverbs_post_srq_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_create_ah { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 reserved; + struct ib_uverbs_ah_attr attr; +}; + +struct ib_uverbs_create_ah_resp { + __u32 ah_handle; +}; + +struct ib_uverbs_destroy_ah { + __u32 ah_handle; +}; + +struct ib_uverbs_attach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_detach_mcast { + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_srq { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_srq_resp { + __u32 srq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 reserved; +}; + +struct ib_uverbs_modify_srq { + __u32 srq_handle; + __u32 attr_mask; + __u32 max_wr; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq { + __u64 response; + __u32 srq_handle; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_srq_resp { + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq { + __u64 response; + __u32 srq_handle; + __u32 reserved; +}; + +struct ib_uverbs_destroy_srq_resp { + __u32 events_reported; +}; + +#endif /* IB_USER_VERBS_H */ diff --git a/sys/contrib/rdma/ib_verbs.h b/sys/contrib/rdma/ib_verbs.h new file mode 100644 index 000000000000..5d98ef7e0e41 --- /dev/null +++ b/sys/contrib/rdma/ib_verbs.h @@ -0,0 +1,1854 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_verbs.h 1349 2004-12-16 21:09:43Z roland $ + * + * $FreeBSD$ + */ + + +#if !defined(IB_VERBS_H) +#define IB_VERBS_H + +#include <contrib/rdma/types.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +struct rdma_scatterlist { + void *page; + unsigned int length; + unsigned int offset; +}; +struct vm_object; + +union ib_gid { + u8 raw[16]; + struct { + __be64 subnet_prefix; + __be64 interface_id; + } global; +}; + +enum rdma_node_type { + /* IB values map to NodeInfo:NodeType. */ + RDMA_NODE_IB_CA = 1, + RDMA_NODE_IB_SWITCH, + RDMA_NODE_IB_ROUTER, + RDMA_NODE_RNIC +}; + +enum rdma_transport_type { + RDMA_TRANSPORT_IB, + RDMA_TRANSPORT_IWARP +}; + +enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type); + +enum ib_device_cap_flags { + IB_DEVICE_RESIZE_MAX_WR = 1, + IB_DEVICE_BAD_PKEY_CNTR = (1<<1), + IB_DEVICE_BAD_QKEY_CNTR = (1<<2), + IB_DEVICE_RAW_MULTI = (1<<3), + IB_DEVICE_AUTO_PATH_MIG = (1<<4), + IB_DEVICE_CHANGE_PHY_PORT = (1<<5), + IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6), + IB_DEVICE_CURR_QP_STATE_MOD = (1<<7), + IB_DEVICE_SHUTDOWN_PORT = (1<<8), + IB_DEVICE_INIT_TYPE = (1<<9), + IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10), + IB_DEVICE_SYS_IMAGE_GUID = (1<<11), + IB_DEVICE_RC_RNR_NAK_GEN = (1<<12), + IB_DEVICE_SRQ_RESIZE = (1<<13), + IB_DEVICE_N_NOTIFY_CQ = (1<<14), + IB_DEVICE_ZERO_STAG = (1<<15), + IB_DEVICE_SEND_W_INV = (1<<16), + IB_DEVICE_MEM_WINDOW = (1<<17) +}; + +enum ib_atomic_cap { + IB_ATOMIC_NONE, + IB_ATOMIC_HCA, + IB_ATOMIC_GLOB +}; + +struct ib_device_attr { + u64 fw_ver; + __be64 sys_image_guid; + u64 max_mr_size; + u64 page_size_cap; + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + int max_qp; + int max_qp_wr; + int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ib_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + u16 max_pkeys; + u8 local_ca_ack_delay; +}; + +enum ib_mtu { + 
IB_MTU_256 = 1, + IB_MTU_512 = 2, + IB_MTU_1024 = 3, + IB_MTU_2048 = 4, + IB_MTU_4096 = 5 +}; + +static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) +{ + switch (mtu) { + case IB_MTU_256: return 256; + case IB_MTU_512: return 512; + case IB_MTU_1024: return 1024; + case IB_MTU_2048: return 2048; + case IB_MTU_4096: return 4096; + default: return -1; + } +} + +enum ib_port_state { + IB_PORT_NOP = 0, + IB_PORT_DOWN = 1, + IB_PORT_INIT = 2, + IB_PORT_ARMED = 3, + IB_PORT_ACTIVE = 4, + IB_PORT_ACTIVE_DEFER = 5 +}; + +enum ib_port_cap_flags { + IB_PORT_SM = 1 << 1, + IB_PORT_NOTICE_SUP = 1 << 2, + IB_PORT_TRAP_SUP = 1 << 3, + IB_PORT_OPT_IPD_SUP = 1 << 4, + IB_PORT_AUTO_MIGR_SUP = 1 << 5, + IB_PORT_SL_MAP_SUP = 1 << 6, + IB_PORT_MKEY_NVRAM = 1 << 7, + IB_PORT_PKEY_NVRAM = 1 << 8, + IB_PORT_LED_INFO_SUP = 1 << 9, + IB_PORT_SM_DISABLED = 1 << 10, + IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IB_PORT_CM_SUP = 1 << 16, + IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IB_PORT_REINIT_SUP = 1 << 18, + IB_PORT_DEVICE_MGMT_SUP = 1 << 19, + IB_PORT_VENDOR_CLASS_SUP = 1 << 20, + IB_PORT_DR_NOTICE_SUP = 1 << 21, + IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22, + IB_PORT_BOOT_MGMT_SUP = 1 << 23, + IB_PORT_LINK_LATENCY_SUP = 1 << 24, + IB_PORT_CLIENT_REG_SUP = 1 << 25 +}; + +enum ib_port_width { + IB_WIDTH_1X = 1, + IB_WIDTH_4X = 2, + IB_WIDTH_8X = 4, + IB_WIDTH_12X = 8 +}; + +static inline int ib_width_enum_to_int(enum ib_port_width width) +{ + switch (width) { + case IB_WIDTH_1X: return 1; + case IB_WIDTH_4X: return 4; + case IB_WIDTH_8X: return 8; + case IB_WIDTH_12X: return 12; + default: return -1; + } +} + +struct ib_port_attr { + enum ib_port_state state; + enum ib_mtu max_mtu; + enum ib_mtu active_mtu; + int gid_tbl_len; + u32 port_cap_flags; + u32 max_msg_sz; + u32 bad_pkey_cntr; + u32 qkey_viol_cntr; + u16 pkey_tbl_len; + u16 lid; + u16 sm_lid; + u8 lmc; + u8 max_vl_num; + u8 sm_sl; + u8 subnet_timeout; + u8 init_type_reply; + u8 active_width; + u8 active_speed; + u8 phys_state; +}; + +enum ib_device_modify_flags { + IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, + IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 +}; + +struct ib_device_modify { + u64 sys_image_guid; + char node_desc[64]; +}; + +enum ib_port_modify_flags { + IB_PORT_SHUTDOWN = 1, + IB_PORT_INIT_TYPE = (1<<2), + IB_PORT_RESET_QKEY_CNTR = (1<<3) +}; + +struct ib_port_modify { + u32 set_port_cap_mask; + u32 clr_port_cap_mask; + u8 init_type; +}; + +enum ib_event_type { + IB_EVENT_CQ_ERR, + IB_EVENT_QP_FATAL, + IB_EVENT_QP_REQ_ERR, + IB_EVENT_QP_ACCESS_ERR, + IB_EVENT_COMM_EST, + IB_EVENT_SQ_DRAINED, + IB_EVENT_PATH_MIG, + IB_EVENT_PATH_MIG_ERR, + IB_EVENT_DEVICE_FATAL, + IB_EVENT_PORT_ACTIVE, + IB_EVENT_PORT_ERR, + IB_EVENT_LID_CHANGE, + IB_EVENT_PKEY_CHANGE, + IB_EVENT_SM_CHANGE, + IB_EVENT_SRQ_ERR, + IB_EVENT_SRQ_LIMIT_REACHED, + IB_EVENT_QP_LAST_WQE_REACHED, + IB_EVENT_CLIENT_REREGISTER +}; + +enum dma_data_direction { + DMA_BIDIRECTIONAL = 0, + DMA_TO_DEVICE = 1, + DMA_FROM_DEVICE = 2, + DMA_NONE = 3, +}; + +struct ib_event { + struct ib_device *device; + union { + struct ib_cq *cq; + struct ib_qp *qp; + struct ib_srq *srq; + u8 port_num; + } element; + enum ib_event_type event; +}; + +struct ib_event_handler { + struct ib_device *device; + void (*handler)(struct ib_event_handler *, struct ib_event *); + TAILQ_ENTRY(ib_event_handler) list; +}; + +#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ + do { \ + (_ptr)->device = _device; \ + (_ptr)->handler = _handler; \ + } while (0) + +struct ib_global_route { + union 
ib_gid dgid; + u32 flow_label; + u8 sgid_index; + u8 hop_limit; + u8 traffic_class; +}; + +struct ib_grh { + __be32 version_tclass_flow; + __be16 paylen; + u8 next_hdr; + u8 hop_limit; + union ib_gid sgid; + union ib_gid dgid; +}; + +enum { + IB_MULTICAST_QPN = 0xffffff +}; + +#define IB_LID_PERMISSIVE __constant_htons(0xFFFF) + +enum ib_ah_flags { + IB_AH_GRH = 1 +}; + +enum ib_rate { + IB_RATE_PORT_CURRENT = 0, + IB_RATE_2_5_GBPS = 2, + IB_RATE_5_GBPS = 5, + IB_RATE_10_GBPS = 3, + IB_RATE_20_GBPS = 6, + IB_RATE_30_GBPS = 4, + IB_RATE_40_GBPS = 7, + IB_RATE_60_GBPS = 8, + IB_RATE_80_GBPS = 9, + IB_RATE_120_GBPS = 10 +}; + +/** + * ib_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int ib_rate_to_mult(enum ib_rate rate); + +/** + * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate + * enum. + * @mult: multiple to convert. + */ +enum ib_rate mult_to_ib_rate(int mult); + +struct ib_ah_attr { + struct ib_global_route grh; + u16 dlid; + u8 sl; + u8 src_path_bits; + u8 static_rate; + u8 ah_flags; + u8 port_num; +}; + +enum ib_wc_status { + IB_WC_SUCCESS, + IB_WC_LOC_LEN_ERR, + IB_WC_LOC_QP_OP_ERR, + IB_WC_LOC_EEC_OP_ERR, + IB_WC_LOC_PROT_ERR, + IB_WC_WR_FLUSH_ERR, + IB_WC_MW_BIND_ERR, + IB_WC_BAD_RESP_ERR, + IB_WC_LOC_ACCESS_ERR, + IB_WC_REM_INV_REQ_ERR, + IB_WC_REM_ACCESS_ERR, + IB_WC_REM_OP_ERR, + IB_WC_RETRY_EXC_ERR, + IB_WC_RNR_RETRY_EXC_ERR, + IB_WC_LOC_RDD_VIOL_ERR, + IB_WC_REM_INV_RD_REQ_ERR, + IB_WC_REM_ABORT_ERR, + IB_WC_INV_EECN_ERR, + IB_WC_INV_EEC_STATE_ERR, + IB_WC_FATAL_ERR, + IB_WC_RESP_TIMEOUT_ERR, + IB_WC_GENERAL_ERR +}; + +enum ib_wc_opcode { + IB_WC_SEND, + IB_WC_RDMA_WRITE, + IB_WC_RDMA_READ, + IB_WC_COMP_SWAP, + IB_WC_FETCH_ADD, + IB_WC_BIND_MW, +/* + * Set value of IB_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IB_WC_RECV). + */ + IB_WC_RECV = 1 << 7, + IB_WC_RECV_RDMA_WITH_IMM +}; + +enum ib_wc_flags { + IB_WC_GRH = 1, + IB_WC_WITH_IMM = (1<<1) +}; + +struct ib_wc { + u64 wr_id; + enum ib_wc_status status; + enum ib_wc_opcode opcode; + u32 vendor_err; + u32 byte_len; + struct ib_qp *qp; + __be32 imm_data; + u32 src_qp; + int wc_flags; + u16 pkey_index; + u16 slid; + u8 sl; + u8 dlid_path_bits; + u8 port_num; /* valid only for DR SMPs on switches */ +}; + +enum ib_cq_notify_flags { + IB_CQ_SOLICITED = 1 << 0, + IB_CQ_NEXT_COMP = 1 << 1, + IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, + IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, +}; + +enum ib_srq_attr_mask { + IB_SRQ_MAX_WR = 1 << 0, + IB_SRQ_LIMIT = 1 << 1, +}; + +struct ib_srq_attr { + u32 max_wr; + u32 max_sge; + u32 srq_limit; +}; + +struct ib_srq_init_attr { + void (*event_handler)(struct ib_event *, void *); + void *srq_context; + struct ib_srq_attr attr; +}; + +struct ib_qp_cap { + u32 max_send_wr; + u32 max_recv_wr; + u32 max_send_sge; + u32 max_recv_sge; + u32 max_inline_data; +}; + +enum ib_sig_type { + IB_SIGNAL_ALL_WR, + IB_SIGNAL_REQ_WR +}; + +enum ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. 
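+ * (Editor's sketch of that assumption, not code from this header: such
+ * a layer may keep "struct ib_mad_agent *agent[2]" per port and index
+ * it directly as agent[IB_QPT_SMI] and agent[IB_QPT_GSI].)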
+ */ + IB_QPT_SMI, + IB_QPT_GSI, + + IB_QPT_RC, + IB_QPT_UC, + IB_QPT_UD, + IB_QPT_RAW_IPV6, + IB_QPT_RAW_ETY +}; + +struct ib_qp_init_attr { + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_srq *srq; + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + enum ib_qp_type qp_type; + u8 port_num; /* special QP types only */ +}; + +enum ib_rnr_timeout { + IB_RNR_TIMER_655_36 = 0, + IB_RNR_TIMER_000_01 = 1, + IB_RNR_TIMER_000_02 = 2, + IB_RNR_TIMER_000_03 = 3, + IB_RNR_TIMER_000_04 = 4, + IB_RNR_TIMER_000_06 = 5, + IB_RNR_TIMER_000_08 = 6, + IB_RNR_TIMER_000_12 = 7, + IB_RNR_TIMER_000_16 = 8, + IB_RNR_TIMER_000_24 = 9, + IB_RNR_TIMER_000_32 = 10, + IB_RNR_TIMER_000_48 = 11, + IB_RNR_TIMER_000_64 = 12, + IB_RNR_TIMER_000_96 = 13, + IB_RNR_TIMER_001_28 = 14, + IB_RNR_TIMER_001_92 = 15, + IB_RNR_TIMER_002_56 = 16, + IB_RNR_TIMER_003_84 = 17, + IB_RNR_TIMER_005_12 = 18, + IB_RNR_TIMER_007_68 = 19, + IB_RNR_TIMER_010_24 = 20, + IB_RNR_TIMER_015_36 = 21, + IB_RNR_TIMER_020_48 = 22, + IB_RNR_TIMER_030_72 = 23, + IB_RNR_TIMER_040_96 = 24, + IB_RNR_TIMER_061_44 = 25, + IB_RNR_TIMER_081_92 = 26, + IB_RNR_TIMER_122_88 = 27, + IB_RNR_TIMER_163_84 = 28, + IB_RNR_TIMER_245_76 = 29, + IB_RNR_TIMER_327_68 = 30, + IB_RNR_TIMER_491_52 = 31 +}; + +enum ib_qp_attr_mask { + IB_QP_STATE = 1, + IB_QP_CUR_STATE = (1<<1), + IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), + IB_QP_ACCESS_FLAGS = (1<<3), + IB_QP_PKEY_INDEX = (1<<4), + IB_QP_PORT = (1<<5), + IB_QP_QKEY = (1<<6), + IB_QP_AV = (1<<7), + IB_QP_PATH_MTU = (1<<8), + IB_QP_TIMEOUT = (1<<9), + IB_QP_RETRY_CNT = (1<<10), + IB_QP_RNR_RETRY = (1<<11), + IB_QP_RQ_PSN = (1<<12), + IB_QP_MAX_QP_RD_ATOMIC = (1<<13), + IB_QP_ALT_PATH = (1<<14), + IB_QP_MIN_RNR_TIMER = (1<<15), + IB_QP_SQ_PSN = (1<<16), + IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), + IB_QP_PATH_MIG_STATE = (1<<18), + IB_QP_CAP = (1<<19), + IB_QP_DEST_QPN = (1<<20) +}; + +enum ib_qp_state { + IB_QPS_RESET, + IB_QPS_INIT, + IB_QPS_RTR, + IB_QPS_RTS, + IB_QPS_SQD, + IB_QPS_SQE, + IB_QPS_ERR +}; + +enum ib_mig_state { + IB_MIG_MIGRATED, + IB_MIG_REARM, + IB_MIG_ARMED +}; + +struct ib_qp_attr { + enum ib_qp_state qp_state; + enum ib_qp_state cur_qp_state; + enum ib_mtu path_mtu; + enum ib_mig_state path_mig_state; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + u32 dest_qp_num; + int qp_access_flags; + struct ib_qp_cap cap; + struct ib_ah_attr ah_attr; + struct ib_ah_attr alt_ah_attr; + u16 pkey_index; + u16 alt_pkey_index; + u8 en_sqd_async_notify; + u8 sq_draining; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + u8 min_rnr_timer; + u8 port_num; + u8 timeout; + u8 retry_cnt; + u8 rnr_retry; + u8 alt_port_num; + u8 alt_timeout; +}; + +enum ib_wr_opcode { + IB_WR_RDMA_WRITE, + IB_WR_RDMA_WRITE_WITH_IMM, + IB_WR_SEND, + IB_WR_SEND_WITH_IMM, + IB_WR_RDMA_READ, + IB_WR_ATOMIC_CMP_AND_SWP, + IB_WR_ATOMIC_FETCH_AND_ADD +}; + +enum ib_send_flags { + IB_SEND_FENCE = 1, + IB_SEND_SIGNALED = (1<<1), + IB_SEND_SOLICITED = (1<<2), + IB_SEND_INLINE = (1<<3) +}; + +struct ib_sge { + u64 addr; + u32 length; + u32 lkey; +}; + +struct ib_send_wr { + struct ib_send_wr *next; + u64 wr_id; + struct ib_sge *sg_list; + int num_sge; + enum ib_wr_opcode opcode; + int send_flags; + __be32 imm_data; + union { + struct { + u64 remote_addr; + u32 rkey; + } rdma; + struct { + u64 remote_addr; + u64 compare_add; + u64 swap; + u32 rkey; + } atomic; + struct { + struct ib_ah *ah; + u32 remote_qpn; + u32 remote_qkey; + u16 pkey_index; /* valid for GSI only */ + u8 port_num; /* 
valid for DR SMPs on switch only */ + } ud; + } wr; +}; + +struct ib_recv_wr { + struct ib_recv_wr *next; + u64 wr_id; + struct ib_sge *sg_list; + int num_sge; +}; + +enum ib_access_flags { + IB_ACCESS_LOCAL_WRITE = 1, + IB_ACCESS_REMOTE_WRITE = (1<<1), + IB_ACCESS_REMOTE_READ = (1<<2), + IB_ACCESS_REMOTE_ATOMIC = (1<<3), + IB_ACCESS_MW_BIND = (1<<4) +}; + +struct ib_phys_buf { + u64 addr; + u64 size; +}; + +struct ib_mr_attr { + struct ib_pd *pd; + u64 device_virt_addr; + u64 size; + int mr_access_flags; + u32 lkey; + u32 rkey; +}; + +enum ib_mr_rereg_flags { + IB_MR_REREG_TRANS = 1, + IB_MR_REREG_PD = (1<<1), + IB_MR_REREG_ACCESS = (1<<2) +}; + +struct ib_mw_bind { + struct ib_mr *mr; + u64 wr_id; + u64 addr; + u32 length; + int send_flags; + int mw_access_flags; +}; + +struct ib_fmr_attr { + int max_pages; + int max_maps; + u8 page_shift; +}; + +/* + * XXX can this really be on 7 different lists at once? + * + */ +struct ib_ucontext { + struct ib_device *device; + TAILQ_ENTRY(ib_ucontext) pd_list; + TAILQ_ENTRY(ib_ucontext) mr_list; + TAILQ_ENTRY(ib_ucontext) mw_list; + TAILQ_ENTRY(ib_ucontext) cq_list; + TAILQ_ENTRY(ib_ucontext) qp_list; + TAILQ_ENTRY(ib_ucontext) srq_list; + TAILQ_ENTRY(ib_ucontext) ah_list; + int closing; +}; + +struct ib_uobject { + u64 user_handle; /* handle given to us by userspace */ + struct ib_ucontext *context; /* associated user context */ + void *object; /* containing object */ + TAILQ_ENTRY(ib_uobject) entry; /* link to context's list */ + u32 id; /* index into kernel idr */ + volatile uint32_t ref; + struct mtx lock; /* protects .live */ + int live; +}; + +struct ib_udata { + void *inbuf; + void *outbuf; + size_t inlen; + size_t outlen; +}; + +#define IB_UMEM_MAX_PAGE_CHUNK \ + ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \ + ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \ + (void *) &((struct ib_umem_chunk *) 0)->page_list[0])) + +struct ib_pd { + struct ib_device *device; + struct ib_uobject *uobject; + volatile int usecnt; /* count all resources */ +}; + +struct ib_ah { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; +}; + +typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); + +struct ib_cq { + struct ib_device *device; + struct ib_uobject *uobject; + ib_comp_handler comp_handler; + void (*event_handler)(struct ib_event *, void *); + void * cq_context; + int cqe; + volatile int usecnt; /* count number of work queues */ +}; + +struct ib_srq { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + void (*event_handler)(struct ib_event *, void *); + void *srq_context; + volatile int usecnt; +}; + +struct ib_qp { + struct ib_device *device; + struct ib_pd *pd; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_srq *srq; + struct ib_uobject *uobject; + void (*event_handler)(struct ib_event *, void *); + void *qp_context; + u32 qp_num; + enum ib_qp_type qp_type; +}; + +struct ib_mr { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + u32 lkey; + u32 rkey; + volatile int usecnt; /* count number of MWs */ +}; + +struct ib_mw { + struct ib_device *device; + struct ib_pd *pd; + struct ib_uobject *uobject; + u32 rkey; +}; + + +struct ib_fmr { + struct ib_device *device; + struct ib_pd *pd; + TAILQ_ENTRY(ib_fmr) entry; + u32 lkey; + u32 rkey; +}; + +TAILQ_HEAD(ib_fmr_list_head, ib_fmr); + +struct ib_mad; +struct ib_grh; + +enum ib_process_mad_flags { + IB_MAD_IGNORE_MKEY = 1, + IB_MAD_IGNORE_BKEY = 2, + IB_MAD_IGNORE_ALL = 
IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY +}; + +enum ib_mad_result { + IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ + IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ + IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ + IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ +}; + +#define IB_DEVICE_NAME_MAX 64 + +struct ib_cache { + struct mtx lock; + struct ib_event_handler event_handler; + struct ib_pkey_cache **pkey_cache; + struct ib_gid_cache **gid_cache; + u8 *lmc_cache; +}; + +struct ib_dma_mapping_ops { + int (*mapping_error)(struct ib_device *dev, + u64 dma_addr); + u64 (*map_single)(struct ib_device *dev, + void *ptr, size_t size, + enum dma_data_direction direction); + void (*unmap_single)(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction); + u64 (*map_page)(struct ib_device *dev, + void *page, unsigned long offset, + size_t size, + enum dma_data_direction direction); + void (*unmap_page)(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction); + int (*map_sg)(struct ib_device *dev, + struct rdma_scatterlist *sg, int nents, + enum dma_data_direction direction); + void (*unmap_sg)(struct ib_device *dev, + struct rdma_scatterlist *sg, int nents, + enum dma_data_direction direction); + u64 (*dma_address)(struct ib_device *dev, + struct rdma_scatterlist *sg); + unsigned int (*dma_len)(struct ib_device *dev, + struct rdma_scatterlist *sg); + void (*sync_single_for_cpu)(struct ib_device *dev, + u64 dma_handle, + size_t size, + enum dma_data_direction dir); + void (*sync_single_for_device)(struct ib_device *dev, + u64 dma_handle, + size_t size, + enum dma_data_direction dir); + void *(*alloc_coherent)(struct ib_device *dev, + size_t size, + u64 *dma_handle, + int flag); + void (*free_coherent)(struct ib_device *dev, + size_t size, void *cpu_addr, + u64 dma_handle); +}; + +struct iw_cm_verbs; + +struct ib_device { + struct device *dma_device; + + char name[IB_DEVICE_NAME_MAX]; + + TAILQ_HEAD(, ib_event_handler) event_handler_list; + struct mtx event_handler_lock; + + TAILQ_ENTRY(ib_device) core_list; + TAILQ_HEAD(, ib_client_data) client_data_list; + struct mtx client_data_lock; + + struct ib_cache cache; + int *pkey_tbl_len; + int *gid_tbl_len; + + u32 flags; + + int num_comp_vectors; + + struct iw_cm_verbs *iwcm; + + int (*query_device)(struct ib_device *device, + struct ib_device_attr *device_attr); + int (*query_port)(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr); + int (*query_gid)(struct ib_device *device, + u8 port_num, int index, + union ib_gid *gid); + int (*query_pkey)(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey); + int (*modify_device)(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify); + int (*modify_port)(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify); + struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device, + struct ib_udata *udata); + int (*dealloc_ucontext)(struct ib_ucontext *context); + int (*mmap)(struct ib_ucontext *context, + struct vm_object *vma); + struct ib_pd * (*alloc_pd)(struct ib_device *device, + struct ib_ucontext *context, + struct ib_udata *udata); + int (*dealloc_pd)(struct ib_pd *pd); + struct ib_ah * (*create_ah)(struct ib_pd *pd, + struct ib_ah_attr *ah_attr); + int (*modify_ah)(struct ib_ah *ah, + struct ib_ah_attr *ah_attr); + int (*query_ah)(struct ib_ah *ah, + 
struct ib_ah_attr *ah_attr); + int (*destroy_ah)(struct ib_ah *ah); + struct ib_srq * (*create_srq)(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + int (*modify_srq)(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata); + int (*query_srq)(struct ib_srq *srq, + struct ib_srq_attr *srq_attr); + int (*destroy_srq)(struct ib_srq *srq); + int (*post_srq_recv)(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + struct ib_qp * (*create_qp)(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + int (*modify_qp)(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata); + int (*query_qp)(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); + int (*destroy_qp)(struct ib_qp *qp); + int (*post_send)(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); + int (*post_recv)(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + struct ib_cq * (*create_cq)(struct ib_device *device, int cqe, + int comp_vector, + struct ib_ucontext *context, + struct ib_udata *udata); + int (*destroy_cq)(struct ib_cq *cq); + int (*resize_cq)(struct ib_cq *cq, int cqe, + struct ib_udata *udata); + int (*poll_cq)(struct ib_cq *cq, int num_entries, + struct ib_wc *wc); + int (*peek_cq)(struct ib_cq *cq, int wc_cnt); + int (*req_notify_cq)(struct ib_cq *cq, + enum ib_cq_notify_flags flags); + int (*req_ncomp_notif)(struct ib_cq *cq, + int wc_cnt); + struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, + int mr_access_flags); + struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, + u64 start, u64 length, + u64 virt_addr, + int mr_access_flags, + struct ib_udata *udata); + int (*query_mr)(struct ib_mr *mr, + struct ib_mr_attr *mr_attr); + int (*dereg_mr)(struct ib_mr *mr); + int (*rereg_phys_mr)(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + struct ib_mw * (*alloc_mw)(struct ib_pd *pd); + int (*bind_mw)(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); + int (*dealloc_mw)(struct ib_mw *mw); + struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + int (*map_phys_fmr)(struct ib_fmr *fmr, + u64 *page_list, int list_len, + u64 iova); + int (*unmap_fmr)(struct ib_fmr_list_head *fmr_list); + int (*dealloc_fmr)(struct ib_fmr *fmr); + int (*attach_mcast)(struct ib_qp *qp, + union ib_gid *gid, + u16 lid); + int (*detach_mcast)(struct ib_qp *qp, + union ib_gid *gid, + u16 lid); + int (*process_mad)(struct ib_device *device, + int process_mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, + struct ib_mad *out_mad); + + struct ib_dma_mapping_ops *dma_ops; + + struct module *owner; +#ifdef notyet + struct class_device class_dev; + struct kobject ports_parent; + struct list_head port_list; +#endif + enum { + IB_DEV_UNINITIALIZED, + IB_DEV_REGISTERED, + IB_DEV_UNREGISTERED + } reg_state; + + u64 uverbs_cmd_mask; + int uverbs_abi_ver; + + char node_desc[64]; + __be64 node_guid; + u8 node_type; + u8 phys_port_cnt; +}; + +struct ib_client { + char *name; + void 
(*add) (struct ib_device *); + void (*remove)(struct ib_device *); + TAILQ_ENTRY(ib_client) list; +}; + +struct ib_device *ib_alloc_device(size_t size); +void ib_dealloc_device(struct ib_device *device); + +int ib_register_device (struct ib_device *device); +void ib_unregister_device(struct ib_device *device); + +int ib_register_client (struct ib_client *client); +void ib_unregister_client(struct ib_client *client); + +void *ib_get_client_data(struct ib_device *device, struct ib_client *client); +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data); + +static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) +{ + return copyin(udata->inbuf, dest, len); +} + +static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) +{ + return copyout(src, udata->outbuf, len); +} + +/** + * ib_modify_qp_is_ok - Check that the supplied attribute mask + * contains all required attributes and no attributes not allowed for + * the given QP state transition. + * @cur_state: Current QP state + * @next_state: Next QP state + * @type: QP type + * @mask: Mask of supplied QP attributes + * + * This function is a helper function that a low-level driver's + * modify_qp method can use to validate the consumer's input. It + * checks that cur_state and next_state are valid QP states, that a + * transition from cur_state to next_state is allowed by the IB spec, + * and that the attribute mask supplied is allowed for the transition. + */ +int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask); + +int ib_register_event_handler (struct ib_event_handler *event_handler); +int ib_unregister_event_handler(struct ib_event_handler *event_handler); +void ib_dispatch_event(struct ib_event *event); + +int ib_query_device(struct ib_device *device, + struct ib_device_attr *device_attr); + +int ib_query_port(struct ib_device *device, + u8 port_num, struct ib_port_attr *port_attr); + +int ib_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid *gid); + +int ib_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey); + +int ib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify); + +int ib_modify_port(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify); + +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u8 *port_num, u16 *index); + +int ib_find_pkey(struct ib_device *device, + u8 port_num, u16 pkey, u16 *index); + +/** + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + */ +struct ib_pd *ib_alloc_pd(struct ib_device *device); + +/** + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + */ +int ib_dealloc_pd(struct ib_pd *pd); + +/** + * ib_create_ah - Creates an address handle for the given address vector. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. 
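+ *
+ * A minimal sketch (editor's illustration; "remote_lid" and the port
+ * number are placeholders, not values defined by this header), where pd
+ * was obtained from ib_alloc_pd():
+ *
+ *	struct ib_ah_attr attr = {
+ *		.dlid     = remote_lid,
+ *		.sl       = 0,
+ *		.port_num = 1,
+ *	};
+ *	struct ib_ah *ah = ib_create_ah(pd, &attr);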
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_wc - Initializes address handle attributes from a
+ *   work completion.
+ * @device: Device on which the received message arrived.
+ * @port_num: Port on which the received message arrived.
+ * @wc: Work completion associated with the received message.
+ * @grh: References the received global route header.  This parameter is
+ *   ignored unless the work completion indicates that the GRH is valid.
+ * @ah_attr: Returned attributes that can be used when creating an address
+ *   handle for replying to the message.
+ */
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+		       struct ib_grh *grh, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_create_ah_from_wc - Creates an address handle associated with the
+ *   sender of the specified work completion.
+ * @pd: The protection domain associated with the address handle.
+ * @wc: Work completion information associated with a received message.
+ * @grh: References the received global route header.  This parameter is
+ *   ignored unless the work completion indicates that the GRH is valid.
+ * @port_num: The outbound port number to associate with the address.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+				   struct ib_grh *grh, u8 port_num);
+
+/**
+ * ib_modify_ah - Modifies the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ *   address handle.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_query_ah - Queries the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ *   handle.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_destroy_ah - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ */
+int ib_destroy_ah(struct ib_ah *ah);
+
+/**
+ * ib_create_srq - Creates an SRQ associated with the specified protection
+ *   domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
+ *   the actual capabilities of the created SRQ.
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read to determine the
+ * requested size of the SRQ, and set to the actual values allocated
+ * on return.  If ib_create_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr);
+
+/**
+ * ib_modify_srq - Modifies the attributes for the specified SRQ.
+ * @srq: The SRQ to modify.
+ * @srq_attr: On input, specifies the SRQ attributes to modify.  On output,
+ *   the current values of selected SRQ attributes are returned.
+ * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
+ *   are being modified.
+ *
+ * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
+ * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
+ * the number of receives queued drops below the limit.
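+ *
+ * For example (editor's sketch; the threshold is a placeholder), arming
+ * the limit so that an IB_EVENT_SRQ_LIMIT_REACHED event fires once
+ * fewer than 16 posted receives remain:
+ *
+ *	struct ib_srq_attr attr = { .srq_limit = 16 };
+ *	ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);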
+ */ +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask); + +/** + * ib_query_srq - Returns the attribute list and current values for the + * specified SRQ. + * @srq: The SRQ to query. + * @srq_attr: The attributes of the specified SRQ. + */ +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr); + +/** + * ib_destroy_srq - Destroys the specified SRQ. + * @srq: The SRQ to destroy. + */ +int ib_destroy_srq(struct ib_srq *srq); + +/** + * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. + * @srq: The SRQ to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + */ +static inline int ib_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr); +} + +/** + * ib_create_qp - Creates a QP associated with the specified protection + * domain. + * @pd: The protection domain associated with the QP. + * @qp_init_attr: A list of initial attributes required to create the + * QP. If QP creation succeeds, then the attributes are updated to + * the actual capabilities of the created QP. + */ +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/** + * ib_modify_qp - Modifies the attributes for the specified QP and then + * transitions the QP to the given state. + * @qp: The QP to modify. + * @qp_attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @qp_attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + */ +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask); + +/** + * ib_query_qp - Returns the attribute list and current values for the + * specified QP. + * @qp: The QP to query. + * @qp_attr: The attributes of the specified QP. + * @qp_attr_mask: A bit-mask used to select specific attributes to query. + * @qp_init_attr: Additional attributes of the selected QP. + * + * The qp_attr_mask may be used to limit the query to gathering only the + * selected attributes. + */ +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); + +/** + * ib_destroy_qp - Destroys the specified QP. + * @qp: The QP to destroy. + */ +int ib_destroy_qp(struct ib_qp *qp); + +/** + * ib_post_send - Posts a list of work requests to the send queue of + * the specified QP. + * @qp: The QP to post the work request on. + * @send_wr: A list of work requests to post on the send queue. + * @bad_send_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. + */ +static inline int ib_post_send(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr) +{ + return qp->device->post_send(qp, send_wr, bad_send_wr); +} + +/** + * ib_post_recv - Posts a list of work requests to the receive queue of + * the specified QP. + * @qp: The QP to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the QP. 
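+ *
+ * A minimal sketch (editor's illustration; "sge" names a previously
+ * prepared ib_sge for a registered buffer):
+ *
+ *	struct ib_recv_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1 };
+ *	struct ib_recv_wr *bad_wr;
+ *	int rc = ib_post_recv(qp, &wr, &bad_wr);
+ *	... on failure, bad_wr points at the first work request
+ *	    that could not be posted ...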
+ */ +static inline int ib_post_recv(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return qp->device->post_recv(qp, recv_wr, bad_recv_wr); +} + +/** + * ib_create_cq - Creates a CQ on the specified device. + * @device: The device on which to create the CQ. + * @comp_handler: A user-specified callback that is invoked when a + * completion event occurs on the CQ. + * @event_handler: A user-specified callback that is invoked when an + * asynchronous event not associated with a completion occurs on the CQ. + * @cq_context: Context associated with the CQ returned to the user via + * the associated completion and event handlers. + * @cqe: The minimum size of the CQ. + * @comp_vector - Completion vector used to signal completion events. + * Must be >= 0 and < context->num_comp_vectors. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, int cqe, int comp_vector); + +/** + * ib_resize_cq - Modifies the capacity of the CQ. + * @cq: The CQ to resize. + * @cqe: The minimum size of the CQ. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +int ib_resize_cq(struct ib_cq *cq, int cqe); + +/** + * ib_destroy_cq - Destroys the specified CQ. + * @cq: The CQ to destroy. + */ +int ib_destroy_cq(struct ib_cq *cq); + +/** + * ib_poll_cq - poll a CQ for completion(s) + * @cq:the CQ being polled + * @num_entries:maximum number of completions to return + * @wc:array of at least @num_entries &struct ib_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and < num_entries, then the CQ was emptied. + */ +static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + return cq->device->poll_cq(cq, num_entries, wc); +} + +/** + * ib_peek_cq - Returns the number of unreaped completions currently + * on the specified CQ. + * @cq: The CQ to peek. + * @wc_cnt: A minimum number of unreaped completions to check for. + * + * If the number of unreaped completions is greater than or equal to wc_cnt, + * this function returns wc_cnt, otherwise, it returns the actual number of + * unreaped completions. + */ +int ib_peek_cq(struct ib_cq *cq, int wc_cnt); + +/** + * ib_req_notify_cq - Request completion notification on a CQ. + * @cq: The CQ to generate an event for. + * @flags: + * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP + * to request an event on the next solicited event or next work + * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS + * may also be |ed in to request a hint about missed events, as + * described below. + * + * Return Value: + * < 0 means an error occurred while requesting notification + * == 0 means notification was requested successfully, and if + * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events + * were missed and it is safe to wait for another event. In + * this case is it guaranteed that any work completions added + * to the CQ since the last CQ poll will trigger a completion + * notification event. + * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed + * in. 
It means that the consumer must poll the CQ again to + * make sure it is empty to avoid missing an event because of a + * race between requesting notification and an entry being + * added to the CQ. This return value means it is possible + * (but not guaranteed) that a work completion has been added + * to the CQ since the last poll without triggering a + * completion notification event. + */ +static inline int ib_req_notify_cq(struct ib_cq *cq, + enum ib_cq_notify_flags flags) +{ + return cq->device->req_notify_cq(cq, flags); +} + +/** + * ib_req_ncomp_notif - Request completion notification when there are + * at least the specified number of unreaped completions on the CQ. + * @cq: The CQ to generate an event for. + * @wc_cnt: The number of unreaped completions that should be on the + * CQ before an event is generated. + */ +static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt) +{ + return cq->device->req_ncomp_notif ? + cq->device->req_ncomp_notif(cq, wc_cnt) : + ENOSYS; +} + +/** + * ib_get_dma_mr - Returns a memory region for system memory that is + * usable for DMA. + * @pd: The protection domain associated with the memory region. + * @mr_access_flags: Specifies the memory access rights. + * + * Note that the ib_dma_*() functions defined below must be used + * to create/destroy addresses used with the Lkey or Rkey returned + * by ib_get_dma_mr(). + */ +struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags); +#ifdef notyet +/** + * ib_dma_mapping_error - check a DMA addr for error + * @dev: The device for which the dma_addr was created + * @dma_addr: The DMA address to check + */ +static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + if (dev->dma_ops) + return dev->dma_ops->mapping_error(dev, dma_addr); + return dma_mapping_error(dma_addr); +} + +/** + * ib_dma_map_single - Map a kernel virtual address to DMA address + * @dev: The device for which the dma_addr is to be created + * @cpu_addr: The kernel virtual address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline u64 ib_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + return dev->dma_ops->map_single(dev, cpu_addr, size, direction); + return dma_map_single(dev->dma_device, cpu_addr, size, direction); +} + +/** + * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + dev->dma_ops->unmap_single(dev, addr, size, direction); + else + dma_unmap_single(dev->dma_device, addr, size, direction); +} + +/** + * ib_dma_map_page - Map a physical page to DMA address + * @dev: The device for which the dma_addr is to be created + * @page: The page to be mapped + * @offset: The offset within the page + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline u64 ib_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + return dev->dma_ops->map_page(dev, page, offset, size, direction); + return dma_map_page(dev->dma_device, page, offset, size, direction); +} + +/** + * 
ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + dev->dma_ops->unmap_page(dev, addr, size, direction); + else + dma_unmap_page(dev->dma_device, addr, size, direction); +} + +/** + * ib_dma_map_sg - Map a scatter/gather list to DMA addresses + * @dev: The device for which the DMA addresses are to be created + * @sg: The array of scatter/gather entries + * @nents: The number of scatter/gather entries + * @direction: The direction of the DMA + */ +static inline int ib_dma_map_sg(struct ib_device *dev, + struct rdma_scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + return dev->dma_ops->map_sg(dev, sg, nents, direction); + return dma_map_sg(dev->dma_device, sg, nents, direction); +} + +/** + * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses + * @dev: The device for which the DMA addresses were created + * @sg: The array of scatter/gather entries + * @nents: The number of scatter/gather entries + * @direction: The direction of the DMA + */ +static inline void ib_dma_unmap_sg(struct ib_device *dev, + struct rdma_scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + if (dev->dma_ops) + dev->dma_ops->unmap_sg(dev, sg, nents, direction); + else + dma_unmap_sg(dev->dma_device, sg, nents, direction); +} + +/** + * ib_sg_dma_address - Return the DMA address from a scatter/gather entry + * @dev: The device for which the DMA addresses were created + * @sg: The scatter/gather entry + */ +static inline u64 ib_sg_dma_address(struct ib_device *dev, + struct rdma_scatterlist *sg) +{ + if (dev->dma_ops) + return dev->dma_ops->dma_address(dev, sg); + return sg_dma_address(sg); +} + +/** + * ib_sg_dma_len - Return the DMA length from a scatter/gather entry + * @dev: The device for which the DMA addresses were created + * @sg: The scatter/gather entry + */ +static inline unsigned int ib_sg_dma_len(struct ib_device *dev, + struct rdma_scatterlist *sg) +{ + if (dev->dma_ops) + return dev->dma_ops->dma_len(dev, sg); + return sg_dma_len(sg); +} + +/** + * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @dir: The direction of the DMA + */ +static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ + if (dev->dma_ops) + dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir); + else + dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); +} + +/** + * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device + * @dev: The device for which the DMA address was created + * @addr: The DMA address + * @size: The size of the region in bytes + * @dir: The direction of the DMA + */ +static inline void ib_dma_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ + if (dev->dma_ops) + dev->dma_ops->sync_single_for_device(dev, addr, size, dir); + else + dma_sync_single_for_device(dev->dma_device, addr, size, dir); +} + +/** + * ib_dma_alloc_coherent - Allocate memory and map it for DMA + * @dev: The device for which the DMA 
address is requested + * @size: The size of the region to allocate in bytes + * @dma_handle: A pointer for returning the DMA address of the region + * @flag: memory allocator flags + */ +static inline void *ib_dma_alloc_coherent(struct ib_device *dev, + size_t size, + u64 *dma_handle, + gfp_t flag) +{ + if (dev->dma_ops) + return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag); + else { + dma_addr_t handle; + void *ret; + + ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag); + *dma_handle = handle; + return ret; + } +} + +/** + * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent() + * @dev: The device for which the DMA addresses were allocated + * @size: The size of the region + * @cpu_addr: the address returned by ib_dma_alloc_coherent() + * @dma_handle: the DMA address returned by ib_dma_alloc_coherent() + */ +static inline void ib_dma_free_coherent(struct ib_device *dev, + size_t size, void *cpu_addr, + u64 dma_handle) +{ + if (dev->dma_ops) + dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); + else + dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle); +} +#endif +/** + * ib_reg_phys_mr - Prepares a virtually addressed memory region for use + * by an HCA. + * @pd: The protection domain associated assigned to the registered region. + * @phys_buf_array: Specifies a list of physical buffers to use in the + * memory region. + * @num_phys_buf: Specifies the size of the phys_buf_array. + * @mr_access_flags: Specifies the memory access rights. + * @iova_start: The offset of the region's starting I/O virtual address. + */ +struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + +/** + * ib_rereg_phys_mr - Modifies the attributes of an existing memory region. + * Conceptually, this call performs the functions deregister memory region + * followed by register physical memory region. Where possible, + * resources are reused instead of deallocated and reallocated. + * @mr: The memory region to modify. + * @mr_rereg_mask: A bit-mask used to indicate which of the following + * properties of the memory region are being modified. + * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies + * the new protection domain to associated with the memory region, + * otherwise, this parameter is ignored. + * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this + * field specifies a list of physical buffers to use in the new + * translation, otherwise, this parameter is ignored. + * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this + * field specifies the size of the phys_buf_array, otherwise, this + * parameter is ignored. + * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this + * field specifies the new memory access rights, otherwise, this + * parameter is ignored. + * @iova_start: The offset of the region's starting I/O virtual address. + */ +int ib_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start); + +/** + * ib_query_mr - Retrieves information about a specific memory region. + * @mr: The memory region to retrieve information about. + * @mr_attr: The attributes of the specified memory region. + */ +int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); + +/** + * ib_dereg_mr - Deregisters a memory region and removes it from the + * HCA translation table. 
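+ * (Editor's note on the usual ordering assumption: the region should no
+ * longer be referenced by outstanding work requests or bound memory
+ * windows when it is deregistered.)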
+ * @mr: The memory region to deregister. + */ +int ib_dereg_mr(struct ib_mr *mr); + +/** + * ib_alloc_mw - Allocates a memory window. + * @pd: The protection domain associated with the memory window. + */ +struct ib_mw *ib_alloc_mw(struct ib_pd *pd); + +/** + * ib_bind_mw - Posts a work request to the send queue of the specified + * QP, which binds the memory window to the given address range and + * remote access attributes. + * @qp: QP to post the bind work request on. + * @mw: The memory window to bind. + * @mw_bind: Specifies information about the memory window, including + * its address range, remote access rights, and associated memory region. + */ +static inline int ib_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + /* XXX reference counting in corresponding MR? */ + return mw->device->bind_mw ? + mw->device->bind_mw(qp, mw, mw_bind) : + ENOSYS; +} + +/** + * ib_dealloc_mw - Deallocates a memory window. + * @mw: The memory window to deallocate. + */ +int ib_dealloc_mw(struct ib_mw *mw); + +/** + * ib_alloc_fmr - Allocates a unmapped fast memory region. + * @pd: The protection domain associated with the unmapped region. + * @mr_access_flags: Specifies the memory access rights. + * @fmr_attr: Attributes of the unmapped region. + * + * A fast memory region must be mapped before it can be used as part of + * a work request. + */ +struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +/** + * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. + * @fmr: The fast memory region to associate with the pages. + * @page_list: An array of physical pages to map to the fast memory region. + * @list_len: The number of pages in page_list. + * @iova: The I/O virtual address to use with the mapped region. + */ +static inline int ib_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, int list_len, + u64 iova) +{ + return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova); +} + +/** + * ib_unmap_fmr - Removes the mapping from a list of fast memory regions. + * @fmr_list: A linked list of fast memory regions to unmap. + */ +int ib_unmap_fmr(struct ib_fmr_list_head *fmr_list); + +/** + * ib_dealloc_fmr - Deallocates a fast memory region. + * @fmr: The fast memory region to deallocate. + */ +int ib_dealloc_fmr(struct ib_fmr *fmr); + +/** + * ib_attach_mcast - Attaches the specified QP to a multicast group. + * @qp: QP to attach to the multicast group. The QP must be type + * IB_QPT_UD. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + * + * In order to send and receive multicast packets, subnet + * administration must have created the multicast group and configured + * the fabric appropriately. The port associated with the specified + * QP must also be a member of the multicast group. + */ +int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +/** + * ib_detach_mcast - Detaches the specified QP from a multicast group. + * @qp: QP to detach from the multicast group. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + */ +int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +#endif /* IB_VERBS_H */ diff --git a/sys/contrib/rdma/iw_cm.h b/sys/contrib/rdma/iw_cm.h new file mode 100644 index 000000000000..e594d669375a --- /dev/null +++ b/sys/contrib/rdma/iw_cm.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $FreeBSD$ + */ +#ifndef IW_CM_H +#define IW_CM_H + +#include <contrib/rdma/ib_cm.h> + +#include <sys/socket.h> +#include <sys/socketvar.h> + +struct iw_cm_id; + +enum iw_cm_event_type { + IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */ + IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */ + IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */ + IW_CM_EVENT_DISCONNECT, /* orderly shutdown */ + IW_CM_EVENT_CLOSE /* close complete */ +}; + +enum iw_cm_event_status { + IW_CM_EVENT_STATUS_OK = 0, /* request successful */ + IW_CM_EVENT_STATUS_ACCEPTED = 0, /* connect request accepted */ + IW_CM_EVENT_STATUS_REJECTED, /* connect request rejected */ + IW_CM_EVENT_STATUS_TIMEOUT, /* the operation timed out */ + IW_CM_EVENT_STATUS_RESET, /* reset from remote peer */ + IW_CM_EVENT_STATUS_EINVAL, /* asynchronous failure for bad parm */ +}; + +struct iw_cm_event { + enum iw_cm_event_type event; + enum iw_cm_event_status status; + struct sockaddr_in local_addr; + struct sockaddr_in remote_addr; + void *private_data; + u8 private_data_len; + void* provider_data; + struct socket *so; +}; + +/** + * iw_cm_handler - Function to be called by the IW CM when delivering events + * to the client. + * + * @cm_id: The IW CM identifier associated with the event. + * @event: Pointer to the event structure. + */ +typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id, + struct iw_cm_event *event); + +/** + * iw_event_handler - Function called by the provider when delivering provider + * events to the IW CM. Returns either 0 indicating the event was processed + * or -errno if the event could not be processed. + * + * @cm_id: The IW CM identifier associated with the event. + * @event: Pointer to the event structure. 
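+ *
+ * Both handler types share this signature.  A client's iw_cm_handler
+ * (editor's sketch; "my_conn_param" is a placeholder) typically just
+ * dispatches on the event type:
+ *
+ *	static int my_cm_handler(struct iw_cm_id *id, struct iw_cm_event *ev)
+ *	{
+ *		switch (ev->event) {
+ *		case IW_CM_EVENT_CONNECT_REQUEST:
+ *			return iw_cm_accept(id, &my_conn_param);
+ *		default:
+ *			return 0;
+ *		}
+ *	}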
+ */
+typedef int (*iw_event_handler)(struct iw_cm_id *cm_id,
+				struct iw_cm_event *event);
+
+struct iw_cm_id {
+	iw_cm_handler		cm_handler;	/* client callback function */
+	void			*context;	/* client cb context */
+	struct ib_device	*device;
+	struct sockaddr_in	local_addr;
+	struct sockaddr_in	remote_addr;
+	void			*provider_data;	/* provider private data */
+	iw_event_handler	event_handler;	/* cb for provider
+						   events */
+	/* Used by provider to add and remove refs on IW cm_id */
+	void (*add_ref)(struct iw_cm_id *);
+	void (*rem_ref)(struct iw_cm_id *);
+	struct socket		*so;
+};
+
+struct iw_cm_conn_param {
+	const void *private_data;
+	u16 private_data_len;
+	u32 ord;
+	u32 ird;
+	u32 qpn;
+};
+
+struct iw_cm_verbs {
+	void		(*add_ref)(struct ib_qp *qp);
+
+	void		(*rem_ref)(struct ib_qp *qp);
+
+	struct ib_qp *	(*get_qp)(struct ib_device *device,
+				  int qpn);
+
+	int		(*connect)(struct iw_cm_id *cm_id,
+				   struct iw_cm_conn_param *conn_param);
+
+	int		(*accept)(struct iw_cm_id *cm_id,
+				  struct iw_cm_conn_param *conn_param);
+
+	int		(*reject)(struct iw_cm_id *cm_id,
+				  const void *pdata, u8 pdata_len);
+
+	int		(*create_listen)(struct iw_cm_id *cm_id,
+					 int backlog);
+
+	int		(*destroy_listen)(struct iw_cm_id *cm_id);
+};
+
+/**
+ * iw_create_cm_id - Create an IW CM identifier.
+ *
+ * @device: The IB device on which to create the IW CM identifier.
+ * @so: The socket to be used for establishing the rdma connection.
+ * @cm_handler: User callback invoked to report events associated with the
+ *   returned IW CM identifier.
+ * @context: User specified context associated with the id.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 struct socket *so,
+				 iw_cm_handler cm_handler, void *context);
+
+/**
+ * iw_destroy_cm_id - Destroy an IW CM identifier.
+ *
+ * @cm_id: The previously created IW CM identifier to destroy.
+ *
+ * The client can assume that no events will be delivered for the CM ID after
+ * this function returns.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id);
+
+/**
+ * iw_cm_unbind_qp - Unbind the specified IW CM identifier and QP.
+ *
+ * @cm_id: The IW CM identifier to unbind from the QP.
+ * @qp: The QP
+ *
+ * This is called by the provider when destroying the QP to ensure
+ * that any references held by the IWCM are released. It may also
+ * be called by the IWCM when destroying a CM_ID so that any
+ * references held by the provider are released.
+ */
+void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp);
+
+/**
+ * iw_cm_get_qp - Return the ib_qp associated with a QPN.
+ *
+ * @device: The IB device
+ * @qpn: The queue pair number
+ */
+struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn);
+
+/**
+ * iw_cm_listen - Listen for incoming connection requests on the
+ * specified IW CM id.
+ *
+ * @cm_id: The IW CM identifier.
+ * @backlog: The maximum number of outstanding un-accepted inbound listen
+ *   requests to queue.
+ *
+ * The source address and port number are specified in the IW CM identifier
+ * structure.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog);
+
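+/*
+ * Editor's sketch of passive-side setup using the calls above ("ibdev",
+ * "so", "my_cm_handler", "ctx" and "listen_addr" are placeholders):
+ *
+ *	struct iw_cm_id *id = iw_create_cm_id(ibdev, so, my_cm_handler, ctx);
+ *	id->local_addr = listen_addr;	... hypothetical sockaddr_in ...
+ *	if (iw_cm_listen(id, 8))
+ *		iw_destroy_cm_id(id);
+ */
+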
+ +/** + * iw_cm_reject - Reject an incoming connection request. + * + * @cm_id: Connection identifier associated with the request. + * @private_data: Pointer to data to deliver to the remote peer as part of the + * reject message. + * @private_data_len: The number of bytes in the private_data parameter. + * + * The client can assume that no events will be delivered to the specified IW + * CM identifier following the return of this function. The private_data + * buffer is available for reuse when this function returns. + */ +int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, + u8 private_data_len); + +/** + * iw_cm_connect - Called to request a connection to a remote peer. + * + * @cm_id: The IW CM identifier for the connection. + * @iw_param: Pointer to a structure containing connection establishment + * parameters. + * + * Events may be delivered to the specified IW CM identifier prior to the + * return of this function. If this function returns a non-zero value, the + * client can assume that no events will be delivered to the specified IW CM + * identifier. + */ +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param); + +/** + * iw_cm_disconnect - Close the specified connection. + * + * @cm_id: The IW CM identifier to close. + * @abrupt: If 0, the connection will be closed gracefully, otherwise, the + * connection will be reset. + * + * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is + * delivered. + */ +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt); + +/** + * iw_cm_init_qp_attr - Called to initialize the attributes of the QP + * associated with an IW CM identifier. + * + * @cm_id: The IW CM identifier associated with the QP + * @qp_attr: Pointer to the QP attributes structure. + * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are + * valid. + */ +int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +#endif /* IW_CM_H */ diff --git a/sys/contrib/rdma/krping/getopt.c b/sys/contrib/rdma/krping/getopt.c new file mode 100644 index 000000000000..701910ea8b33 --- /dev/null +++ b/sys/contrib/rdma/krping/getopt.c @@ -0,0 +1,77 @@ +/* + * lifted from fs/ncpfs/getopt.c + * + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/ctype.h> +#include <sys/param.h> +#include <sys/libkern.h> +#include "getopt.h" + +/** + * krping_getopt - option parser + * @caller: name of the caller, for error messages + * @options: the options string + * @opts: an array of &struct krping_option entries controlling parser + * operations + * @optopt: output; will contain the current option + * @optarg: output; will contain the value (if one exists) + * @value: output; will be overwritten with the integer value of the current + * argument for OPT_INT options. + * + * Helper to parse options in the format used by mount ("a=b,c=d,e,f"). + * Returns opts->val if a matching entry in the 'opts' array is found, + * 0 when no more tokens are found, or a negative errno value if an error is + * encountered.
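+ *
+ * A short usage sketch (illustrative; krping.c below drives the parser the
+ * same way with a larger table):
+ *
+ *	static const struct krping_option opts[] = {
+ *		{"addr",  OPT_STRING,  'a'},
+ *		{"port",  OPT_INT,     'p'},
+ *		{"debug", OPT_NOPARAM, 'd'},
+ *		{NULL, 0, 0}
+ *	};
+ *	char *optarg;
+ *	unsigned long optint;
+ *	int op;
+ *
+ *	while ((op = krping_getopt("demo", &cmd, opts, NULL, &optarg,
+ *	    &optint)) > 0) {
+ *		switch (op) {
+ *		case 'a': ... break;	string value in optarg
+ *		case 'p': ... break;	integer value in optint
+ *		case 'd': ... break;	flag, no argument
+ *		}
+ *	}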
+ */ +int krping_getopt(const char *caller, char **options, + const struct krping_option *opts, char **optopt, + char **optarg, unsigned long *value) +{ + char *token; + char *val; + + do { + if ((token = strsep(options, ",")) == NULL) + return 0; + } while (*token == '\0'); + if (optopt) + *optopt = token; + + if ((val = strchr (token, '=')) != NULL) { + *val++ = 0; + } + *optarg = val; + for (; opts->name; opts++) { + if (!strcmp(opts->name, token)) { + if (!val) { + if (opts->has_arg & OPT_NOPARAM) { + return opts->val; + } + printf("%s: the %s option requires " + "an argument\n", caller, token); + return -EINVAL; + } + if (opts->has_arg & OPT_INT) { + char* v; + + *value = strtoul(val, &v, 0); + if (!*v) { + return opts->val; + } + printf("%s: invalid numeric value " + "in %s=%s\n", caller, token, val); + return -EDOM; + } + if (opts->has_arg & OPT_STRING) { + return opts->val; + } + printf("%s: unexpected argument %s to the " + "%s option\n", caller, val, token); + return -EINVAL; + } + } + printf("%s: Unrecognized option %s\n", caller, token); + return -EOPNOTSUPP; +} diff --git a/sys/contrib/rdma/krping/getopt.h b/sys/contrib/rdma/krping/getopt.h new file mode 100644 index 000000000000..610ec7625424 --- /dev/null +++ b/sys/contrib/rdma/krping/getopt.h @@ -0,0 +1,21 @@ +/* + * lifted from fs/ncpfs/getopt.c + * + * $FreeBSD$ + */ +#ifndef _KRPING_GETOPT_H +#define _KRPING_GETOPT_H + +#define OPT_NOPARAM 1 +#define OPT_INT 2 +#define OPT_STRING 4 +struct krping_option { + const char *name; + unsigned int has_arg; + int val; +}; + +extern int krping_getopt(const char *caller, char **options, const struct krping_option *opts, + char **optopt, char **optarg, unsigned long *value); + +#endif /* _KRPING_GETOPT_H */ diff --git a/sys/contrib/rdma/krping/krping.c b/sys/contrib/rdma/krping/krping.c new file mode 100644 index 000000000000..202daf85eda6 --- /dev/null +++ b/sys/contrib/rdma/krping/krping.c @@ -0,0 +1,1865 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/ctype.h> + +#include <sys/param.h> +#include <sys/condvar.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/module.h> +#include <sys/endian.h> +#include <sys/limits.h> +#include <sys/proc.h> +#include <sys/signalvar.h> + +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/syslog.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <contrib/rdma/rdma_cm.h> + +#include "getopt.h" +#include "krping.h" + +#define PFX "krping: " + +static int debug = 0; +#define DEBUG_LOG if (debug) printf + +static const struct krping_option krping_opts[] = { + {"count", OPT_INT, 'C'}, + {"size", OPT_INT, 'S'}, + {"addr", OPT_STRING, 'a'}, + {"port", OPT_INT, 'p'}, + {"verbose", OPT_NOPARAM, 'v'}, + {"validate", OPT_NOPARAM, 'V'}, + {"server", OPT_NOPARAM, 's'}, + {"client", OPT_NOPARAM, 'c'}, + {"dmamr", OPT_NOPARAM, 'D'}, + {"debug", OPT_NOPARAM, 'd'}, + {"wlat", OPT_NOPARAM, 'l'}, + {"rlat", OPT_NOPARAM, 'L'}, + {"bw", OPT_NOPARAM, 'B'}, + {"tx-depth", OPT_INT, 't'}, + {"poll", OPT_NOPARAM, 'P'}, + {NULL, 0, 0} +}; + +struct mtx krping_mutex; + +/* + * List of running krping threads. + */ +struct krping_cb_list krping_cbs; + +/* + * krping "ping/pong" loop: + * client sends source rkey/addr/len + * server receives source rkey/add/len + * server rdma reads "ping" data from source + * server sends "go ahead" on rdma read completion + * client sends sink rkey/addr/len + * server receives sink rkey/addr/len + * server rdma writes "pong" data to sink + * server sends "go ahead" on rdma write completion + * <repeat loop> + */ + +/* + * Default max buffer size for IO... + */ +#define RPING_BUFSIZE 128*1024 +#define RPING_SQ_DEPTH 32 + + +/* lifted from netinet/libalias/alias_proxy.c */ +static int inet_aton(const char *cp, struct in_addr *addr); +static int +inet_aton(cp, addr) + const char *cp; + struct in_addr *addr; +{ + u_long parts[4]; + in_addr_t val; + const char *c; + char *endptr; + int gotend, n; + + c = (const char *)cp; + n = 0; + /* + * Run through the string, grabbing numbers until + * the end of the string, or some error + */ + gotend = 0; + while (!gotend) { + unsigned long l; + + l = strtoul(c, &endptr, 0); + + if (l == ULONG_MAX || (l == 0 && endptr == c)) + return (0); + + val = (in_addr_t)l; + /* + * If the whole string is invalid, endptr will equal + * c.. this way we can make sure someone hasn't + * gone '.12' or something which would get past + * the next check. + */ + if (endptr == c) + return (0); + parts[n] = val; + c = endptr; + + /* Check the next character past the previous number's end */ + switch (*c) { + case '.' : + /* Make sure we only do 3 dots .. */ + if (n == 3) /* Whoops. Quit. */ + return (0); + n++; + c++; + break; + + case '\0': + gotend = 1; + break; + + default: + if (isspace((unsigned char)*c)) { + gotend = 1; + break; + } else + return (0); /* Invalid character, so fail */ + } + + } + + /* + * Concoct the address according to + * the number of parts specified. + */ + + switch (n) { + case 0: /* a -- 32 bits */ + /* + * Nothing is necessary here. Overflow checking was + * already done in strtoul(). 
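+ * For reference, the four accepted forms follow classic inet_aton(3)
+ * semantics: "0x01020304" parses as 1.2.3.4 (one 32-bit value),
+ * "1.2" as 1.0.0.2 (8.24 bits), "1.2.3" as 1.2.0.3 (8.8.16 bits),
+ * and "1.2.3.4" as 1.2.3.4 (8.8.8.8 bits).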
+ */ + break; + case 1: /* a.b -- 8.24 bits */ + if (val > 0xffffff || parts[0] > 0xff) + return (0); + val |= parts[0] << 24; + break; + + case 2: /* a.b.c -- 8.8.16 bits */ + if (val > 0xffff || parts[0] > 0xff || parts[1] > 0xff) + return (0); + val |= (parts[0] << 24) | (parts[1] << 16); + break; + + case 3: /* a.b.c.d -- 8.8.8.8 bits */ + if (val > 0xff || parts[0] > 0xff || parts[1] > 0xff || + parts[2] > 0xff) + return (0); + val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8); + break; + } + + if (addr != NULL) + addr->s_addr = htonl(val); + return (1); +} + + +static void krping_wait(struct krping_cb *cb, int state) +{ + int rc; + mtx_lock(&cb->lock); + while (cb->state < state) { + rc = msleep(cb, &cb->lock, 0, "krping", 0); + if (rc && rc != ERESTART) { + cb->state = ERROR; + break; + } + } + mtx_unlock(&cb->lock); +} + +static int krping_cma_event_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + int ret; + struct krping_cb *cb = cma_id->context; + + DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id, + (cma_id == cb->cm_id) ? "parent" : "child"); + + mtx_lock(&cb->lock); + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cb->state = ADDR_RESOLVED; + ret = rdma_resolve_route(cma_id, 2000); + if (ret) { + log(LOG_ERR, "rdma_resolve_route error %d\n", + ret); + wakeup(cb); + } + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cb->state = ROUTE_RESOLVED; + wakeup(cb); + break; + + case RDMA_CM_EVENT_CONNECT_REQUEST: + cb->state = CONNECT_REQUEST; + cb->child_cm_id = cma_id; + DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id); + wakeup(cb); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + DEBUG_LOG(PFX "ESTABLISHED\n"); + if (!cb->server) { + cb->state = CONNECTED; + wakeup(cb); + } + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + log(LOG_ERR, "cma event %d, error %d\n", event->event, + event->status); + cb->state = ERROR; + wakeup(cb); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + DEBUG_LOG(PFX "DISCONNECT EVENT...\n"); + cb->state = ERROR; + wakeup(cb); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + DEBUG_LOG(PFX "cma detected device removal!!!!\n"); + break; + + default: + log(LOG_ERR, "oof bad type!\n"); + wakeup(cb); + break; + } + mtx_unlock(&cb->lock); + return 0; +} + +static int server_recv(struct krping_cb *cb, struct ib_wc *wc) +{ + if (wc->byte_len != sizeof(cb->recv_buf)) { + log(LOG_ERR, "Received bogus data, size %d\n", + wc->byte_len); + return -1; + } + + cb->remote_rkey = ntohl(cb->recv_buf.rkey); + cb->remote_addr = ntohll(cb->recv_buf.buf); + cb->remote_len = ntohl(cb->recv_buf.size); + DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n", + cb->remote_rkey, (unsigned long long)cb->remote_addr, + cb->remote_len); + + if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE) + cb->state = RDMA_READ_ADV; + else + cb->state = RDMA_WRITE_ADV; + + return 0; +} + +static int client_recv(struct krping_cb *cb, struct ib_wc *wc) +{ + if (wc->byte_len != sizeof(cb->recv_buf)) { + log(LOG_ERR, "Received bogus data, size %d\n", + wc->byte_len); + return -1; + } + + if (cb->state == RDMA_READ_ADV) + cb->state = RDMA_WRITE_ADV; + else + cb->state = RDMA_WRITE_COMPLETE; + + return 0; +} + +static void krping_cq_event_handler(struct ib_cq *cq, void *ctx) +{ + struct krping_cb *cb = ctx; + struct ib_wc wc; + struct ib_recv_wr *bad_wr; + int ret; + + 
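+	/*
+	 * The 16 bytes decoded by server_recv()/client_recv() above are the
+	 * krping_rdma_info advertisement (defined in krping.h); it crosses
+	 * the wire in network byte order. A sketch of the encode/decode
+	 * pairing (krping_format_send() below is the real encode side):
+	 *
+	 *	info->buf  = htonll(addr);	// 64-bit TO
+	 *	info->rkey = htonl(mr->rkey);	// 32-bit STAG/rkey
+	 *	info->size = htonl(len);	// 32-bit length
+	 *
+	 *	cb->remote_addr = ntohll(cb->recv_buf.buf);
+	 *	cb->remote_rkey = ntohl(cb->recv_buf.rkey);
+	 *	cb->remote_len  = ntohl(cb->recv_buf.size);
+	 */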
mtx_lock(&cb->lock); + KASSERT(cb->cq == cq, ("bad condition")); + if (cb->state == ERROR) { + log(LOG_ERR, "cq completion in ERROR state\n"); + mtx_unlock(&cb->lock); + return; + } + if (!cb->wlat && !cb->rlat && !cb->bw) + ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); + while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) { + if (wc.status) { + if (wc.status != IB_WC_WR_FLUSH_ERR) + log(LOG_ERR, "cq completion failed status %d\n", + wc.status); + goto error; + } + + switch (wc.opcode) { + case IB_WC_SEND: + DEBUG_LOG(PFX "send completion\n"); + cb->stats.send_bytes += cb->send_sgl.length; + cb->stats.send_msgs++; + break; + + case IB_WC_RDMA_WRITE: + DEBUG_LOG(PFX "rdma write completion\n"); + cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length; + cb->stats.write_msgs++; + cb->state = RDMA_WRITE_COMPLETE; + wakeup(cb); + break; + + case IB_WC_RDMA_READ: + DEBUG_LOG(PFX "rdma read completion\n"); + cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length; + cb->stats.read_msgs++; + cb->state = RDMA_READ_COMPLETE; + wakeup(cb); + break; + + case IB_WC_RECV: + DEBUG_LOG(PFX "recv completion\n"); + cb->stats.recv_bytes += sizeof(cb->recv_buf); + cb->stats.recv_msgs++; + if (cb->wlat || cb->rlat || cb->bw) + ret = server_recv(cb, &wc); + else + ret = cb->server ? server_recv(cb, &wc) : + client_recv(cb, &wc); + if (ret) { + log(LOG_ERR, "recv wc error: %d\n", ret); + goto error; + } + + ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post recv error: %d\n", + ret); + goto error; + } + wakeup(cb); + break; + + default: + log(LOG_ERR, "unknown!!!!! completion\n"); + goto error; + } + } + if (ret) { + log(LOG_ERR, "poll error %d\n", ret); + goto error; + } + mtx_unlock(&cb->lock); + return; +error: + cb->state = ERROR; + wakeup(cb); + mtx_unlock(&cb->lock); +} + +static int krping_accept(struct krping_cb *cb) +{ + struct rdma_conn_param conn_param; + int ret; + + DEBUG_LOG(PFX "accepting client connection request\n"); + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 1; + conn_param.initiator_depth = 1; + + ret = rdma_accept(cb->child_cm_id, &conn_param); + if (ret) { + log(LOG_ERR, "rdma_accept error: %d\n", ret); + return ret; + } + + if (!cb->wlat && !cb->rlat && !cb->bw) { + krping_wait(cb, CONNECTED); + if (cb->state == ERROR) { + log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); + return -1; + } + } + return 0; +} + +static void krping_setup_wr(struct krping_cb *cb) +{ + /* XXX X86 only here... not mapping for dma! 
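+	 * vtophys() just translates a kernel virtual address to a physical
+	 * one; that is only safe here because rdma_buf/start_buf come from
+	 * contigmalloc() and the two 16-byte info structs are assumed not to
+	 * span a page boundary. A portable version would go through a real
+	 * DMA mapping layer instead (note the #if 0 dma_unmap_single()
+	 * calls in krping_free_buffers() below).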
*/ + cb->recv_sgl.addr = vtophys(&cb->recv_buf); + cb->recv_sgl.length = sizeof cb->recv_buf; + if (cb->use_dmamr) + cb->recv_sgl.lkey = cb->dma_mr->lkey; + else + cb->recv_sgl.lkey = cb->recv_mr->lkey; + cb->rq_wr.sg_list = &cb->recv_sgl; + cb->rq_wr.num_sge = 1; + + cb->send_sgl.addr = vtophys(&cb->send_buf); + cb->send_sgl.length = sizeof cb->send_buf; + if (cb->use_dmamr) + cb->send_sgl.lkey = cb->dma_mr->lkey; + else + cb->send_sgl.lkey = cb->send_mr->lkey; + + cb->sq_wr.opcode = IB_WR_SEND; + cb->sq_wr.send_flags = IB_SEND_SIGNALED; + cb->sq_wr.sg_list = &cb->send_sgl; + cb->sq_wr.num_sge = 1; + + cb->rdma_addr = vtophys(cb->rdma_buf); + cb->rdma_sgl.addr = cb->rdma_addr; + if (cb->use_dmamr) + cb->rdma_sgl.lkey = cb->dma_mr->lkey; + else + cb->rdma_sgl.lkey = cb->rdma_mr->lkey; + cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED; + cb->rdma_sq_wr.sg_list = &cb->rdma_sgl; + cb->rdma_sq_wr.num_sge = 1; + + if (!cb->server || cb->wlat || cb->rlat || cb->bw) { + cb->start_addr = vtophys(cb->start_buf); + } +} + +static int krping_setup_buffers(struct krping_cb *cb) +{ + int ret; + struct ib_phys_buf buf; + u64 iovbase; + + DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb); + + if (cb->use_dmamr) { + cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE| + IB_ACCESS_REMOTE_READ| + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(cb->dma_mr)) { + log(LOG_ERR, "reg_dmamr failed\n"); + return PTR_ERR(cb->dma_mr); + } + } else { + + buf.addr = vtophys(&cb->recv_buf); + buf.size = sizeof cb->recv_buf; + iovbase = vtophys(&cb->recv_buf); + cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1, + IB_ACCESS_LOCAL_WRITE, + &iovbase); + + if (IS_ERR(cb->recv_mr)) { + log(LOG_ERR, "recv_buf reg_mr failed\n"); + return PTR_ERR(cb->recv_mr); + } + + buf.addr = vtophys(&cb->send_buf); + buf.size = sizeof cb->send_buf; + iovbase = vtophys(&cb->send_buf); + cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1, + 0, &iovbase); + + if (IS_ERR(cb->send_mr)) { + log(LOG_ERR, "send_buf reg_mr failed\n"); + ib_dereg_mr(cb->recv_mr); + return PTR_ERR(cb->send_mr); + } + } + + cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL, + PAGE_SIZE, 0); + + if (!cb->rdma_buf) { + log(LOG_ERR, "rdma_buf malloc failed\n"); + ret = ENOMEM; + goto err1; + } + if (!cb->use_dmamr) { + + buf.addr = vtophys(cb->rdma_buf); + buf.size = cb->size; + iovbase = vtophys(cb->rdma_buf); + cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1, + IB_ACCESS_REMOTE_READ| + IB_ACCESS_REMOTE_WRITE, + &iovbase); + + if (IS_ERR(cb->rdma_mr)) { + log(LOG_ERR, "rdma_buf reg_mr failed\n"); + ret = PTR_ERR(cb->rdma_mr); + goto err2; + } + } + + if (!cb->server || cb->wlat || cb->rlat || cb->bw) { + cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, + 0, -1UL, PAGE_SIZE, 0); + if (!cb->start_buf) { + log(LOG_ERR, "start_buf malloc failed\n"); + ret = ENOMEM; + goto err2; + } + if (!cb->use_dmamr) { + unsigned flags = IB_ACCESS_REMOTE_READ; + + if (cb->wlat || cb->rlat || cb->bw) + flags |= IB_ACCESS_REMOTE_WRITE; + buf.addr = vtophys(cb->start_buf); + buf.size = cb->size; + iovbase = vtophys(cb->start_buf); + cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1, + flags, + &iovbase); + + if (IS_ERR(cb->start_mr)) { + log(LOG_ERR, "start_buf reg_mr failed\n"); + ret = PTR_ERR(cb->start_mr); + goto err3; + } + } + } + + krping_setup_wr(cb); + DEBUG_LOG(PFX "allocated & registered buffers...\n"); + return 0; +err3: + contigfree(cb->start_buf, cb->size, M_DEVBUF); + + if (!cb->use_dmamr) + ib_dereg_mr(cb->rdma_mr); +err2: + contigfree(cb->rdma_buf, cb->size, 
M_DEVBUF); +err1: + if (cb->use_dmamr) + ib_dereg_mr(cb->dma_mr); + else { + ib_dereg_mr(cb->recv_mr); + ib_dereg_mr(cb->send_mr); + } + return ret; +} + +static void krping_free_buffers(struct krping_cb *cb) +{ + DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb); + +#if 0 + dma_unmap_single(cb->pd->device->dma_device, + pci_unmap_addr(cb, recv_mapping), + sizeof(cb->recv_buf), DMA_BIDIRECTIONAL); + dma_unmap_single(cb->pd->device->dma_device, + pci_unmap_addr(cb, send_mapping), + sizeof(cb->send_buf), DMA_BIDIRECTIONAL); + dma_unmap_single(cb->pd->device->dma_device, + pci_unmap_addr(cb, rdma_mapping), + cb->size, DMA_BIDIRECTIONAL); +#endif + contigfree(cb->rdma_buf, cb->size, M_DEVBUF); + if (!cb->server || cb->wlat || cb->rlat || cb->bw) { +#if 0 + dma_unmap_single(cb->pd->device->dma_device, + pci_unmap_addr(cb, start_mapping), + cb->size, DMA_BIDIRECTIONAL); +#endif + contigfree(cb->start_buf, cb->size, M_DEVBUF); + } + if (cb->use_dmamr) + ib_dereg_mr(cb->dma_mr); + else { + ib_dereg_mr(cb->send_mr); + ib_dereg_mr(cb->recv_mr); + ib_dereg_mr(cb->rdma_mr); + if (!cb->server) + ib_dereg_mr(cb->start_mr); + } +} + +static int krping_create_qp(struct krping_cb *cb) +{ + struct ib_qp_init_attr init_attr; + int ret; + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.cap.max_send_wr = cb->txdepth; + init_attr.cap.max_recv_wr = 2; + init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = 1; + init_attr.qp_type = IB_QPT_RC; + init_attr.send_cq = cb->cq; + init_attr.recv_cq = cb->cq; + + if (cb->server) { + ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr); + if (!ret) + cb->qp = cb->child_cm_id->qp; + } else { + ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr); + if (!ret) + cb->qp = cb->cm_id->qp; + } + + return ret; +} + +static void krping_free_qp(struct krping_cb *cb) +{ + ib_destroy_qp(cb->qp); + ib_destroy_cq(cb->cq); + ib_dealloc_pd(cb->pd); +} + +static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id) +{ + int ret; + cb->pd = ib_alloc_pd(cm_id->device); + if (IS_ERR(cb->pd)) { + log(LOG_ERR, "ib_alloc_pd failed\n"); + return PTR_ERR(cb->pd); + } + DEBUG_LOG(PFX "created pd %p\n", cb->pd); + + cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL, + cb, cb->txdepth * 2, 0); + if (IS_ERR(cb->cq)) { + log(LOG_ERR, "ib_create_cq failed\n"); + ret = PTR_ERR(cb->cq); + goto err1; + } + DEBUG_LOG(PFX "created cq %p\n", cb->cq); + + if (!cb->wlat && !cb->rlat && !cb->bw) { + ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); + if (ret) { + log(LOG_ERR, "ib_create_cq failed\n"); + goto err2; + } + } + + ret = krping_create_qp(cb); + if (ret) { + log(LOG_ERR, "krping_create_qp failed: %d\n", ret); + goto err2; + } + DEBUG_LOG(PFX "created qp %p\n", cb->qp); + return 0; +err2: + ib_destroy_cq(cb->cq); +err1: + ib_dealloc_pd(cb->pd); + return ret; +} + +static void krping_format_send(struct krping_cb *cb, u64 buf, + struct ib_mr *mr) +{ + struct krping_rdma_info *info = &cb->send_buf; + + info->buf = htonll(buf); + info->rkey = htonl(mr->rkey); + info->size = htonl(cb->size); + + DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n", + (unsigned long long)buf, mr->rkey, cb->size); +} + +static void krping_test_server(struct krping_cb *cb) +{ + struct ib_send_wr *bad_wr; + int ret; + + while (1) { + /* Wait for client's Start STAG/TO/Len */ + krping_wait(cb, RDMA_READ_ADV); + if (cb->state != RDMA_READ_ADV) { + DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n", + cb->state); + break; + } + + DEBUG_LOG(PFX "server received sink adv\n"); + + 
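+		/*
+		 * The krping_wait() calls in this loop are satisfied by
+		 * krping_cq_event_handler() above, which uses the usual
+		 * re-arm-then-drain CQ discipline. Distilled (sketch;
+		 * handle_wc() stands in for the switch on wc.opcode):
+		 *
+		 *	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+		 *	while (ib_poll_cq(cq, 1, &wc) == 1)
+		 *		handle_wc(&wc);
+		 *
+		 * Re-arming before draining ensures that a completion landing
+		 * between the final poll and the next interrupt is not lost.
+		 */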
/* Issue RDMA Read. */ + cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = cb->remote_len; + + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + break; + } + DEBUG_LOG(PFX "server posted rdma read req \n"); + + /* Wait for read completion */ + krping_wait(cb, RDMA_READ_COMPLETE); + if (cb->state != RDMA_READ_COMPLETE) { + log(LOG_ERR, + "wait for RDMA_READ_COMPLETE state %d\n", + cb->state); + break; + } + DEBUG_LOG(PFX "server received read complete\n"); + + /* Display data in recv buf */ + if (cb->verbose) + DEBUG_LOG("server ping data: %s\n", cb->rdma_buf); + + /* Tell client to continue */ + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + break; + } + DEBUG_LOG(PFX "server posted go ahead\n"); + + /* Wait for client's RDMA STAG/TO/Len */ + krping_wait(cb, RDMA_WRITE_ADV); + if (cb->state != RDMA_WRITE_ADV) { + log(LOG_ERR, + "wait for RDMA_WRITE_ADV state %d\n", + cb->state); + break; + } + DEBUG_LOG(PFX "server received sink adv\n"); + + /* RDMA Write echo data */ + cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1; + DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n", + cb->rdma_sq_wr.sg_list->lkey, + (unsigned long long)cb->rdma_sq_wr.sg_list->addr, + cb->rdma_sq_wr.sg_list->length); + + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + break; + } + + /* Wait for completion */ + krping_wait(cb, RDMA_WRITE_COMPLETE); + if (cb->state != RDMA_WRITE_COMPLETE) { + log(LOG_ERR, + "wait for RDMA_WRITE_COMPLETE state %d\n", + cb->state); + break; + } + DEBUG_LOG(PFX "server rdma write complete \n"); + + cb->state = CONNECTED; + + /* Tell client to begin again */ + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + break; + } + DEBUG_LOG(PFX "server posted go ahead\n"); + } +} + +static void rlat_test(struct krping_cb *cb) +{ + int scnt; + int iters = cb->count; + struct timeval start_tv, stop_tv; + int ret; + struct ib_wc wc; + struct ib_send_wr *bad_wr; + int ne; + + scnt = 0; + cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = cb->size; + + microtime(&start_tv); + if (!cb->poll) { + cb->state = RDMA_READ_ADV; + ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); + } + while (scnt < iters) { + + cb->state = RDMA_READ_ADV; + ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, + "Couldn't post send: ret=%d scnt %d\n", + ret, scnt); + return; + } + + do { + if (!cb->poll) { + krping_wait(cb, RDMA_READ_COMPLETE); + if (cb->state == RDMA_READ_COMPLETE) { + ne = 1; + ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP); + } else { + ne = -1; + } + } else + ne = ib_poll_cq(cb->cq, 1, &wc); + if (cb->state == ERROR) { + log(LOG_ERR, + "state == ERROR...bailing scnt %d\n", scnt); + return; + } + } while (ne == 0); + + if (ne < 0) { + log(LOG_ERR, "poll CQ failed %d\n", ne); + return; + } + if (cb->poll && wc.status != IB_WC_SUCCESS) { + log(LOG_ERR, "Completion wth error at %s:\n", + cb->server ? 
"server" : "client"); + log(LOG_ERR, "Failed status %d: wr_id %d\n", + wc.status, (int) wc.wr_id); + return; + } + ++scnt; + } + microtime(&stop_tv); + + if (stop_tv.tv_usec < start_tv.tv_usec) { + stop_tv.tv_usec += 1000000; + stop_tv.tv_sec -= 1; + } + + log(LOG_ERR, "delta sec %lu delta usec %lu iter %d size %d\n", + stop_tv.tv_sec - start_tv.tv_sec, + stop_tv.tv_usec - start_tv.tv_usec, + scnt, cb->size); +} + +static int alloc_cycle_mem(int cycle_iters, + cycles_t **post_cycles_start, + cycles_t **post_cycles_stop, + cycles_t **poll_cycles_start, + cycles_t **poll_cycles_stop, + cycles_t **last_poll_cycles_start) +{ + *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); + if (!*post_cycles_start) { + goto fail1; + } + *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); + if (!*post_cycles_stop) { + goto fail2; + } + *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); + if (!*poll_cycles_start) { + goto fail3; + } + *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); + if (!*poll_cycles_stop) { + goto fail4; + } + *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK); + if (!*last_poll_cycles_start) { + goto fail5; + } + return 0; +fail5: + free(*poll_cycles_stop, M_DEVBUF); +fail4: + free(*poll_cycles_start, M_DEVBUF); +fail3: + free(*post_cycles_stop, M_DEVBUF); +fail2: + free(*post_cycles_start, M_DEVBUF); +fail1: + log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); + return ENOMEM; +} + +static void free_cycle_mem(cycles_t *post_cycles_start, + cycles_t *post_cycles_stop, + cycles_t *poll_cycles_start, + cycles_t *poll_cycles_stop, + cycles_t *last_poll_cycles_start) +{ + free(last_poll_cycles_start, M_DEVBUF); + free(poll_cycles_stop, M_DEVBUF); + free(poll_cycles_start, M_DEVBUF); + free(post_cycles_stop, M_DEVBUF); + free(post_cycles_start, M_DEVBUF); +} + +static void wlat_test(struct krping_cb *cb) +{ + int ccnt, scnt, rcnt; + int iters=cb->count; + volatile char *poll_buf = (char *) cb->start_buf; + char *buf = (char *)cb->rdma_buf; + ccnt = 0; + scnt = 0; + rcnt = 0; + struct timeval start_tv, stop_tv; + cycles_t *post_cycles_start, *post_cycles_stop; + cycles_t *poll_cycles_start, *poll_cycles_stop; + cycles_t *last_poll_cycles_start; + cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; + int i; + int cycle_iters = 1000; + int err; + + err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, + &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); + + if (err) { + log(LOG_ERR, "%s malloc failed\n", __FUNCTION__); + return; + } + + cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = cb->size; + + if (cycle_iters > iters) + cycle_iters = iters; + microtime(&start_tv); + while (scnt < iters || ccnt < iters || rcnt < iters) { + + /* Wait till buffer changes. 
*/ + if (rcnt < iters && !(scnt < 1 && !cb->server)) { + ++rcnt; + while (*poll_buf != (char)rcnt) { + if (cb->state == ERROR) { + log(LOG_ERR, "state = ERROR, bailing\n"); + return; + } + } + } + + if (scnt < iters) { + struct ib_send_wr *bad_wr; + + *buf = (char)scnt+1; + if (scnt < cycle_iters) + post_cycles_start[scnt] = get_cycles(); + if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { + log(LOG_ERR, "Couldn't post send: scnt=%d\n", + scnt); + return; + } + if (scnt < cycle_iters) + post_cycles_stop[scnt] = get_cycles(); + scnt++; + } + + if (ccnt < iters) { + struct ib_wc wc; + int ne; + + if (ccnt < cycle_iters) + poll_cycles_start[ccnt] = get_cycles(); + do { + if (ccnt < cycle_iters) + last_poll_cycles_start[ccnt] = get_cycles(); + ne = ib_poll_cq(cb->cq, 1, &wc); + } while (ne == 0); + if (ccnt < cycle_iters) + poll_cycles_stop[ccnt] = get_cycles(); + ++ccnt; + + if (ne < 0) { + log(LOG_ERR, "poll CQ failed %d\n", ne); + return; + } + if (wc.status != IB_WC_SUCCESS) { + log(LOG_ERR, "Completion wth error at %s:\n", + cb->server ? "server" : "client"); + log(LOG_ERR, "Failed status %d: wr_id %d\n", + wc.status, (int) wc.wr_id); + log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n", + scnt, rcnt, ccnt); + return; + } + } + } + microtime(&stop_tv); + + if (stop_tv.tv_usec < start_tv.tv_usec) { + stop_tv.tv_usec += 1000000; + stop_tv.tv_sec -= 1; + } + + for (i=0; i < cycle_iters; i++) { + sum_post += post_cycles_stop[i] - post_cycles_start[i]; + sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; + sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; + } + + log(LOG_ERR, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", + stop_tv.tv_sec - start_tv.tv_sec, + stop_tv.tv_usec - start_tv.tv_usec, + scnt, cb->size, cycle_iters, + (unsigned long long)sum_post, (unsigned long long)sum_poll, + (unsigned long long)sum_last_poll); + + free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, + poll_cycles_stop, last_poll_cycles_start); +} + +static void bw_test(struct krping_cb *cb) +{ + int ccnt, scnt, rcnt; + int iters=cb->count; + ccnt = 0; + scnt = 0; + rcnt = 0; + struct timeval start_tv, stop_tv; + cycles_t *post_cycles_start, *post_cycles_stop; + cycles_t *poll_cycles_start, *poll_cycles_stop; + cycles_t *last_poll_cycles_start; + cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0; + int i; + int cycle_iters = 1000; + int err; + + err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop, + &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start); + + if (err) { + log(LOG_ERR, "%s kmalloc failed\n", __FUNCTION__); + return; + } + + cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = cb->size; + + if (cycle_iters > iters) + cycle_iters = iters; + microtime(&start_tv); + while (scnt < iters || ccnt < iters) { + + while (scnt < iters && scnt - ccnt < cb->txdepth) { + struct ib_send_wr *bad_wr; + + if (scnt < cycle_iters) + post_cycles_start[scnt] = get_cycles(); + if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { + log(LOG_ERR, "Couldn't post send: scnt=%d\n", + scnt); + return; + } + if (scnt < cycle_iters) + post_cycles_stop[scnt] = get_cycles(); + ++scnt; + } + + if (ccnt < iters) { + int ne; + struct ib_wc wc; + + if (ccnt < cycle_iters) + poll_cycles_start[ccnt] = get_cycles(); + do { + if (ccnt < cycle_iters) + last_poll_cycles_start[ccnt] = 
get_cycles(); + ne = ib_poll_cq(cb->cq, 1, &wc); + } while (ne == 0); + if (ccnt < cycle_iters) + poll_cycles_stop[ccnt] = get_cycles(); + ccnt += 1; + + if (ne < 0) { + log(LOG_ERR, "poll CQ failed %d\n", ne); + return; + } + if (wc.status != IB_WC_SUCCESS) { + log(LOG_ERR, "Completion wth error at %s:\n", + cb->server ? "server" : "client"); + log(LOG_ERR, "Failed status %d: wr_id %d\n", + wc.status, (int) wc.wr_id); + return; + } + } + } + microtime(&stop_tv); + + if (stop_tv.tv_usec < start_tv.tv_usec) { + stop_tv.tv_usec += 1000000; + stop_tv.tv_sec -= 1; + } + + for (i=0; i < cycle_iters; i++) { + sum_post += post_cycles_stop[i] - post_cycles_start[i]; + sum_poll += poll_cycles_stop[i] - poll_cycles_start[i]; + sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i]; + } + + log(LOG_ERR, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n", + stop_tv.tv_sec - start_tv.tv_sec, + stop_tv.tv_usec - start_tv.tv_usec, + scnt, cb->size, cycle_iters, + (unsigned long long)sum_post, (unsigned long long)sum_poll, + (unsigned long long)sum_last_poll); + + free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start, + poll_cycles_stop, last_poll_cycles_start); +} + +static void krping_rlat_test_server(struct krping_cb *cb) +{ + struct ib_send_wr *bad_wr; + struct ib_wc wc; + int ret; + + /* Spin waiting for client's Start STAG/TO/Len */ + while (cb->state < RDMA_READ_ADV) { + krping_cq_event_handler(cb->cq, cb); + } + + /* Send STAG/TO/Len to client */ + if (cb->dma_mr) + krping_format_send(cb, cb->start_addr, cb->dma_mr); + else + krping_format_send(cb, cb->start_addr, cb->start_mr); + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + return; + } + + /* Spin waiting for send completion */ + while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); + if (ret < 0) { + log(LOG_ERR, "poll error %d\n", ret); + return; + } + if (wc.status) { + log(LOG_ERR, "send completiong error %d\n", wc.status); + return; + } + + krping_wait(cb, ERROR); +} + +static void krping_wlat_test_server(struct krping_cb *cb) +{ + struct ib_send_wr *bad_wr; + struct ib_wc wc; + int ret; + + /* Spin waiting for client's Start STAG/TO/Len */ + while (cb->state < RDMA_READ_ADV) { + krping_cq_event_handler(cb->cq, cb); + } + + /* Send STAG/TO/Len to client */ + if (cb->dma_mr) + krping_format_send(cb, cb->start_addr, cb->dma_mr); + else + krping_format_send(cb, cb->start_addr, cb->start_mr); + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + return; + } + + /* Spin waiting for send completion */ + while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); + if (ret < 0) { + log(LOG_ERR, "poll error %d\n", ret); + return; + } + if (wc.status) { + log(LOG_ERR, "send completiong error %d\n", wc.status); + return; + } + + wlat_test(cb); + +} + +static void krping_bw_test_server(struct krping_cb *cb) +{ + struct ib_send_wr *bad_wr; + struct ib_wc wc; + int ret; + + /* Spin waiting for client's Start STAG/TO/Len */ + while (cb->state < RDMA_READ_ADV) { + krping_cq_event_handler(cb->cq, cb); + } + + /* Send STAG/TO/Len to client */ + if (cb->dma_mr) + krping_format_send(cb, cb->start_addr, cb->dma_mr); + else + krping_format_send(cb, cb->start_addr, cb->start_mr); + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + return; + } + + /* Spin waiting for send completion */ + while ((ret = 
ib_poll_cq(cb->cq, 1, &wc) == 0)); + if (ret < 0) { + log(LOG_ERR, "poll error %d\n", ret); + return; + } + if (wc.status) { + log(LOG_ERR, "send completiong error %d\n", wc.status); + return; + } + + if (cb->duplex) + bw_test(cb); + krping_wait(cb, ERROR); +} + +static int krping_bind_server(struct krping_cb *cb) +{ + struct sockaddr_in sin; + int ret; + + memset(&sin, 0, sizeof(sin)); + sin.sin_len = sizeof sin; + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = cb->addr.s_addr; + sin.sin_port = cb->port; + + ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin); + if (ret) { + log(LOG_ERR, "rdma_bind_addr error %d\n", ret); + return ret; + } + DEBUG_LOG(PFX "rdma_bind_addr successful\n"); + + DEBUG_LOG(PFX "rdma_listen\n"); + ret = rdma_listen(cb->cm_id, 3); + if (ret) { + log(LOG_ERR, "rdma_listen failed: %d\n", ret); + return ret; + } + + krping_wait(cb, CONNECT_REQUEST); + if (cb->state != CONNECT_REQUEST) { + log(LOG_ERR, "wait for CONNECT_REQUEST state %d\n", + cb->state); + return -1; + } + + return 0; +} + +static void krping_run_server(struct krping_cb *cb) +{ + struct ib_recv_wr *bad_wr; + int ret; + + ret = krping_bind_server(cb); + if (ret) + return; + + ret = krping_setup_qp(cb, cb->child_cm_id); + if (ret) { + log(LOG_ERR, "setup_qp failed: %d\n", ret); + return; + } + + ret = krping_setup_buffers(cb); + if (ret) { + log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); + goto err1; + } + + ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "ib_post_recv failed: %d\n", ret); + goto err2; + } + + ret = krping_accept(cb); + if (ret) { + log(LOG_ERR, "connect error %d\n", ret); + goto err2; + } + + if (cb->wlat) + krping_wlat_test_server(cb); + else if (cb->rlat) + krping_rlat_test_server(cb); + else if (cb->bw) + krping_bw_test_server(cb); + else + krping_test_server(cb); + + rdma_disconnect(cb->child_cm_id); + rdma_destroy_id(cb->child_cm_id); +err2: + krping_free_buffers(cb); +err1: + krping_free_qp(cb); +} + +static void krping_test_client(struct krping_cb *cb) +{ + int ping, start, cc, i, ret; + struct ib_send_wr *bad_wr; + unsigned char c; + + start = 65; + for (ping = 0; !cb->count || ping < cb->count; ping++) { + cb->state = RDMA_READ_ADV; + + /* Put some ascii text in the buffer. */ + cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping); + for (i = cc, c = start; i < cb->size; i++) { + cb->start_buf[i] = c; + c++; + if (c > 122) + c = 65; + } + start++; + if (start > 122) + start = 65; + cb->start_buf[cb->size - 1] = 0; + + if (cb->dma_mr) + krping_format_send(cb, cb->start_addr, cb->dma_mr); + else + krping_format_send(cb, cb->start_addr, cb->start_mr); + + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + break; + } + + /* Wait for server to ACK */ + krping_wait(cb, RDMA_WRITE_ADV); + if (cb->state != RDMA_WRITE_ADV) { + log(LOG_ERR, + "wait for RDMA_WRITE_ADV state %d\n", + cb->state); + break; + } + + if (cb->dma_mr) + krping_format_send(cb, cb->rdma_addr, cb->dma_mr); + else + krping_format_send(cb, cb->rdma_addr, cb->rdma_mr); + + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + break; + } + + /* Wait for the server to say the RDMA Write is complete. 
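+		 * The server's 16-byte send doubles as that go-ahead;
+		 * client_recv() infers its meaning purely from the current
+		 * state (RDMA_READ_ADV advances to RDMA_WRITE_ADV, anything
+		 * else to RDMA_WRITE_COMPLETE), since the message itself
+		 * carries no opcode.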
*/ + krping_wait(cb, RDMA_WRITE_COMPLETE); + if (cb->state != RDMA_WRITE_COMPLETE) { + log(LOG_ERR, + "wait for RDMA_WRITE_COMPLETE state %d\n", + cb->state); + break; + } + + if (cb->validate) + if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) { + log(LOG_ERR, "data mismatch!\n"); + break; + } + + if (cb->verbose) + DEBUG_LOG("ping data: %s\n", cb->rdma_buf); + } +} + +static void krping_rlat_test_client(struct krping_cb *cb) +{ + struct ib_send_wr *bad_wr; + struct ib_wc wc; + int ret; + + cb->state = RDMA_READ_ADV; + + /* Send STAG/TO/Len to client */ + if (cb->dma_mr) + krping_format_send(cb, cb->start_addr, cb->dma_mr); + else + krping_format_send(cb, cb->start_addr, cb->rdma_mr); + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + return; + } + + /* Spin waiting for send completion */ + while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); + if (ret < 0) { + log(LOG_ERR, "poll error %d\n", ret); + return; + } + if (wc.status) { + log(LOG_ERR, "send completion error %d\n", wc.status); + return; + } + + /* Spin waiting for server's Start STAG/TO/Len */ + while (cb->state < RDMA_WRITE_ADV) { + krping_cq_event_handler(cb->cq, cb); + } + +#if 0 +{ + int i; + struct timeval start, stop; + time_t sec; + suseconds_t usec; + unsigned long long elapsed; + struct ib_wc wc; + struct ib_send_wr *bad_wr; + int ne; + + cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE; + cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey; + cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr; + cb->rdma_sq_wr.sg_list->length = 0; + cb->rdma_sq_wr.num_sge = 0; + + microtime(&start); + for (i=0; i < 100000; i++) { + if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) { + log(LOG_ERR, "Couldn't post send\n"); + return; + } + do { + ne = ib_poll_cq(cb->cq, 1, &wc); + } while (ne == 0); + if (ne < 0) { + log(LOG_ERR, "poll CQ failed %d\n", ne); + return; + } + if (wc.status != IB_WC_SUCCESS) { + log(LOG_ERR, "Completion wth error at %s:\n", + cb->server ? 
"server" : "client"); + log(LOG_ERR, "Failed status %d: wr_id %d\n", + wc.status, (int) wc.wr_id); + return; + } + } + microtime(&stop); + + if (stop.tv_usec < start.tv_usec) { + stop.tv_usec += 1000000; + stop.tv_sec -= 1; + } + sec = stop.tv_sec - start.tv_sec; + usec = stop.tv_usec - start.tv_usec; + elapsed = sec * 1000000 + usec; + log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed); +} +#endif + + rlat_test(cb); +} + +static void krping_wlat_test_client(struct krping_cb *cb) +{ + struct ib_send_wr *bad_wr; + struct ib_wc wc; + int ret; + + cb->state = RDMA_READ_ADV; + + /* Send STAG/TO/Len to client */ + if (cb->dma_mr) + krping_format_send(cb, cb->start_addr, cb->dma_mr); + else + krping_format_send(cb, cb->start_addr, cb->start_mr); + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + return; + } + + /* Spin waiting for send completion */ + while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); + if (ret < 0) { + log(LOG_ERR, "poll error %d\n", ret); + return; + } + if (wc.status) { + log(LOG_ERR, "send completion error %d\n", wc.status); + return; + } + + /* Spin waiting for server's Start STAG/TO/Len */ + while (cb->state < RDMA_WRITE_ADV) { + krping_cq_event_handler(cb->cq, cb); + } + + wlat_test(cb); +} + +static void krping_bw_test_client(struct krping_cb *cb) +{ + struct ib_send_wr *bad_wr; + struct ib_wc wc; + int ret; + + cb->state = RDMA_READ_ADV; + + /* Send STAG/TO/Len to client */ + if (cb->dma_mr) + krping_format_send(cb, cb->start_addr, cb->dma_mr); + else + krping_format_send(cb, cb->start_addr, cb->start_mr); + ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "post send error %d\n", ret); + return; + } + + /* Spin waiting for send completion */ + while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0)); + if (ret < 0) { + log(LOG_ERR, "poll error %d\n", ret); + return; + } + if (wc.status) { + log(LOG_ERR, "send completion error %d\n", wc.status); + return; + } + + /* Spin waiting for server's Start STAG/TO/Len */ + while (cb->state < RDMA_WRITE_ADV) { + krping_cq_event_handler(cb->cq, cb); + } + + bw_test(cb); +} + +static int krping_connect_client(struct krping_cb *cb) +{ + struct rdma_conn_param conn_param; + int ret; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 1; + conn_param.initiator_depth = 1; + conn_param.retry_count = 10; + + ret = rdma_connect(cb->cm_id, &conn_param); + if (ret) { + log(LOG_ERR, "rdma_connect error %d\n", ret); + return ret; + } + + krping_wait(cb, CONNECTED); + if (cb->state == ERROR) { + log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state); + return -1; + } + + DEBUG_LOG(PFX "rdma_connect successful\n"); + return 0; +} + +static int krping_bind_client(struct krping_cb *cb) +{ + struct sockaddr_in sin; + int ret; + + memset(&sin, 0, sizeof(sin)); + sin.sin_len = sizeof sin; + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = cb->addr.s_addr; + sin.sin_port = cb->port; + + ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin, + 2000); + if (ret) { + log(LOG_ERR, "rdma_resolve_addr error %d\n", ret); + return ret; + } + + krping_wait(cb, ROUTE_RESOLVED); + if (cb->state != ROUTE_RESOLVED) { + log(LOG_ERR, + "addr/route resolution did not resolve: state %d\n", + cb->state); + return EINTR; + } + + DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n"); + return 0; +} + +static void krping_run_client(struct krping_cb *cb) +{ + struct ib_recv_wr *bad_wr; + int ret; + + ret = 
krping_bind_client(cb); + if (ret) + return; + + ret = krping_setup_qp(cb, cb->cm_id); + if (ret) { + log(LOG_ERR, "setup_qp failed: %d\n", ret); + return; + } + + ret = krping_setup_buffers(cb); + if (ret) { + log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret); + goto err1; + } + + ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr); + if (ret) { + log(LOG_ERR, "ib_post_recv failed: %d\n", ret); + goto err2; + } + + ret = krping_connect_client(cb); + if (ret) { + log(LOG_ERR, "connect error %d\n", ret); + goto err2; + } + + if (cb->wlat) + krping_wlat_test_client(cb); + else if (cb->rlat) + krping_rlat_test_client(cb); + else if (cb->bw) + krping_bw_test_client(cb); + else + krping_test_client(cb); + rdma_disconnect(cb->cm_id); +err2: + krping_free_buffers(cb); +err1: + krping_free_qp(cb); +} + +int krping_doit(char *cmd) +{ + struct krping_cb *cb; + int op; + int ret = 0; + char *optarg; + unsigned long optint; + debug = 0; + + cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK); + if (!cb) + return ENOMEM; + bzero(cb, sizeof *cb); + + mtx_lock(&krping_mutex); + TAILQ_INSERT_TAIL(&krping_cbs, cb, list); + mtx_unlock(&krping_mutex); + + cb->server = -1; + cb->state = IDLE; + cb->size = 64; + cb->txdepth = RPING_SQ_DEPTH; + mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF); + + while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg, + &optint)) != 0) { + switch (op) { + case 'a': + cb->addr_str = optarg; + DEBUG_LOG(PFX "ipaddr (%s)\n", optarg); + if (!inet_aton(optarg, &cb->addr)) { + log(LOG_ERR, "bad addr string %s\n", optarg); + ret = EINVAL; + } + break; + case 'D': + cb->use_dmamr = 1; + DEBUG_LOG(PFX "using dma mr\n"); + break; + case 'p': + cb->port = htons(optint); + DEBUG_LOG(PFX "port %d\n", (int)optint); + break; + case 'P': + cb->poll = 1; + DEBUG_LOG("server\n"); + break; + case 's': + cb->server = 1; + DEBUG_LOG(PFX "server\n"); + break; + case 'c': + cb->server = 0; + DEBUG_LOG(PFX "client\n"); + break; + case 'S': + cb->size = optint; + if ((cb->size < 1) || + (cb->size > RPING_BUFSIZE)) { + log(LOG_ERR, "Invalid size %d " + "(valid range is 1 to %d)\n", + cb->size, RPING_BUFSIZE); + ret = EINVAL; + } else + DEBUG_LOG(PFX "size %d\n", (int)optint); + break; + case 'C': + cb->count = optint; + if (cb->count < 0) { + log(LOG_ERR, "Invalid count %d\n", + cb->count); + ret = EINVAL; + } else + DEBUG_LOG(PFX "count %d\n", (int) cb->count); + break; + case 'v': + cb->verbose++; + DEBUG_LOG(PFX "verbose\n"); + break; + case 'V': + cb->validate++; + DEBUG_LOG(PFX "validate data\n"); + break; + case 'L': + cb->rlat++; + break; + case 'l': + cb->wlat++; + break; + case 'B': + cb->bw++; + break; + case 't': + cb->txdepth = optint; + DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth); + break; + case 'd': + debug++; + break; + default: + log(LOG_ERR, "unknown opt %s\n", optarg); + ret = EINVAL; + break; + } + } + if (ret) + goto out; + + if (cb->server == -1) { + log(LOG_ERR, "must be either client or server\n"); + ret = EINVAL; + goto out; + } + if ((cb->bw + cb->rlat + cb->wlat) > 1) { + log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n"); + ret = EINVAL; + goto out; + } + + + cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP); + if (IS_ERR(cb->cm_id)) { + ret = PTR_ERR(cb->cm_id); + log(LOG_ERR, "rdma_create_id error %d\n", ret); + goto out; + } + DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id); + if (cb->server) + krping_run_server(cb); + else + krping_run_client(cb); + DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id); + rdma_destroy_id(cb->cm_id); +out: 
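+	/*
+	 * Sketch of driving this entry point directly (values illustrative;
+	 * krping_dev.c below feeds it strings written to /dev/krping). The
+	 * buffer must be writable because krping_getopt() tokenizes it in
+	 * place with strsep():
+	 *
+	 *	char cmd[] = "client,addr=192.168.69.142,port=9999,"
+	 *	    "size=128,count=100,validate";
+	 *	error = krping_doit(cmd);
+	 */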
+ mtx_lock(&krping_mutex); + TAILQ_REMOVE(&krping_cbs, cb, list); + mtx_unlock(&krping_mutex); + free(cb, M_DEVBUF); + return ret; +} + +void krping_init(void) +{ + mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF); + TAILQ_INIT(&krping_cbs); +} diff --git a/sys/contrib/rdma/krping/krping.h b/sys/contrib/rdma/krping/krping.h new file mode 100644 index 000000000000..8578e7e7979b --- /dev/null +++ b/sys/contrib/rdma/krping/krping.h @@ -0,0 +1,128 @@ +/* + * $FreeBSD$ + */ +#include <contrib/rdma/ib_verbs.h> +#include <netinet/in.h> + +/* + * Krping header stuffs... + */ + +struct krping_stats { + unsigned send_bytes; + unsigned send_msgs; + unsigned recv_bytes; + unsigned recv_msgs; + unsigned write_bytes; + unsigned write_msgs; + unsigned read_bytes; + unsigned read_msgs; +}; + + +/* + * These states are used to signal events between the completion handler + * and the main client or server thread. + * + * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV, + * and RDMA_WRITE_COMPLETE for each ping. + */ +enum test_state { + IDLE = 1, + CONNECT_REQUEST, + ADDR_RESOLVED, + ROUTE_RESOLVED, + CONNECTED, + RDMA_READ_ADV, + RDMA_READ_COMPLETE, + RDMA_WRITE_ADV, + RDMA_WRITE_COMPLETE, + ERROR +}; + +struct krping_rdma_info { + uint64_t buf; + uint32_t rkey; + uint32_t size; +}; + +/* + * Control block struct. + */ +struct krping_cb { + int server; /* 0 iff client */ + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_qp *qp; + struct ib_mr *dma_mr; + int use_dmamr; + + struct ib_recv_wr rq_wr; /* recv work request record */ + struct ib_sge recv_sgl; /* recv single SGE */ + struct krping_rdma_info recv_buf;/* malloc'd buffer */ + struct ib_mr *recv_mr; + + struct ib_send_wr sq_wr; /* send work requrest record */ + struct ib_sge send_sgl; + struct krping_rdma_info send_buf;/* single send buf */ + struct ib_mr *send_mr; + + struct ib_send_wr rdma_sq_wr; /* rdma work request record */ + struct ib_sge rdma_sgl; /* rdma single SGE */ + char *rdma_buf; /* used as rdma sink */ + u64 rdma_addr; + struct ib_mr *rdma_mr; + + uint32_t remote_rkey; /* remote guys RKEY */ + uint64_t remote_addr; /* remote guys TO */ + uint32_t remote_len; /* remote guys LEN */ + + char *start_buf; /* rdma read src */ + u64 start_addr; + struct ib_mr *start_mr; + + enum test_state state; /* used for cond/signalling */ + struct mtx lock; + struct krping_stats stats; + + uint16_t port; /* dst port in NBO */ + struct in_addr addr; /* dst addr in NBO */ + char *addr_str; /* dst addr string */ + int verbose; /* verbose logging */ + int count; /* ping count */ + int size; /* ping data size */ + int validate; /* validate ping data */ + + /* CM stuff */ + struct rdma_cm_id *cm_id; /* connection on client side,*/ + /* listener on service side. 
*/ + struct rdma_cm_id *child_cm_id; /* connection on server side */ + TAILQ_ENTRY(krping_cb) list; + + int rlat; /* run read latency test */ + int wlat; /* run write latency test */ + int bw; /* run write bw test */ + int duplex; /* run write bw full duplex test */ + int poll; /* poll vs block in rlat */ + int txdepth; +}; + +static __inline uint64_t +get_cycles(void) +{ + u_int32_t low, high; + __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); + return (low | ((u_int64_t)high << 32)); +} + +#define htonll(x) htobe64((x)) +#define ntohll(x) be64toh((x)) + +typedef uint64_t cycles_t; + +extern struct mtx krping_mutex; +TAILQ_HEAD(krping_cb_list, krping_cb); +extern struct krping_cb_list krping_cbs; + +int krping_doit(char *cmd); +void krping_init(void); diff --git a/sys/contrib/rdma/krping/krping_dev.c b/sys/contrib/rdma/krping/krping_dev.c new file mode 100644 index 000000000000..448f19717578 --- /dev/null +++ b/sys/contrib/rdma/krping/krping_dev.c @@ -0,0 +1,180 @@ +/* + * This code lifted from: + * Simple `echo' pseudo-device KLD + * Murray Stokely + * Converted to 5.X by Søren (Xride) Straarup + */ + +/* + * /bin/echo "server,port=9999,addr=192.168.69.142,validate" > /dev/krping + * /bin/echo "client,port=9999,addr=192.168.69.142,validate" > /dev/krping + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/module.h> +#include <sys/systm.h> /* uprintf */ +#include <sys/errno.h> +#include <sys/param.h> /* defines used in kernel.h */ +#include <sys/kernel.h> /* types used in module initialization */ +#include <sys/conf.h> /* cdevsw struct */ +#include <sys/uio.h> /* uio struct */ +#include <sys/malloc.h> + +#include "krping.h" + +#define BUFFERSIZE 512 + +/* Function prototypes */ +static d_open_t krping_open; +static d_close_t krping_close; +static d_read_t krping_read; +static d_write_t krping_write; + +/* Character device entry points */ +static struct cdevsw krping_cdevsw = { + .d_version = D_VERSION, + .d_open = krping_open, + .d_close = krping_close, + .d_read = krping_read, + .d_write = krping_write, + .d_name = "krping", +}; + +typedef struct s_krping { + char msg[BUFFERSIZE]; + int len; +} krping_t; + +/* vars */ +static struct cdev *krping_dev; + +static int +krping_loader(struct module *m, int what, void *arg) +{ + int err = 0; + + switch (what) { + case MOD_LOAD: /* kldload */ + krping_init(); + krping_dev = make_dev(&krping_cdevsw, 0, UID_ROOT, GID_WHEEL, + 0600, "krping"); + printf("Krping device loaded.\n"); + break; + case MOD_UNLOAD: + destroy_dev(krping_dev); + printf("Krping device unloaded.\n"); + break; + default: + err = EOPNOTSUPP; + break; + } + return err; +} + +static int +krping_open(struct cdev *dev, int oflags, int devtype, struct thread *p) +{ + int err = 0; + return err; +} + +static int +krping_close(struct cdev *dev, int fflag, int devtype, struct thread *p) +{ + return 0; +} + +static int +krping_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + struct krping_cb *cb, *cb2; + int num=1; + struct krping_cb_list copy_cbs; + + uprintf("krping: %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", + "num", "device", "snd bytes", "snd msgs", "rcv bytes", + "rcv msgs", "wr bytes", "wr msgs", "rd bytes", "rd msgs"); + TAILQ_INIT(©_cbs); + + mtx_lock(&krping_mutex); + TAILQ_FOREACH(cb, &krping_cbs, list) { + cb2 = malloc(sizeof(*cb), M_DEVBUF, M_NOWAIT|M_ZERO); + if (!cb2) + break; + bcopy(cb, cb2, sizeof(*cb)); + TAILQ_INSERT_TAIL(©_cbs, cb2, list); + } + mtx_unlock(&krping_mutex); + + while (!TAILQ_EMPTY(©_cbs)) 
{ + + cb = TAILQ_FIRST(©_cbs); + TAILQ_REMOVE(©_cbs, cb, list); + if (cb->pd) { + uprintf("krping: %4d %10s %10u %10u %10u %10u %10u %10u %10u %10u\n", + num++, cb->pd->device->name, cb->stats.send_bytes, + cb->stats.send_msgs, cb->stats.recv_bytes, + cb->stats.recv_msgs, cb->stats.write_bytes, + cb->stats.write_msgs, + cb->stats.read_bytes, + cb->stats.read_msgs); + } else { + uprintf("krping: %d listen\n", num++); + } + free(cb, M_DEVBUF); + } + return 0; +} + +static int +krping_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + int err = 0; + int amt; + int remain = BUFFERSIZE; + char *cp; + krping_t *krpingmsg; + + krpingmsg = malloc(sizeof *krpingmsg, M_DEVBUF, M_WAITOK|M_ZERO); + if (!krpingmsg) { + uprintf("Could not malloc mem!\n"); + return ENOMEM; + } + + cp = krpingmsg->msg; + while (uio->uio_resid) { + amt = MIN(uio->uio_resid, remain); + if (amt == 0) + break; + + /* Copy the string in from user memory to kernel memory */ + err = uiomove(cp, amt, uio); + if (err) { + uprintf("Write failed: bad address!\n"); + return err; + } + cp += amt; + remain -= amt; + } + + if (uio->uio_resid != 0) { + uprintf("Message too big. max size is %d!\n", BUFFERSIZE); + return EMSGSIZE; + } + + /* null terminate and remove the \n */ + cp--; + *cp = 0; + krpingmsg->len = (unsigned long)(cp - krpingmsg->msg); + uprintf("krping: write string = |%s|\n", krpingmsg->msg); + err = krping_doit(krpingmsg->msg); + free(krpingmsg, M_DEVBUF); + return(err); +} + +MODULE_DEPEND(krping, rdma_core, 1, 1, 1); +MODULE_DEPEND(krping, rdma_cma, 1, 1, 1); +DEV_MODULE(krping,krping_loader,NULL); diff --git a/sys/contrib/rdma/rdma_addr.c b/sys/contrib/rdma/rdma_addr.c new file mode 100644 index 000000000000..e052b807a7d9 --- /dev/null +++ b/sys/contrib/rdma/rdma_addr.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. + * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. 
+ *
+ */

+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/condvar.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/module.h>
+
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_var.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+
+#include <net80211/ieee80211_freebsd.h>
+
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+
+#include <contrib/rdma/ib_addr.h>
+
+struct addr_req {
+	TAILQ_ENTRY(addr_req) entry;
+	struct sockaddr src_addr;
+	struct sockaddr dst_addr;
+	struct rdma_dev_addr *addr;
+	struct rdma_addr_client *client;
+	void *context;
+	void (*callback)(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *addr, void *context);
+	unsigned long timeout;
+	int status;
+};
+
+static void process_req(void *ctx, int pending);
+
+static struct mtx lock;
+
+static TAILQ_HEAD(addr_req_list, addr_req) req_list;
+static struct task addr_task;
+static struct taskqueue *addr_taskq;
+static struct callout addr_ch;
+static eventhandler_tag route_event_tag;
+
+static void addr_timeout(void *arg)
+{
+	taskqueue_enqueue(addr_taskq, &addr_task);
+}
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+	mtx_init(&client->lock, "rdma_addr client lock", NULL, MTX_DUPOK|MTX_DEF);
+	cv_init(&client->comp, "rdma_addr cv");
+	client->refcount = 1;
+}
+
+static inline void put_client(struct rdma_addr_client *client)
+{
+	mtx_lock(&client->lock);
+	if (--client->refcount == 0) {
+		cv_broadcast(&client->comp);
+	}
+	mtx_unlock(&client->lock);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+	put_client(client);
+	mtx_lock(&client->lock);
+	if (client->refcount) {
+		cv_wait(&client->comp, &client->lock);
+	}
+	mtx_unlock(&client->lock);
+}
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
+		     const unsigned char *dst_dev_addr)
+{
+	dev_addr->dev_type = RDMA_NODE_RNIC;
+	memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), MAX_ADDR_LEN);
+	memcpy(dev_addr->broadcast, dev->if_broadcastaddr, MAX_ADDR_LEN);
+	if (dst_dev_addr)
+		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+	return 0;
+}
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+	struct ifaddr *ifa;
+	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+	uint16_t port = sin->sin_port;
+
+	sin->sin_port = 0;
+	ifa = ifa_ifwithaddr(addr);
+	sin->sin_port = port;
+	if (!ifa)
+		return (EADDRNOTAVAIL);
+	return rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL);
+}
+
+static void queue_req(struct addr_req *req)
+{
+	struct addr_req *tmp_req = NULL;
+
+	mtx_lock(&lock);
+	TAILQ_FOREACH_REVERSE(tmp_req, &req_list, addr_req_list, entry)
+		if (time_after_eq(req->timeout, tmp_req->timeout))
+			break;
+
+	if (tmp_req)
+		TAILQ_INSERT_AFTER(&req_list, tmp_req, req, entry);
+	else
+		TAILQ_INSERT_TAIL(&req_list, req, entry);
+
+	if (TAILQ_FIRST(&req_list) == req)
+		callout_reset(&addr_ch, req->timeout - ticks, addr_timeout, NULL);
+	mtx_unlock(&lock);
+}
+
+#ifdef needed
+static void addr_send_arp(struct sockaddr_in *dst_in)
+{
+	struct route iproute;
+	struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst;
+	char dmac[ETHER_ADDR_LEN];
+
+	bzero(&iproute, sizeof iproute);
+	*dst = *dst_in;
+
+	rtalloc(&iproute);
+	if (iproute.ro_rt == NULL)
+		return;
+
+	arpresolve(iproute.ro_rt->rt_ifp,
iproute.ro_rt, NULL, + rt_key(iproute.ro_rt), dmac); + + RTFREE(iproute.ro_rt); +} +#endif + +static int addr_resolve_remote(struct sockaddr_in *src_in, + struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr) +{ + int ret = 0; + struct route iproute; + struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst; + char dmac[ETHER_ADDR_LEN]; + + bzero(&iproute, sizeof iproute); + *dst = *dst_in; + + rtalloc(&iproute); + if (iproute.ro_rt == NULL) { + ret = EHOSTUNREACH; + goto out; + } + + /* If the device does ARP internally, return 'done' */ + if (iproute.ro_rt->rt_ifp->if_flags & IFF_NOARP) { + rdma_copy_addr(addr, iproute.ro_rt->rt_ifp, NULL); + goto put; + } + ret = arpresolve(iproute.ro_rt->rt_ifp, iproute.ro_rt, NULL, + rt_key(iproute.ro_rt), dmac); + if (ret) { + goto put; + } + + if (!src_in->sin_addr.s_addr) { + src_in->sin_len = sizeof *src_in; + src_in->sin_family = dst_in->sin_family; + src_in->sin_addr.s_addr = ((struct sockaddr_in *)iproute.ro_rt->rt_ifa->ifa_addr)->sin_addr.s_addr; + } + + ret = rdma_copy_addr(addr, iproute.ro_rt->rt_ifp, dmac); +put: + RTFREE(iproute.ro_rt); +out: + return ret; +} + +static void process_req(void *ctx, int pending) +{ + struct addr_req *req, *tmp_req; + struct sockaddr_in *src_in, *dst_in; + TAILQ_HEAD(, addr_req) done_list; + + TAILQ_INIT(&done_list); + + mtx_lock(&lock); + TAILQ_FOREACH_SAFE(req, &req_list, entry, tmp_req) { + if (req->status == EWOULDBLOCK) { + src_in = (struct sockaddr_in *) &req->src_addr; + dst_in = (struct sockaddr_in *) &req->dst_addr; + req->status = addr_resolve_remote(src_in, dst_in, + req->addr); + if (req->status && time_after_eq(ticks, req->timeout)) + req->status = ETIMEDOUT; + else if (req->status == EWOULDBLOCK) + continue; + } + TAILQ_REMOVE(&req_list, req, entry); + TAILQ_INSERT_TAIL(&done_list, req, entry); + } + + if (!TAILQ_EMPTY(&req_list)) { + req = TAILQ_FIRST(&req_list); + callout_reset(&addr_ch, req->timeout - ticks, addr_timeout, + NULL); + } + mtx_unlock(&lock); + + TAILQ_FOREACH_SAFE(req, &done_list, entry, tmp_req) { + TAILQ_REMOVE(&done_list, req, entry); + req->callback(req->status, &req->src_addr, req->addr, + req->context); + put_client(req->client); + free(req, M_DEVBUF); + } +} + +int rdma_resolve_ip(struct rdma_addr_client *client, + struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct rdma_dev_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *addr, void *context), + void *context) +{ + struct sockaddr_in *src_in, *dst_in; + struct addr_req *req; + int ret = 0; + + req = malloc(sizeof *req, M_DEVBUF, M_NOWAIT); + if (!req) + return (ENOMEM); + memset(req, 0, sizeof *req); + + if (src_addr) + memcpy(&req->src_addr, src_addr, ip_addr_size(src_addr)); + memcpy(&req->dst_addr, dst_addr, ip_addr_size(dst_addr)); + req->addr = addr; + req->callback = callback; + req->context = context; + req->client = client; + mtx_lock(&client->lock); + client->refcount++; + mtx_unlock(&client->lock); + + src_in = (struct sockaddr_in *) &req->src_addr; + dst_in = (struct sockaddr_in *) &req->dst_addr; + + req->status = addr_resolve_remote(src_in, dst_in, addr); + + switch (req->status) { + case 0: + req->timeout = ticks; + queue_req(req); + break; + case EWOULDBLOCK: + req->timeout = msecs_to_ticks(timeout_ms) + ticks; + queue_req(req); +#ifdef needed + addr_send_arp(dst_in); +#endif + break; + default: + ret = req->status; + mtx_lock(&client->lock); + client->refcount--; + mtx_unlock(&client->lock); + free(req, M_DEVBUF); + break; 
+ } + return ret; +} + +void rdma_addr_cancel(struct rdma_dev_addr *addr) +{ + struct addr_req *req, *tmp_req; + + mtx_lock(&lock); + TAILQ_FOREACH_SAFE(req, &req_list, entry, tmp_req) { + if (req->addr == addr) { + req->status = ECANCELED; + req->timeout = ticks; + TAILQ_REMOVE(&req_list, req, entry); + TAILQ_INSERT_HEAD(&req_list, req, entry); + callout_reset(&addr_ch, req->timeout - ticks, addr_timeout, NULL); + break; + } + } + mtx_unlock(&lock); +} + +static void +route_event_arp_update(void *unused, struct rtentry *rt0, uint8_t *enaddr, + struct sockaddr *sa) +{ + callout_stop(&addr_ch); + taskqueue_enqueue(addr_taskq, &addr_task); +} + +static int addr_init(void) +{ + TAILQ_INIT(&req_list); + mtx_init(&lock, "rdma_addr req_list lock", NULL, MTX_DEF); + + addr_taskq = taskqueue_create("rdma_addr_taskq", M_NOWAIT, + taskqueue_thread_enqueue, &addr_taskq); + if (addr_taskq == NULL) { + printf("failed to allocate rdma_addr taskqueue\n"); + return (ENOMEM); + } + taskqueue_start_threads(&addr_taskq, 1, PI_NET, "rdma_addr taskq"); + TASK_INIT(&addr_task, 0, process_req, NULL); + + callout_init(&addr_ch, TRUE); + + route_event_tag = EVENTHANDLER_REGISTER(route_arp_update_event, + route_event_arp_update, NULL, EVENTHANDLER_PRI_ANY); + + return 0; +} + +static void addr_cleanup(void) +{ + EVENTHANDLER_DEREGISTER(route_event_arp_update, route_event_tag); + callout_stop(&addr_ch); + taskqueue_drain(addr_taskq, &addr_task); + taskqueue_free(addr_taskq); +} + +static int +addr_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + printf("Loading rdma_addr.\n"); + + addr_init(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("Unloading rdma_addr.\n"); + addr_cleanup(); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data = { + "rdma_addr", + addr_load, + 0 +}; + +MODULE_VERSION(rdma_addr, 1); +DECLARE_MODULE(rdma_addr, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/contrib/rdma/rdma_cache.c b/sys/contrib/rdma/rdma_cache.c new file mode 100644 index 000000000000..dced8ebd0110 --- /dev/null +++ b/sys/contrib/rdma/rdma_cache.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/taskqueue.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/mutex.h> +#include <sys/module.h> +#include <sys/syslog.h> + + +#ifdef needed +#include <sys/condvar.h> +#include <sys/socket.h> +#include <sys/condvar.h> +#endif + +#include <contrib/rdma/ib_cache.h> + +#include "core_priv.h" + +struct ib_pkey_cache { + int table_len; + u16 table[0]; +}; + +struct ib_gid_cache { + int table_len; + union ib_gid table[0]; +}; + +struct ib_update_work { + struct task task; + struct ib_device *device; + u8 port_num; +}; + +static inline int start_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; +} + +static inline int end_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? + 0 : device->phys_port_cnt; +} + +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid) +{ + struct ib_gid_cache *cache; + int ret = 0; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + mtx_lock(&device->cache.lock); + + cache = device->cache.gid_cache[port_num - start_port(device)]; + + if (index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *gid = cache->table[index]; + + mtx_unlock(&device->cache.lock); + + return ret; +} + +int ib_find_cached_gid(struct ib_device *device, + union ib_gid *gid, + u8 *port_num, + u16 *index) +{ + struct ib_gid_cache *cache; + int p, i; + int ret = -ENOENT; + + *port_num = -1; + if (index) + *index = -1; + + mtx_lock(&device->cache.lock); + + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + cache = device->cache.gid_cache[p]; + for (i = 0; i < cache->table_len; ++i) { + if (!memcmp(gid, &cache->table[i], 6)) { /* XXX */ + *port_num = p + start_port(device); + if (index) + *index = i; + ret = 0; + goto found; + } + } + } +found: + mtx_unlock(&device->cache.lock); + + return ret; +} + +int ib_get_cached_pkey(struct ib_device *device, + u8 port_num, + int index, + u16 *pkey) +{ + struct ib_pkey_cache *cache; + int ret = 0; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + mtx_lock(&device->cache.lock); + + cache = device->cache.pkey_cache[port_num - start_port(device)]; + + if (index < 0 || index >= cache->table_len) + ret = -EINVAL; + else + *pkey = cache->table[index]; + + mtx_unlock(&device->cache.lock); + + return ret; +} + +int ib_find_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + int i; + int ret = -ENOENT; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + mtx_lock(&device->cache.lock); + + cache = device->cache.pkey_cache[port_num - start_port(device)]; + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + *index = i; + ret = 0; + break; + } + + mtx_unlock(&device->cache.lock); + + return ret; +} + +int ib_get_cached_lmc(struct ib_device *device, + u8 
port_num, + u8 *lmc) +{ + int ret = 0; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + mtx_lock(&device->cache.lock); + *lmc = device->cache.lmc_cache[port_num - start_port(device)]; + mtx_unlock(&device->cache.lock); + + return ret; +} + +static void ib_cache_update(struct ib_device *device, + u8 port) +{ + struct ib_port_attr *tprops = NULL; + struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; + struct ib_gid_cache *gid_cache = NULL, *old_gid_cache; + int i; + int ret; + + tprops = malloc(sizeof *tprops, M_DEVBUF, M_NOWAIT); + if (!tprops) + return; + + ret = ib_query_port(device, port, tprops); + if (ret) { + log(LOG_WARNING, "ib_query_port failed (%d) for %s\n", + ret, device->name); + goto err; + } + + pkey_cache = malloc(sizeof *pkey_cache + tprops->pkey_tbl_len * + sizeof *pkey_cache->table, M_DEVBUF, M_NOWAIT); + if (!pkey_cache) + goto err; + + pkey_cache->table_len = tprops->pkey_tbl_len; + + gid_cache = malloc(sizeof *gid_cache + tprops->gid_tbl_len * + sizeof *gid_cache->table, M_DEVBUF, M_NOWAIT); + if (!gid_cache) + goto err; + + gid_cache->table_len = tprops->gid_tbl_len; + + for (i = 0; i < pkey_cache->table_len; ++i) { + ret = ib_query_pkey(device, port, i, pkey_cache->table + i); + if (ret) { + log(LOG_WARNING, "ib_query_pkey failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + + for (i = 0; i < gid_cache->table_len; ++i) { + ret = ib_query_gid(device, port, i, gid_cache->table + i); + if (ret) { + log(LOG_WARNING, "ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } + } + + mtx_lock(&device->cache.lock); + + old_pkey_cache = device->cache.pkey_cache[port - start_port(device)]; + old_gid_cache = device->cache.gid_cache [port - start_port(device)]; + + device->cache.pkey_cache[port - start_port(device)] = pkey_cache; + device->cache.gid_cache [port - start_port(device)] = gid_cache; + + device->cache.lmc_cache[port - start_port(device)] = tprops->lmc; + + mtx_unlock(&device->cache.lock); + + free(old_pkey_cache, M_DEVBUF); + free(old_gid_cache, M_DEVBUF); + free(tprops, M_DEVBUF); + return; + +err: + free(pkey_cache, M_DEVBUF); + free(gid_cache, M_DEVBUF); + free(tprops, M_DEVBUF); +} + +static void ib_cache_task(void *context, int pending) +{ + struct ib_update_work *work = context; + + ib_cache_update(work->device, work->port_num); + free(work, M_DEVBUF); +} + +static void ib_cache_event(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct ib_update_work *work; + + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER) { + work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT); + if (work) { + TASK_INIT(&work->task, 0, ib_cache_task, work); + work->device = event->device; + work->port_num = event->element.port_num; + taskqueue_enqueue(taskqueue_thread, &work->task); + } + } +} + +static void ib_cache_setup_one(struct ib_device *device) +{ + int p; + + mtx_init(&device->cache.lock, "ib device cache", NULL, + MTX_DUPOK|MTX_DEF); + + device->cache.pkey_cache = + malloc(sizeof *device->cache.pkey_cache * + (end_port(device) - start_port(device) + 1), M_DEVBUF, + M_NOWAIT); + device->cache.gid_cache = + malloc(sizeof *device->cache.gid_cache * + (end_port(device) - start_port(device) + 1), M_DEVBUF, + M_NOWAIT); + + device->cache.lmc_cache = malloc(sizeof 
*device->cache.lmc_cache * + (end_port(device) - + start_port(device) + 1), + M_DEVBUF, M_NOWAIT); + + if (!device->cache.pkey_cache || !device->cache.gid_cache || + !device->cache.lmc_cache) { + log(LOG_WARNING, "Couldn't allocate cache " + "for %s\n", device->name); + goto err; + } + + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + device->cache.pkey_cache[p] = NULL; + device->cache.gid_cache [p] = NULL; + ib_cache_update(device, p + start_port(device)); + } + + INIT_IB_EVENT_HANDLER(&device->cache.event_handler, + device, ib_cache_event); + if (ib_register_event_handler(&device->cache.event_handler)) + goto err_cache; + + return; + +err_cache: + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + free(device->cache.pkey_cache[p], M_DEVBUF); + free(device->cache.gid_cache[p], M_DEVBUF); + } + +err: + free(device->cache.pkey_cache, M_DEVBUF); + free(device->cache.gid_cache, M_DEVBUF); + free(device->cache.lmc_cache, M_DEVBUF); +} + +static void ib_cache_cleanup_one(struct ib_device *device) +{ + int p; + + ib_unregister_event_handler(&device->cache.event_handler); +#ifdef XXX + flush_scheduled_work(); +#endif + + for (p = 0; p <= end_port(device) - start_port(device); ++p) { + free(device->cache.pkey_cache[p], M_DEVBUF); + free(device->cache.gid_cache[p], M_DEVBUF); + } + + free(device->cache.pkey_cache, M_DEVBUF); + free(device->cache.gid_cache, M_DEVBUF); + free(device->cache.lmc_cache, M_DEVBUF); +} + +static struct ib_client cache_client = { + .name = "cache", + .add = ib_cache_setup_one, + .remove = ib_cache_cleanup_one +}; + +int ib_cache_setup(void) +{ + return ib_register_client(&cache_client); +} + +void ib_cache_cleanup(void) +{ + ib_unregister_client(&cache_client); +} diff --git a/sys/contrib/rdma/rdma_cm.h b/sys/contrib/rdma/rdma_cm.h new file mode 100644 index 000000000000..1b30d04435d5 --- /dev/null +++ b/sys/contrib/rdma/rdma_cm.h @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. + * + * $FreeBSD$ + */ + +#if !defined(RDMA_CM_H) +#define RDMA_CM_H + +#include <sys/socket.h> +#include <netinet/in.h> +#include <contrib/rdma/ib_addr.h> +#include <contrib/rdma/ib_sa.h> + +/* + * Upon receiving a device removal event, users must destroy the associated + * RDMA identifier and release all resources allocated with the device. 
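 *
 * As an illustrative sketch only (the case below is hypothetical and
 * my_teardown() is not part of this API), an event handler honors this with:
 *
 *	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 *		my_teardown(id->context);	(release verbs objects on id->device)
 *		return (1);			(a non-zero return destroys the id)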
+ */ +enum rdma_cm_event_type { + RDMA_CM_EVENT_ADDR_RESOLVED, + RDMA_CM_EVENT_ADDR_ERROR, + RDMA_CM_EVENT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ROUTE_ERROR, + RDMA_CM_EVENT_CONNECT_REQUEST, + RDMA_CM_EVENT_CONNECT_RESPONSE, + RDMA_CM_EVENT_CONNECT_ERROR, + RDMA_CM_EVENT_UNREACHABLE, + RDMA_CM_EVENT_REJECTED, + RDMA_CM_EVENT_ESTABLISHED, + RDMA_CM_EVENT_DISCONNECTED, + RDMA_CM_EVENT_DEVICE_REMOVAL, + RDMA_CM_EVENT_MULTICAST_JOIN, + RDMA_CM_EVENT_MULTICAST_ERROR +}; + +enum rdma_port_space { + RDMA_PS_SDP = 0x0001, + RDMA_PS_IPOIB= 0x0002, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, + RDMA_PS_SCTP = 0x0183 +}; + +struct rdma_addr { + struct sockaddr src_addr; + u8 src_pad[sizeof(struct sockaddr_in6) - + sizeof(struct sockaddr)]; + struct sockaddr dst_addr; + u8 dst_pad[sizeof(struct sockaddr_in6) - + sizeof(struct sockaddr)]; + struct rdma_dev_addr dev_addr; +}; + +struct rdma_route { + struct rdma_addr addr; + struct ib_sa_path_rec *path_rec; + int num_paths; +}; + +struct rdma_conn_param { + const void *private_data; + u8 private_data_len; + u8 responder_resources; + u8 initiator_depth; + u8 flow_control; + u8 retry_count; /* ignored when accepting */ + u8 rnr_retry_count; + /* Fields below ignored if a QP is created on the rdma_cm_id. */ + u8 srq; + u32 qp_num; +}; + +struct rdma_ud_param { + const void *private_data; + u8 private_data_len; + struct ib_ah_attr ah_attr; + u32 qp_num; + u32 qkey; +}; + +struct rdma_cm_event { + enum rdma_cm_event_type event; + int status; + union { + struct rdma_conn_param conn; + struct rdma_ud_param ud; + } param; +}; + +struct rdma_cm_id; + +/** + * rdma_cm_event_handler - Callback used to report user events. + * + * Notes: Users may not call rdma_destroy_id from this callback to destroy + * the passed in id, or a corresponding listen id. Returning a + * non-zero value from the callback will destroy the passed in id. + */ +typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id, + struct rdma_cm_event *event); + +struct rdma_cm_id { + struct ib_device *device; + void *context; + struct ib_qp *qp; + rdma_cm_event_handler event_handler; + struct rdma_route route; + enum rdma_port_space ps; + u8 port_num; +}; + +/** + * rdma_create_id - Create an RDMA identifier. + * + * @event_handler: User callback invoked to report events associated with the + * returned rdma_id. + * @context: User specified context associated with the id. + * @ps: RDMA port space. + */ +struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps); + +/** + * rdma_destroy_id - Destroys an RDMA identifier. + * + * @id: RDMA identifier. + * + * Note: calling this function has the effect of canceling in-flight + * asynchronous operations associated with the id. + */ +void rdma_destroy_id(struct rdma_cm_id *id); + +/** + * rdma_bind_addr - Bind an RDMA identifier to a source address and + * associated RDMA device, if needed. + * + * @id: RDMA identifier. + * @addr: Local address information. Wildcard values are permitted. + * + * This associates a source address with the RDMA identifier before calling + * rdma_listen. If a specific local address is given, the RDMA identifier will + * be bound to a local RDMA device. + */ +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_resolve_addr - Resolve destination and optional source addresses + * from IP addresses to an RDMA address. If successful, the specified + * rdma_cm_id will be bound to a local device. + * + * @id: RDMA identifier. 
+ * @src_addr: Source address information. This parameter may be NULL. + * @dst_addr: Destination address information. + * @timeout_ms: Time to wait for resolution to complete. + */ +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms); + +/** + * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier + * into route information needed to establish a connection. + * + * This is called on the client side of a connection. + * Users must have first called rdma_resolve_addr to resolve a dst_addr + * into an RDMA address before calling this routine. + */ +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); + +/** + * rdma_create_qp - Allocate a QP and associate it with the specified RDMA + * identifier. + * + * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA + * through their states. + */ +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA + * identifier. + * + * Users must destroy any QP associated with an RDMA identifier before + * destroying the RDMA ID. + */ +void rdma_destroy_qp(struct rdma_cm_id *id); + +/** + * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning + * to a specified QP state. + * @id: Communication identifier associated with the QP attributes to + * initialize. + * @qp_attr: On input, specifies the desired QP state. On output, the + * mandatory and desired optional attributes will be set in order to + * modify the QP to the specified state. + * @qp_attr_mask: The QP attribute mask that may be used to transition the + * QP to the specified state. + * + * Users must set the @qp_attr->qp_state to the desired QP state. This call + * will set all required attributes for the given transition, along with + * known optional attributes. Users may override the attributes returned from + * this call before calling ib_modify_qp. + * + * Users that wish to have their QP automatically transitioned through its + * states can associate a QP with the rdma_cm_id by calling rdma_create_qp(). + */ +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +/** + * rdma_connect - Initiate an active connection request. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * + * Users must have resolved a route for the rdma_cm_id to connect with + * by having called rdma_resolve_route before calling this routine. + * + * This call will either connect to a remote QP or obtain remote QP + * information for unconnected rdma_cm_id's. The actual operation is + * based on the rdma_cm_id's port space. + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_listen - This function is called by the passive side to + * listen for incoming connection requests. + * + * Users must have bound the rdma_cm_id to a local address by calling + * rdma_bind_addr before calling this routine. + */ +int rdma_listen(struct rdma_cm_id *id, int backlog); + +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. If accepting a connection + * response, this parameter must be NULL. 
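+ *
+ * A minimal sketch (the parameter values are illustrative, not required) of
+ * accepting from within a RDMA_CM_EVENT_CONNECT_REQUEST handler:
+ *
+ *	struct rdma_conn_param param;
+ *
+ *	memset(&param, 0, sizeof param);
+ *	param.responder_resources = 1;
+ *	param.initiator_depth = 1;
+ *	err = rdma_accept(id, &param);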
+ * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + * + * In the case of error, a reject message is sent to the remote side and the + * state of the qp associated with the id is modified to error, such that any + * previously posted receive buffers would be flushed. + */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_notify - Notifies the RDMA CM of an asynchronous event that has + * occurred on the connection. + * @id: Connection identifier to transition to established. + * @event: Asynchronous event. + * + * This routine should be invoked by users to notify the CM of relevant + * communication events. Events that should be reported to the CM and + * when to report them are: + * + * IB_EVENT_COMM_EST - Used when a message is received on a connected + * QP before an RTU has been received. + */ +int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event); + +/** + * rdma_reject - Called to reject a connection request or response. + */ +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len); + +/** + * rdma_disconnect - This function disconnects the associated QP and + * transitions it into the error state. + */ +int rdma_disconnect(struct rdma_cm_id *id); + +/** + * rdma_join_multicast - Join the multicast group specified by the given + * address. + * @id: Communication identifier associated with the request. + * @addr: Multicast address identifying the group to join. + * @context: User-defined context associated with the join request, returned + * to the user through the private_data pointer in multicast events. + */ +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context); + +/** + * rdma_leave_multicast - Leave the multicast group specified by the given + * address. + */ +void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); + +#endif /* RDMA_CM_H */ diff --git a/sys/contrib/rdma/rdma_cm_ib.h b/sys/contrib/rdma/rdma_cm_ib.h new file mode 100644 index 000000000000..b69f66613bd9 --- /dev/null +++ b/sys/contrib/rdma/rdma_cm_ib.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2006 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. 
+ *
+ * $FreeBSD$
+ */

+#if !defined(RDMA_CM_IB_H)
+#define RDMA_CM_IB_H
+
+#include <contrib/rdma/rdma_cm.h>
+
+/**
+ * rdma_set_ib_paths - Manually sets the path records used to establish a
+ *   connection.
+ * @id: Connection identifier associated with the request.
+ * @path_rec: Reference to the path records.
+ * @num_paths: Number of path records referenced by @path_rec.
+ *
+ * This call permits a user to specify routing information for rdma_cm_ids
+ * bound to InfiniBand devices. It is called on the client side of a
+ * connection and replaces the call to rdma_resolve_route.
+ */
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+		      struct ib_sa_path_rec *path_rec, int num_paths);
+
+/* Global qkey for UDP QPs and multicast groups. */
+#define RDMA_UDP_QKEY 0x01234567
+
+#endif /* RDMA_CM_IB_H */
diff --git a/sys/contrib/rdma/rdma_cma.c b/sys/contrib/rdma/rdma_cma.c
new file mode 100644
index 000000000000..8dddf6ce157d
--- /dev/null
+++ b/sys/contrib/rdma/rdma_cma.c
@@ -0,0 +1,2998 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ *    available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ *    available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ *    copy of which is available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/condvar.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/socket.h> +#include <sys/module.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/priv.h> +#include <sys/syslog.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> + +#include <contrib/rdma/rdma_cm.h> +#include <contrib/rdma/ib_cache.h> +#include <contrib/rdma/ib_cm.h> +#include <contrib/rdma/ib_sa.h> +#include <contrib/rdma/iw_cm.h> + +#define CMA_CM_RESPONSE_TIMEOUT 20 +#define CMA_MAX_CM_RETRIES 15 + +static void cma_add_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device); + +static struct ib_client cma_client = { + .name = "cma", + .add = cma_add_one, + .remove = cma_remove_one +}; + +#ifdef IB_SUPPORTED +static struct ib_sa_client sa_client; +#endif +static struct rdma_addr_client addr_client; +static TAILQ_HEAD(, cma_device) dev_list; +static LIST_HEAD(, rdma_id_private) listen_any_list; +static struct mtx lock; +static struct taskqueue *cma_wq; +static DEFINE_KVL(sdp_ps); +static DEFINE_KVL(tcp_ps); +static DEFINE_KVL(udp_ps); +static DEFINE_KVL(ipoib_ps); +static int next_port; + +struct cma_device { + struct ib_device *device; + struct mtx lock; + struct cv comp; + int refcount; + + LIST_HEAD(, rdma_id_private) id_list; + TAILQ_ENTRY(cma_device) list; +}; + +enum cma_state { + CMA_IDLE, + CMA_ADDR_QUERY, + CMA_ADDR_RESOLVED, + CMA_ROUTE_QUERY, + CMA_ROUTE_RESOLVED, + CMA_CONNECT, + CMA_DISCONNECT, + CMA_ADDR_BOUND, + CMA_LISTEN, + CMA_DEVICE_REMOVAL, + CMA_DESTROYING +}; + +struct rdma_bind_list { + struct kvl *ps; + TAILQ_HEAD(, rdma_id_private) owners; + unsigned short port; +}; + +/* + * Device removal can occur at anytime, so we need extra handling to + * serialize notifying the user of device removal with other callbacks. + * We do this by disabling removal notification while a callback is in process, + * and reporting it after the callback completes. 
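+ *
+ * Concretely: cma_disable_remove() bumps id_priv->dev_remove before a
+ * callback runs, cma_enable_remove() drops it afterwards, and the removal
+ * path sleeps on wait_remove until the count drains to zero.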
+ */ +struct rdma_id_private { + struct rdma_cm_id id; + + struct rdma_bind_list *bind_list; + struct socket *so; + TAILQ_ENTRY(rdma_id_private) node; + LIST_ENTRY(rdma_id_private) list; /* listen_any_list or cma_dev.list */ + LIST_HEAD(, rdma_id_private) listen_list; /* per-device listens */ + LIST_ENTRY(rdma_id_private) listen_entry; + struct cma_device *cma_dev; +#ifdef IB_SUPPORTED + LIST_HEAD(, cma_multicast) mc_list; +#endif + enum cma_state state; + struct mtx lock; + struct cv comp; + int refcount; + struct cv wait_remove; + int dev_remove; + + int backlog; + int timeout_ms; + struct ib_sa_query *query; + int query_id; + union { + struct ib_cm_id *ib; + struct iw_cm_id *iw; + } cm_id; + + u32 seq_num; + u32 qkey; + u32 qp_num; + u8 srq; +}; + +#ifdef IB_SUPPORTED +struct cma_multicast { + struct rdma_id_private *id_priv; + union { + struct ib_sa_multicast *ib; + } multicast; + struct list_head list; + void *context; + struct sockaddr addr; + u8 pad[sizeof(struct sockaddr_in6) - + sizeof(struct sockaddr)]; +}; +#endif + +struct cma_work { + struct task task; + struct rdma_id_private *id; + enum cma_state old_state; + enum cma_state new_state; + struct rdma_cm_event event; +}; + +union cma_ip_addr { + struct in6_addr ip6; + struct { + __u32 pad[3]; + __u32 addr; + } ip4; +}; + +struct cma_hdr { + u8 cma_version; + u8 ip_version; /* IP version: 7:4 */ + __u16 port; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; +}; + +struct sdp_hh { + u8 bsdh[16]; + u8 sdp_version; /* Major version: 7:4 */ + u8 ip_version; /* IP version: 7:4 */ + u8 sdp_specific1[10]; + __u16 port; + __u16 sdp_specific2; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; +}; + +struct sdp_hah { + u8 bsdh[16]; + u8 sdp_version; +}; + +#define CMA_VERSION 0x00 +#define SDP_MAJ_VERSION 0x2 + +static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp) +{ + int ret; + + mtx_lock(&id_priv->lock); + ret = (id_priv->state == comp); + mtx_unlock(&id_priv->lock); + return ret; +} + +static int cma_comp_exch(struct rdma_id_private *id_priv, + enum cma_state comp, enum cma_state exch) +{ + int ret; + + mtx_lock(&id_priv->lock); + if ((ret = (id_priv->state == comp))) + id_priv->state = exch; + mtx_unlock(&id_priv->lock); + return ret; +} + +static enum cma_state cma_exch(struct rdma_id_private *id_priv, + enum cma_state exch) +{ + enum cma_state old; + + mtx_lock(&id_priv->lock); + old = id_priv->state; + id_priv->state = exch; + mtx_unlock(&id_priv->lock); + return old; +} + +static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) +{ + return hdr->ip_version >> 4; +} + +static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) +{ + hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); +} + +static inline u8 sdp_get_majv(u8 sdp_version) +{ + return sdp_version >> 4; +} + +static inline u8 sdp_get_ip_ver(struct sdp_hh *hh) +{ + return hh->ip_version >> 4; +} + +static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) +{ + hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); +} + +static inline int cma_is_ud_ps(enum rdma_port_space ps) +{ + return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB); +} + +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + mtx_lock(&cma_dev->lock); + cma_dev->refcount++; + mtx_unlock(&cma_dev->lock); + id_priv->cma_dev = cma_dev; + id_priv->id.device = cma_dev->device; + LIST_INSERT_HEAD(&cma_dev->id_list, id_priv, list); +} + +static inline void cma_deref_dev(struct cma_device *cma_dev) +{ + 
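+	/*
+	 * Drop the device reference taken in cma_attach_to_dev(); the
+	 * broadcast wakes anyone waiting on cma_dev->comp for the device
+	 * to go idle.
+	 */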
mtx_lock(&cma_dev->lock); + if (--cma_dev->refcount == 0) + cv_broadcast(&cma_dev->comp); + mtx_unlock(&cma_dev->lock); +} + +static void cma_detach_from_dev(struct rdma_id_private *id_priv) +{ + LIST_REMOVE(id_priv, list); + cma_deref_dev(id_priv->cma_dev); + id_priv->cma_dev = NULL; +} + +#ifdef IB_SUPPORTED +static int cma_set_qkey(struct ib_device *device, u8 port_num, + enum rdma_port_space ps, + struct rdma_dev_addr *dev_addr, u32 *qkey) +{ + struct ib_sa_mcmember_rec rec; + int ret = 0; + + switch (ps) { + case RDMA_PS_UDP: + *qkey = RDMA_UDP_QKEY; + break; + case RDMA_PS_IPOIB: + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(device, port_num, &rec.mgid, &rec); + *qkey = be32_to_cpu(rec.qkey); + break; + default: + break; + } + return ret; +} +#endif + +static int cma_acquire_dev(struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct cma_device *cma_dev; + union ib_gid gid; + int ret = ENODEV; + + switch (rdma_node_get_transport(dev_addr->dev_type)) { +#ifdef IB_SUPPORTED + case RDMA_TRANSPORT_IB: + ib_addr_get_sgid(dev_addr, &gid); + break; +#endif + case RDMA_TRANSPORT_IWARP: + iw_addr_get_sgid(dev_addr, &gid); + break; + default: + return (ENODEV); + } + + TAILQ_FOREACH(cma_dev, &dev_list, list) { + ret = ib_find_cached_gid(cma_dev->device, &gid, + &id_priv->id.port_num, NULL); + if (!ret) { +#ifdef IB_SUPPORTED + ret = cma_set_qkey(cma_dev->device, + id_priv->id.port_num, + id_priv->id.ps, dev_addr, + &id_priv->qkey); + if (!ret) +#endif + cma_attach_to_dev(id_priv, cma_dev); + break; + } + } + return ret; +} + +static void cma_deref_id(struct rdma_id_private *id_priv) +{ + mtx_lock(&id_priv->lock); + if (--id_priv->refcount == 0) { + cv_broadcast(&id_priv->comp); + } + mtx_unlock(&id_priv->lock); +} + +static int cma_disable_remove(struct rdma_id_private *id_priv, + enum cma_state state) +{ + int ret; + + mtx_lock(&id_priv->lock); + if (id_priv->state == state) { + id_priv->dev_remove++; + ret = 0; + } else + ret = EINVAL; + mtx_unlock(&id_priv->lock); + return ret; +} + +static void cma_enable_remove(struct rdma_id_private *id_priv) +{ + mtx_lock(&id_priv->lock); + if (--id_priv->dev_remove == 0) + cv_broadcast(&id_priv->wait_remove); + mtx_unlock(&id_priv->lock); +} + +static int cma_has_cm_dev(struct rdma_id_private *id_priv) +{ + return (id_priv->id.device && id_priv->cm_id.ib); +} + +struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, + void *context, enum rdma_port_space ps) +{ + struct rdma_id_private *id_priv; + + id_priv = malloc(sizeof *id_priv, M_DEVBUF, M_NOWAIT); + if (!id_priv) + return ERR_PTR(-ENOMEM); + bzero(id_priv, sizeof *id_priv); + + id_priv->state = CMA_IDLE; + id_priv->id.context = context; + id_priv->id.event_handler = event_handler; + id_priv->id.ps = ps; + mtx_init(&id_priv->lock, "rdma_cm_id_priv", NULL, MTX_DUPOK|MTX_DEF); + cv_init(&id_priv->comp, "rdma_cm_id_priv"); + id_priv->refcount = 1; + cv_init(&id_priv->wait_remove, "id priv wait remove"); + LIST_INIT(&id_priv->listen_list); + arc4rand(&id_priv->seq_num, sizeof id_priv->seq_num, 0); + + return &id_priv->id; +} + +static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = 
ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); + + return ret; +} + +static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(qp, &qp_attr, qp_attr_mask); +} + +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct rdma_id_private *id_priv; + struct ib_qp *qp; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device != pd->device) + return (EINVAL); + + qp = ib_create_qp(pd, qp_init_attr); + if (IS_ERR(qp)) + return PTR_ERR(qp); + if (cma_is_ud_ps(id_priv->id.ps)) + ret = cma_init_ud_qp(id_priv, qp); + else + ret = cma_init_conn_qp(id_priv, qp); + if (ret) + goto err; + + id->qp = qp; + id_priv->qp_num = qp->qp_num; + id_priv->srq = (qp->srq != NULL); + return 0; +err: + ib_destroy_qp(qp); + return ret; +} + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + ib_destroy_qp(id->qp); +} + +static int cma_modify_qp_rtr(struct rdma_cm_id *id) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (!id->qp) + return 0; + + /* Need to update QP attributes from default values. */ + qp_attr.qp_state = IB_QPS_INIT; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); + if (ret) + return ret; + + qp_attr.qp_state = IB_QPS_RTR; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); +} + +#ifdef IB_SUPPORTED +static int cma_modify_qp_rts(struct rdma_cm_id *id) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IB_QPS_RTS; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); +} +#endif + +static int cma_modify_qp_err(struct rdma_cm_id *id) +{ + struct ib_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE); +} + +#ifdef IB_SUPPORTED +static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int ret; + + ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, + ib_addr_get_pkey(dev_addr), + &qp_attr->pkey_index); + if (ret) + return ret; + + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; + + if (cma_is_ud_ps(id_priv->id.ps)) { + qp_attr->qkey = id_priv->qkey; + *qp_attr_mask |= IB_QP_QKEY; + } else { + qp_attr->qp_access_flags = 0; + *qp_attr_mask |= IB_QP_ACCESS_FLAGS; + } + return 0; +} +#endif + +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct rdma_id_private *id_priv; + int ret = 0; + + id_priv = container_of(id, struct rdma_id_private, id); +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { + case RDMA_TRANSPORT_IB: + if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps)) + ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); + else + ret = 
ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, + qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) + qp_attr->rq_psn = id_priv->seq_num; + break; + case RDMA_TRANSPORT_IWARP: +#endif + if (!id_priv->cm_id.iw) { + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE; + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + } else + ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, + qp_attr_mask); +#ifdef IB_SUPPORTED + break; + default: + ret = ENOSYS; + break; + } +#endif + + return ret; +} + +static inline int cma_zero_addr(struct sockaddr *addr) +{ + struct in6_addr *ip6; + + if (addr->sa_family == AF_INET) + return in_nullhost(((struct sockaddr_in *) addr)->sin_addr); + else { + ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; + return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | + ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; + } +} + +static inline int cma_loopback_addr(struct sockaddr *addr) +{ + return ((struct sockaddr_in *)addr)->sin_addr.s_addr == INADDR_LOOPBACK; +} + +static inline int cma_any_addr(struct sockaddr *addr) +{ + return cma_zero_addr(addr) || cma_loopback_addr(addr); +} + +static inline __be16 cma_port(struct sockaddr *addr) +{ + if (addr->sa_family == AF_INET) + return ((struct sockaddr_in *) addr)->sin_port; + else + return ((struct sockaddr_in6 *) addr)->sin6_port; +} + +static inline int cma_any_port(struct sockaddr *addr) +{ + return !cma_port(addr); +} + +#ifdef IB_SUPPORTED +static int cma_get_net_info(void *hdr, enum rdma_port_space ps, + u8 *ip_ver, __u16 *port, + union cma_ip_addr **src, union cma_ip_addr **dst) +{ + switch (ps) { + case RDMA_PS_SDP: + if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != + SDP_MAJ_VERSION) + return (EINVAL); + + *ip_ver = sdp_get_ip_ver(hdr); + *port = ((struct sdp_hh *) hdr)->port; + *src = &((struct sdp_hh *) hdr)->src_addr; + *dst = &((struct sdp_hh *) hdr)->dst_addr; + break; + default: + if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) + return (EINVAL); + + *ip_ver = cma_get_ip_ver(hdr); + *port = ((struct cma_hdr *) hdr)->port; + *src = &((struct cma_hdr *) hdr)->src_addr; + *dst = &((struct cma_hdr *) hdr)->dst_addr; + break; + } + + if (*ip_ver != 4 && *ip_ver != 6) + return (EINVAL); + return 0; +} + +static void cma_save_net_info(struct rdma_addr *addr, + struct rdma_addr *listen_addr, + u8 ip_ver, __u16 port, + union cma_ip_addr *src, union cma_ip_addr *dst) +{ + struct sockaddr_in *listen4, *ip4; + struct sockaddr_in6 *listen6, *ip6; + + switch (ip_ver) { + case 4: + listen4 = (struct sockaddr_in *) &listen_addr->src_addr; + ip4 = (struct sockaddr_in *) &addr->src_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr = dst->ip4.addr; + ip4->sin_port = listen4->sin_port; + + ip4 = (struct sockaddr_in *) &addr->dst_addr; + ip4->sin_family = listen4->sin_family; + ip4->sin_addr.s_addr = src->ip4.addr; + ip4->sin_port = port; + break; + case 6: + listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr; + ip6 = (struct sockaddr_in6 *) &addr->src_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = dst->ip6; + ip6->sin6_port = listen6->sin6_port; + + ip6 = (struct sockaddr_in6 *) &addr->dst_addr; + ip6->sin6_family = listen6->sin6_family; + ip6->sin6_addr = src->ip6; + ip6->sin6_port = port; + break; + default: + break; + } +} +#endif + +static inline int cma_user_data_offset(enum rdma_port_space ps) +{ + switch (ps) { + case RDMA_PS_SDP: + return 0; + default: + return sizeof(struct cma_hdr); + } +} + +static void cma_cancel_route(struct rdma_id_private *id_priv) +{ 
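+	/*
+	 * Only IB SA path queries are asynchronous and cancellable; with
+	 * IB support compiled out there is nothing in flight to cancel.
+	 */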
+#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { + case RDMA_TRANSPORT_IB: + if (id_priv->query) + ib_sa_cancel_query(id_priv->query_id, id_priv->query); + break; + default: + break; + } +#endif +} + +static inline int cma_internal_listen(struct rdma_id_private *id_priv) +{ + return (id_priv->state == CMA_LISTEN) && id_priv->cma_dev && + cma_any_addr(&id_priv->id.route.addr.src_addr); +} + +static void cma_destroy_listen(struct rdma_id_private *id_priv) +{ + cma_exch(id_priv, CMA_DESTROYING); + + if (id_priv->cma_dev) { +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id_priv->id.device->node_type)) { + case RDMA_TRANSPORT_IB: + if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) + ib_destroy_cm_id(id_priv->cm_id.ib); + break; + case RDMA_TRANSPORT_IWARP: +#endif + if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw)) + iw_destroy_cm_id(id_priv->cm_id.iw); +#ifdef IB_SUPPORTED + break; + default: + break; + } +#endif + cma_detach_from_dev(id_priv); + } + LIST_REMOVE(id_priv, listen_entry); + + cma_deref_id(id_priv); + mtx_lock(&id_priv->lock); + if (id_priv->refcount) + cv_wait(&id_priv->comp, &id_priv->lock); + mtx_unlock(&id_priv->lock); + + free(id_priv, M_DEVBUF); +} + +static void cma_cancel_listens(struct rdma_id_private *id_priv) +{ + struct rdma_id_private *dev_id_priv; + + mtx_lock(&lock); + LIST_REMOVE(id_priv, list); + + while (!LIST_EMPTY(&id_priv->listen_list)) { + dev_id_priv = LIST_FIRST(&id_priv->listen_list); + cma_destroy_listen(dev_id_priv); + } + mtx_unlock(&lock); +} + +static void cma_cancel_operation(struct rdma_id_private *id_priv, + enum cma_state state) +{ + switch (state) { + case CMA_ADDR_QUERY: + rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); + break; + case CMA_ROUTE_QUERY: + cma_cancel_route(id_priv); + break; + case CMA_LISTEN: + if (cma_any_addr(&id_priv->id.route.addr.src_addr) && + !id_priv->cma_dev) + cma_cancel_listens(id_priv); + break; + default: + break; + } +} + +static void cma_release_port(struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list = id_priv->bind_list; + + if (!bind_list) + return; + + mtx_lock(&lock); + TAILQ_REMOVE(&bind_list->owners, id_priv, node); + if (TAILQ_EMPTY(&bind_list->owners)) { + kvl_delete(bind_list->ps, bind_list->port); + free(bind_list, M_DEVBUF); + } + mtx_unlock(&lock); + if (id_priv->so) + soclose(id_priv->so); +} + +#ifdef IB_SUPPORTED +static void cma_leave_mc_groups(struct rdma_id_private *id_priv) +{ + struct cma_multicast *mc; + + while (!LIST_EMPTY(&id_priv->mc_list)) { + mc = LIST_FIRST(&id_priv->mc_list); + LIST_REMOVE(mc, list); + ib_sa_free_multicast(mc->multicast.ib); + free(mc, M_DEVBUF); + } +} +#endif + +void rdma_destroy_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + enum cma_state state; + + id_priv = container_of(id, struct rdma_id_private, id); + state = cma_exch(id_priv, CMA_DESTROYING); + cma_cancel_operation(id_priv, state); + + mtx_lock(&lock); + if (id_priv->cma_dev) { + mtx_unlock(&lock); +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) + ib_destroy_cm_id(id_priv->cm_id.ib); + break; + case RDMA_TRANSPORT_IWARP: +#endif + if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw)) + iw_destroy_cm_id(id_priv->cm_id.iw); +#ifdef IB_SUPPORTED + break; + default: + break; + } + cma_leave_mc_groups(id_priv); +#endif + mtx_lock(&lock); + cma_detach_from_dev(id_priv); + } + mtx_unlock(&lock); + 
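+	/*
+	 * The CM id is torn down; return the bound port (closing the
+	 * socket, if any) and wait for the last reference before freeing.
+	 */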
cma_release_port(id_priv); + cma_deref_id(id_priv); + mtx_lock(&id_priv->lock); + PANIC_IF(id_priv->refcount < 0); + if (id_priv->refcount) + cv_wait(&id_priv->comp, &id_priv->lock); + mtx_unlock(&id_priv->lock); + free(id_priv->id.route.path_rec, M_DEVBUF); + free(id_priv, M_DEVBUF); +} + +#ifdef IB_SUPPORTED +static int cma_rep_recv(struct rdma_id_private *id_priv) +{ + int ret; + + ret = cma_modify_qp_rtr(&id_priv->id); + if (ret) + goto reject; + + ret = cma_modify_qp_rts(&id_priv->id); + if (ret) + goto reject; + + ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(&id_priv->id); + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +} + +static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) +{ + if (id_priv->id.ps == RDMA_PS_SDP && + sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != + SDP_MAJ_VERSION) + return (EINVAL); + + return 0; +} + +static void cma_set_rep_event_data(struct rdma_cm_event *event, + struct ib_cm_rep_event_param *rep_data, + void *private_data) +{ + event->param.conn.private_data = private_data; + event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + event->param.conn.responder_resources = rep_data->responder_resources; + event->param.conn.initiator_depth = rep_data->initiator_depth; + event->param.conn.flow_control = rep_data->flow_control; + event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; + event->param.conn.srq = rep_data->srq; + event->param.conn.qp_num = rep_data->remote_qpn; +} + +static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event; + int ret = 0; + + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; + + memset(&event, 0, sizeof event); + switch (ib_event->event) { + case IB_CM_REQ_ERROR: + case IB_CM_REP_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = ETIMEDOUT; + break; + case IB_CM_REP_RECEIVED: + event.status = cma_verify_rep(id_priv, ib_event->private_data); + if (event.status) + event.event = RDMA_CM_EVENT_CONNECT_ERROR; + else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) { + event.status = cma_rep_recv(id_priv); + event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : + RDMA_CM_EVENT_ESTABLISHED; + } else + event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, + ib_event->private_data); + break; + case IB_CM_RTU_RECEIVED: + case IB_CM_USER_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + break; + case IB_CM_DREQ_ERROR: + event.status = ETIMEDOUT; /* fall through */ + case IB_CM_DREQ_RECEIVED: + case IB_CM_DREP_RECEIVED: + if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT)) + goto out; + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IB_CM_TIMEWAIT_EXIT: + case IB_CM_MRA_RECEIVED: + /* ignore event */ + goto out; + case IB_CM_REJ_RECEIVED: + cma_modify_qp_err(&id_priv->id); + event.status = ib_event->param.rej_rcvd.reason; + event.event = RDMA_CM_EVENT_REJECTED; + event.param.conn.private_data = ib_event->private_data; + event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; + break; + default: + log(LOG_ERR, "RDMA CMA: unexpected IB CM event: %d", + ib_event->event); + goto out; + } + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. 
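+		 * Clearing cm_id.ib below keeps rdma_destroy_id() from
+		 * destroying the CM id a second time; the IB CM releases
+		 * it once this handler returns non-zero.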
*/ + id_priv->cm_id.ib = NULL; + cma_exch(id_priv, CMA_DESTROYING); + cma_enable_remove(id_priv); + rdma_destroy_id(&id_priv->id); + return ret; + } +out: + cma_enable_remove(id_priv); + return ret; +} + +static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + struct rdma_route *rt; + union cma_ip_addr *src, *dst; + __u16 port; + u8 ip_ver; + + if (cma_get_net_info(ib_event->private_data, listen_id->ps, + &ip_ver, &port, &src, &dst)) + goto err; + + id = rdma_create_id(listen_id->event_handler, listen_id->context, + listen_id->ps); + if (IS_ERR(id)) + goto err; + + cma_save_net_info(&id->route.addr, &listen_id->route.addr, + ip_ver, port, src, dst); + + rt = &id->route; + rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; + rt->path_rec = malloc(sizeof *rt->path_rec * rt->num_paths, + M_DEVBUF, M_NOWAIT); + if (!rt->path_rec) + goto destroy_id; + + rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; + if (rt->num_paths == 2) + rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; + + ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); + ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); + ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); + rt->addr.dev_addr.dev_type = RDMA_NODE_IB_CA; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->state = CMA_CONNECT; + return id_priv; + +destroy_id: + rdma_destroy_id(id); +err: + return NULL; +} + +static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv; + struct rdma_cm_id *id; + union cma_ip_addr *src, *dst; + __u16 port; + u8 ip_ver; + int ret; + + id = rdma_create_id(listen_id->event_handler, listen_id->context, + listen_id->ps); + if (IS_ERR(id)) + return NULL; + + + if (cma_get_net_info(ib_event->private_data, listen_id->ps, + &ip_ver, &port, &src, &dst)) + goto err; + + cma_save_net_info(&id->route.addr, &listen_id->route.addr, + ip_ver, port, src, dst); + + ret = rdma_translate_ip(&id->route.addr.src_addr, + &id->route.addr.dev_addr); + if (ret) + goto err; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->state = CMA_CONNECT; + return id_priv; +err: + rdma_destroy_id(id); + return NULL; +} + +static void cma_set_req_event_data(struct rdma_cm_event *event, + struct ib_cm_req_event_param *req_data, + void *private_data, int offset) +{ + event->param.conn.private_data = private_data + offset; + event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; + event->param.conn.responder_resources = req_data->responder_resources; + event->param.conn.initiator_depth = req_data->initiator_depth; + event->param.conn.flow_control = req_data->flow_control; + event->param.conn.retry_count = req_data->retry_count; + event->param.conn.rnr_retry_count = req_data->rnr_retry_count; + event->param.conn.srq = req_data->srq; + event->param.conn.qp_num = req_data->remote_qpn; +} + +static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +{ + struct rdma_id_private *listen_id, *conn_id; + struct rdma_cm_event event; + int offset, ret; + + listen_id = cm_id->context; + if (cma_disable_remove(listen_id, CMA_LISTEN)) + return (ECONNABORTED); + + memset(&event, 0, sizeof event); + offset = cma_user_data_offset(listen_id->id.ps); + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + if (cma_is_ud_ps(listen_id->id.ps)) { + 
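+		/*
+		 * Unreliable datagram port spaces (RDMA_PS_UDP and
+		 * RDMA_PS_IPOIB) carry no connection state: the new id is
+		 * built from the SIDR request instead of a full IB CM REQ.
+		 */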
conn_id = cma_new_udp_id(&listen_id->id, ib_event); + event.param.ud.private_data = ib_event->private_data + offset; + event.param.ud.private_data_len = + IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; + } else { + conn_id = cma_new_conn_id(&listen_id->id, ib_event); + cma_set_req_event_data(&event, &ib_event->param.req_rcvd, + ib_event->private_data, offset); + } + if (!conn_id) { + ret = ENOMEM; + goto out; + } + + mtx_lock(&conn_id->lock); + conn_id->dev_remove++; + mtx_unlock(&conn_id->lock); + mtx_lock(&lock); + ret = cma_acquire_dev(conn_id); + mtx_unlock(&lock); + if (ret) + goto release_conn_id; + + conn_id->cm_id.ib = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_ib_handler; + + ret = conn_id->id.event_handler(&conn_id->id, &event); + if (!ret) + goto out; + + /* Destroy the CM ID by returning a non-zero value. */ + conn_id->cm_id.ib = NULL; + +release_conn_id: + cma_exch(conn_id, CMA_DESTROYING); + cma_enable_remove(conn_id); + rdma_destroy_id(&conn_id->id); + +out: + cma_enable_remove(listen_id); + return ret; +} + +static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) +{ + return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); +} + +static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, + struct ib_cm_compare_data *compare) +{ + struct cma_hdr *cma_data, *cma_mask; + struct sdp_hh *sdp_data, *sdp_mask; + __u32 ip4_addr; + struct in6_addr ip6_addr; + + memset(compare, 0, sizeof *compare); + cma_data = (void *) compare->data; + cma_mask = (void *) compare->mask; + sdp_data = (void *) compare->data; + sdp_mask = (void *) compare->mask; + + switch (addr->sa_family) { + case AF_INET: + ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; + if (ps == RDMA_PS_SDP) { + sdp_set_ip_ver(sdp_data, 4); + sdp_set_ip_ver(sdp_mask, 0xF); + sdp_data->dst_addr.ip4.addr = ip4_addr; + sdp_mask->dst_addr.ip4.addr = ~0; + } else { + cma_set_ip_ver(cma_data, 4); + cma_set_ip_ver(cma_mask, 0xF); + cma_data->dst_addr.ip4.addr = ip4_addr; + cma_mask->dst_addr.ip4.addr = ~0; + } + break; + case AF_INET6: + ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; + if (ps == RDMA_PS_SDP) { + sdp_set_ip_ver(sdp_data, 6); + sdp_set_ip_ver(sdp_mask, 0xF); + sdp_data->dst_addr.ip6 = ip6_addr; + memset(&sdp_mask->dst_addr.ip6, 0xFF, + sizeof sdp_mask->dst_addr.ip6); + } else { + cma_set_ip_ver(cma_data, 6); + cma_set_ip_ver(cma_mask, 0xF); + cma_data->dst_addr.ip6 = ip6_addr; + memset(&cma_mask->dst_addr.ip6, 0xFF, + sizeof cma_mask->dst_addr.ip6); + } + break; + default: + break; + } +} +#endif /* IB_SUPPORTED */ + +static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) +{ + struct rdma_id_private *id_priv = iw_id->context; + struct rdma_cm_event event; + struct sockaddr_in *sin; + int ret = 0; + + if (cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; + + memset(&event, 0, sizeof event); + switch (iw_event->event) { + case IW_CM_EVENT_CLOSE: + event.event = RDMA_CM_EVENT_DISCONNECTED; + break; + case IW_CM_EVENT_CONNECT_REPLY: + sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; + *sin = iw_event->local_addr; + sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; + *sin = iw_event->remote_addr; + switch (iw_event->status) { + case 0: + event.event = RDMA_CM_EVENT_ESTABLISHED; + break; + case ECONNRESET: + case ECONNREFUSED: + event.event = RDMA_CM_EVENT_REJECTED; + break; + case ETIMEDOUT: + event.event = RDMA_CM_EVENT_UNREACHABLE; + break; + default: + event.event = 
RDMA_CM_EVENT_CONNECT_ERROR; + break; + } + break; + case IW_CM_EVENT_ESTABLISHED: + event.event = RDMA_CM_EVENT_ESTABLISHED; + break; + default: + panic("unknown event type %d", iw_event->event); + + } + + event.status = iw_event->status; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.iw = NULL; + cma_exch(id_priv, CMA_DESTROYING); + cma_enable_remove(id_priv); + rdma_destroy_id(&id_priv->id); + return ret; + } + + cma_enable_remove(id_priv); + return ret; +} + +static int iw_conn_req_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct rdma_cm_id *new_cm_id; + struct rdma_id_private *listen_id, *conn_id; + struct sockaddr_in *sin; + struct ifnet *dev; + struct rdma_cm_event event; + int ret; + struct ifaddr *ifa; + uint16_t port; + + listen_id = cm_id->context; + if (cma_disable_remove(listen_id, CMA_LISTEN)) + return (ECONNABORTED); + + /* Create a new RDMA id for the new IW CM ID */ + new_cm_id = rdma_create_id(listen_id->id.event_handler, + listen_id->id.context, + RDMA_PS_TCP); + if (!new_cm_id) { + ret = ENOMEM; + goto out; + } + conn_id = container_of(new_cm_id, struct rdma_id_private, id); + mtx_lock(&conn_id->lock); + ++conn_id->dev_remove; + mtx_unlock(&conn_id->lock); + conn_id->state = CMA_CONNECT; + + port = iw_event->local_addr.sin_port; + iw_event->local_addr.sin_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *)&iw_event->local_addr); + iw_event->local_addr.sin_port = port; + if (!ifa) { + ret = EADDRNOTAVAIL; + cma_enable_remove(conn_id); + rdma_destroy_id(new_cm_id); + goto out; + } + dev = ifa->ifa_ifp; + ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); + if (ret) { + cma_enable_remove(conn_id); + rdma_destroy_id(new_cm_id); + goto out; + } + + mtx_lock(&lock); + ret = cma_acquire_dev(conn_id); + mtx_unlock(&lock); + if (ret) { + cma_enable_remove(conn_id); + rdma_destroy_id(new_cm_id); + goto out; + } + + conn_id->cm_id.iw = cm_id; + cm_id->context = conn_id; + cm_id->cm_handler = cma_iw_handler; + + sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; + *sin = iw_event->local_addr; + sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; + *sin = iw_event->remote_addr; + conn_id->so = cm_id->so; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + ret = conn_id->id.event_handler(&conn_id->id, &event); + if (ret) { + /* User wants to destroy the CM ID */ + conn_id->cm_id.iw = NULL; + cma_exch(conn_id, CMA_DESTROYING); + cma_enable_remove(conn_id); + rdma_destroy_id(&conn_id->id); + } + +out: + cma_enable_remove(listen_id); + return ret; +} + +#ifdef IB_SUPPORTED +static int cma_ib_listen(struct rdma_id_private *id_priv) +{ + struct ib_cm_compare_data compare_data; + struct sockaddr *addr; + __be64 svc_id; + int ret; + + id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler, + id_priv); + if (IS_ERR(id_priv->cm_id.ib)) + return PTR_ERR(id_priv->cm_id.ib); + + addr = &id_priv->id.route.addr.src_addr; + svc_id = cma_get_service_id(id_priv->id.ps, addr); + if (cma_any_addr(addr)) + ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); + else { + cma_set_compare_data(id_priv->id.ps, addr, &compare_data); + ret = 
ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); + } + + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } + + return ret; +} +#endif + +static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) +{ + int ret; + struct sockaddr_in *sin; + + id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device, id_priv->so, + iw_conn_req_handler, id_priv); + if (IS_ERR(id_priv->cm_id.iw)) + return PTR_ERR(id_priv->cm_id.iw); + + sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; + id_priv->cm_id.iw->local_addr = *sin; + + ret = iw_cm_listen(id_priv->cm_id.iw, backlog); + + if (ret) { + iw_destroy_cm_id(id_priv->cm_id.iw); + id_priv->cm_id.iw = NULL; + } + + return ret; +} + +static int cma_listen_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct rdma_id_private *id_priv = id->context; + + id->context = id_priv->id.context; + id->event_handler = id_priv->id.event_handler; + return id_priv->id.event_handler(id, event); +} + +static void cma_listen_on_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + struct rdma_id_private *dev_id_priv; + struct rdma_cm_id *id; + int ret; + + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps); + if (IS_ERR(id)) + return; + + dev_id_priv = container_of(id, struct rdma_id_private, id); + + dev_id_priv->state = CMA_ADDR_BOUND; + memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, + ip_addr_size(&id_priv->id.route.addr.src_addr)); + dev_id_priv->so = id_priv->so; /* XXX */ + + cma_attach_to_dev(dev_id_priv, cma_dev); + LIST_INSERT_HEAD(&id_priv->listen_list, dev_id_priv, listen_entry); + + ret = rdma_listen(id, id_priv->backlog); + if (ret) + goto err; + + return; +err: + cma_destroy_listen(dev_id_priv); +} + +static void cma_listen_on_all(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev; + + mtx_lock(&lock); + LIST_INSERT_HEAD(&listen_any_list, id_priv, list); + TAILQ_FOREACH(cma_dev, &dev_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mtx_unlock(&lock); +} + +static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af) +{ + struct sockaddr_in addr_in; + + memset(&addr_in, 0, sizeof addr_in); + addr_in.sin_family = af; + addr_in.sin_len = sizeof addr_in; + return rdma_bind_addr(id, (struct sockaddr *) &addr_in); +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == CMA_IDLE) { + ret = cma_bind_any(id, AF_INET); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN)) + return (EINVAL); + + id_priv->backlog = backlog; + if (id->device) { +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_ib_listen(id_priv); + if (ret) + goto err; + break; + case RDMA_TRANSPORT_IWARP: +#endif + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; +#ifdef IB_SUPPORTED + break; + default: + ret = ENOSYS; + goto err; + } +#endif + } else + cma_listen_on_all(id_priv); + + return 0; +err: + id_priv->backlog = 0; + cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND); + return ret; +} + +#ifdef IB_SUPPORTED +static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, + void *context) +{ + struct cma_work *work = context; + struct rdma_route *route; + + route = &work->id->id.route; + + if (!status) { + route->num_paths = 1; + *route->path_rec = *path_rec; + } else { + work->old_state = 
CMA_ROUTE_QUERY; + work->new_state = CMA_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + work->event.status = status; + } + + taskqueue_enqueue(cma_wq, &work->task); +} + +static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, + struct cma_work *work) +{ + struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr; + struct ib_sa_path_rec path_rec; + + memset(&path_rec, 0, sizeof path_rec); + ib_addr_get_sgid(addr, &path_rec.sgid); + ib_addr_get_dgid(addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr)); + path_rec.numb_path = 1; + path_rec.reversible = 1; + + id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, + id_priv->id.port_num, &path_rec, + IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_REVERSIBLE, + timeout_ms, M_NOWAIT, + cma_query_handler, work, &id_priv->query); + + return (id_priv->query_id < 0) ? id_priv->query_id : 0; +} +#endif + +static void cma_work_handler(void *context, int pending) +{ + struct cma_work *work = context; + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + mtx_lock(&id_priv->lock); + ++id_priv->dev_remove; + mtx_unlock(&id_priv->lock); + if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, CMA_DESTROYING); + destroy = 1; + } +out: + cma_enable_remove(id_priv); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + free(work, M_DEVBUF); +} + +#ifdef IB_SUPPORTED +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +{ + struct rdma_route *route = &id_priv->id.route; + struct cma_work *work; + int ret; + + work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT); + if (!work) + return (ENOMEM); + bzero(work, sizeof *work); + + work->id = id_priv; + TASK_INIT(&work->task, 0, cma_work_handler, work); + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + + route->path_rec = malloc(sizeof *route->path_rec, M_DEVBUF, M_NOWAIT); + if (!route->path_rec) { + ret = ENOMEM; + goto err1; + } + + ret = cma_query_ib_route(id_priv, timeout_ms, work); + if (ret) + goto err2; + + return 0; +err2: + free(route->path_rec, M_DEVBUF); + route->path_rec = NULL; +err1: + free(work, M_DEVBUF); + return ret; +} + +int rdma_set_ib_paths(struct rdma_cm_id *id, + struct ib_sa_path_rec *path_rec, int num_paths) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED)) + return (EINVAL); + + id->route.path_rec = malloc(sizeof *path_rec * num_paths, M_DEVBUF, M_NOWAIT); + if (!id->route.path_rec) { + ret = ENOMEM; + goto err; + } + + memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths); + return 0; +err: + cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED); + return ret; +} +#endif + +static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) +{ + struct cma_work *work; + + work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT); + if (!work) + return (ENOMEM); + bzero(work, sizeof *work); + + work->id = id_priv; + TASK_INIT(&work->task, 0, cma_work_handler, work); + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + taskqueue_enqueue(cma_wq, &work->task); + return 0; 
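+	/*
+	 * No query is needed for iWARP: the route is implied by the IP
+	 * path, so the work item above simply reports ROUTE_RESOLVED
+	 * from the taskqueue.
+	 */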
+} + +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY)) + return (EINVAL); + + mtx_lock(&id_priv->lock); + id_priv->refcount++; + mtx_unlock(&id_priv->lock); +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_resolve_ib_route(id_priv, timeout_ms); + break; + case RDMA_TRANSPORT_IWARP: +#endif + ret = cma_resolve_iw_route(id_priv, timeout_ms); +#ifdef IB_SUPPORTED + break; + default: + ret = ENOSYS; + break; + } +#endif + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED); + cma_deref_id(id_priv); + return ret; +} + +static int cma_bind_loopback(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev; + struct ib_port_attr port_attr; + union ib_gid gid; + u16 pkey; + int ret; + u8 p; + + mtx_lock(&lock); + if (TAILQ_EMPTY(&dev_list)) { + ret = ENODEV; + goto out; + } + TAILQ_FOREACH(cma_dev, &dev_list, list) + for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) + if (!ib_query_port(cma_dev->device, p, &port_attr) && + port_attr.state == IB_PORT_ACTIVE) + goto port_found; + + p = 1; + cma_dev = TAILQ_FIRST(&dev_list); + +port_found: + ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); + if (ret) + goto out; + + ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); + if (ret) + goto out; + + ib_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); + id_priv->id.port_num = p; + cma_attach_to_dev(id_priv, cma_dev); +out: + mtx_unlock(&lock); + return ret; +} + +static void addr_handler(int status, struct sockaddr *src_addr, + struct rdma_dev_addr *dev_addr, void *context) +{ + struct rdma_id_private *id_priv = context; + struct rdma_cm_event event; + + memset(&event, 0, sizeof event); + mtx_lock(&id_priv->lock); + ++id_priv->dev_remove; + mtx_unlock(&id_priv->lock); + + /* + * Grab mutex to block rdma_destroy_id() from removing the device while + * we're trying to acquire it. 
+ */ + mtx_lock(&lock); + if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) { + mtx_unlock(&lock); + goto out; + } + + if (!status && !id_priv->cma_dev) + status = cma_acquire_dev(id_priv); + mtx_unlock(&lock); + + if (status) { + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND)) + goto out; + event.event = RDMA_CM_EVENT_ADDR_ERROR; + event.status = status; + } else { + memcpy(&id_priv->id.route.addr.src_addr, src_addr, + ip_addr_size(src_addr)); + event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + } + + if (id_priv->id.event_handler(&id_priv->id, &event)) { + cma_exch(id_priv, CMA_DESTROYING); + cma_enable_remove(id_priv); + cma_deref_id(id_priv); + rdma_destroy_id(&id_priv->id); + return; + } +out: + cma_enable_remove(id_priv); + cma_deref_id(id_priv); +} + +static int cma_resolve_loopback(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + struct sockaddr_in *src_in, *dst_in; + union ib_gid gid; + int ret; + + work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT); + if (!work) + return (ENOMEM); + bzero(work, sizeof *work); + + if (!id_priv->cma_dev) { + ret = cma_bind_loopback(id_priv); + if (ret) + goto err; + } + + ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); + ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); + + if (cma_zero_addr(&id_priv->id.route.addr.src_addr)) { + src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr; + dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr; + src_in->sin_family = dst_in->sin_family; + src_in->sin_addr.s_addr = dst_in->sin_addr.s_addr; + } + + work->id = id_priv; + TASK_INIT(&work->task, 0, cma_work_handler, work); + work->old_state = CMA_ADDR_QUERY; + work->new_state = CMA_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + taskqueue_enqueue(cma_wq, &work->task); + return 0; +err: + free(work, M_DEVBUF); + return ret; +} + +static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr) +{ + if (src_addr && src_addr->sa_family) + return rdma_bind_addr(id, src_addr); + else + return cma_bind_any(id, dst_addr->sa_family); +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == CMA_IDLE) { + ret = cma_bind_addr(id, src_addr, dst_addr); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY)) + return (EINVAL); + + mtx_lock(&id_priv->lock); + id_priv->refcount++; + mtx_unlock(&id_priv->lock); + memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); + if (cma_any_addr(dst_addr)) + ret = cma_resolve_loopback(id_priv); + else + ret = rdma_resolve_ip(&addr_client, &id->route.addr.src_addr, + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND); + cma_deref_id(id_priv); + return ret; +} + +static void cma_bind_port(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; + sin->sin_port = htons(bind_list->port); + id_priv->bind_list = bind_list; + TAILQ_INSERT_HEAD(&bind_list->owners, id_priv, node); +} + +static int cma_alloc_port(struct kvl *ps, struct rdma_id_private *id_priv, + unsigned short snum) +{ + struct rdma_bind_list *bind_list; + int port, ret; + + 
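+	/*
+	 * Reserve the requested port in this port space's key/value
+	 * table.  kvl_alloc_above() may transiently fail with EAGAIN,
+	 * so the allocation is retried until it returns a final status.
+	 */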
bind_list = malloc(sizeof *bind_list, M_DEVBUF, M_NOWAIT); + if (!bind_list) + return (ENOMEM); + bzero(bind_list, sizeof *bind_list); + + do { + ret = kvl_alloc_above(ps, bind_list, snum, &port); + } while (ret == EAGAIN); + + if (ret) + goto err1; + + if (port != snum) { + ret = EADDRNOTAVAIL; + goto err2; + } + + bind_list->ps = ps; + bind_list->port = (unsigned short) port; + cma_bind_port(bind_list, id_priv); + return 0; +err2: + kvl_delete(ps, port); +err1: + free(bind_list, M_DEVBUF); + return ret; +} + +static int cma_alloc_any_port(struct kvl *ps, struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list; + int port, ret; + + bind_list = malloc(sizeof *bind_list, M_DEVBUF, M_NOWAIT); + if (!bind_list) + return (ENOMEM); + bzero(bind_list, sizeof *bind_list); + +retry: + do { + ret = kvl_alloc_above(ps, bind_list, next_port, &port); + } while (ret == EAGAIN); + + if (ret) + goto err1; + + if (port > ipport_lastauto) { + if (next_port != ipport_firstauto) { + kvl_delete(ps, port); + next_port = ipport_firstauto; + goto retry; + } + ret = EADDRNOTAVAIL; + goto err2; + } + + if (port == ipport_lastauto) + next_port = ipport_firstauto; + else + next_port = port + 1; + + bind_list->ps = ps; + bind_list->port = (unsigned short) port; + cma_bind_port(bind_list, id_priv); + return 0; +err2: + kvl_delete(ps, port); +err1: + free(bind_list, M_DEVBUF); + return ret; +} + +static int cma_use_port(struct kvl *ps, struct rdma_id_private *id_priv) +{ + struct rdma_id_private *cur_id; + struct sockaddr_in *sin, *cur_sin; + struct rdma_bind_list *bind_list; + unsigned short snum; + + sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; + snum = ntohs(sin->sin_port); + if (snum <= ipport_reservedhigh && snum >= ipport_reservedlow && + priv_check(curthread, PRIV_NETINET_RESERVEDPORT)) + return (EACCES); + + bind_list = kvl_lookup(ps, snum); + if (!bind_list) + return cma_alloc_port(ps, id_priv, snum); + + /* + * We don't support binding to any address if anyone is bound to + * a specific address on the same port. 
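+	 * The converse is also enforced below: a specific bind fails
+	 * with EADDRNOTAVAIL if a wildcard binding already owns the
+	 * port, and with EADDRINUSE on an exact address match.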
+ */ + if (cma_any_addr(&id_priv->id.route.addr.src_addr)) + return (EADDRNOTAVAIL); + + TAILQ_FOREACH(cur_id, &bind_list->owners, node) { + if (cma_any_addr(&cur_id->id.route.addr.src_addr)) + return (EADDRNOTAVAIL); + + cur_sin = (struct sockaddr_in *)&cur_id->id.route.addr.src_addr; + if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr) + return (EADDRINUSE); + } + + cma_bind_port(bind_list, id_priv); + return 0; +} + +static int cma_get_tcp_port(struct rdma_id_private *id_priv) +{ + int ret; + struct socket *so; + + ret = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP, + curthread->td_ucred, curthread); + if (ret) { + printf("%s socreate err %d\n", __FUNCTION__, ret); + return ret; + } + + ret = sobind(so, (struct sockaddr *)&id_priv->id.route.addr.src_addr, + curthread); + if (ret) { + soclose(so); + return ret; + } + id_priv->so = so; + return 0; +} + +static int cma_get_port(struct rdma_id_private *id_priv) +{ + struct kvl *ps; + int ret; + + switch (id_priv->id.ps) { + case RDMA_PS_SDP: + ps = &sdp_ps; + break; + case RDMA_PS_TCP: + ps = &tcp_ps; + ret = cma_get_tcp_port(id_priv); /* Synch with native stack */ + if (ret) + return ret; + break; + case RDMA_PS_UDP: + ps = &udp_ps; + break; + case RDMA_PS_IPOIB: + ps = &ipoib_ps; + break; + default: + return (EPROTONOSUPPORT); + } + + mtx_lock(&lock); + if (cma_any_port(&id_priv->id.route.addr.src_addr)) + ret = cma_alloc_any_port(ps, id_priv); + else + ret = cma_use_port(ps, id_priv); + mtx_unlock(&lock); + + return ret; +} + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + int ret; + + if (addr->sa_family != AF_INET) + return (EAFNOSUPPORT); + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) + return (EINVAL); + + if (!cma_any_addr(addr)) { + ret = rdma_translate_ip(addr, &id->route.addr.dev_addr); + if (ret) + goto err1; + + mtx_lock(&lock); + ret = cma_acquire_dev(id_priv); + mtx_unlock(&lock); + if (ret) + goto err1; + } + + memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); + ret = cma_get_port(id_priv); + if (ret) + goto err2; + + return 0; +err2: + if (!cma_any_addr(addr)) { + mtx_lock(&lock); + cma_detach_from_dev(id_priv); + mtx_unlock(&lock); + } +err1: + cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE); + return ret; +} + +#ifdef IB_SUPPORTED +static int cma_format_hdr(void *hdr, enum rdma_port_space ps, + struct rdma_route *route) +{ + struct sockaddr_in *src4, *dst4; + struct cma_hdr *cma_hdr; + struct sdp_hh *sdp_hdr; + + src4 = (struct sockaddr_in *) &route->addr.src_addr; + dst4 = (struct sockaddr_in *) &route->addr.dst_addr; + + switch (ps) { + case RDMA_PS_SDP: + sdp_hdr = hdr; + if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) + return (EINVAL); + sdp_set_ip_ver(sdp_hdr, 4); + sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + sdp_hdr->port = src4->sin_port; + break; + default: + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + cma_set_ip_ver(cma_hdr, 4); + cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; + cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; + cma_hdr->port = src4->sin_port; + break; + } + return 0; +} + +static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *ib_event) +{ + struct rdma_id_private *id_priv = cm_id->context; + struct rdma_cm_event event; + struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; + int ret = 0; + + if 
(cma_disable_remove(id_priv, CMA_CONNECT)) + return 0; + + memset(&event, 0, sizeof event); + switch (ib_event->event) { + case IB_CM_SIDR_REQ_ERROR: + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = ETIMEDOUT; + break; + case IB_CM_SIDR_REP_RECEIVED: + event.param.ud.private_data = ib_event->private_data; + event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; + if (rep->status != IB_SIDR_SUCCESS) { + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = ib_event->param.sidr_rep_rcvd.status; + break; + } + if (id_priv->qkey != rep->qkey) { + event.event = RDMA_CM_EVENT_UNREACHABLE; + event.status = EINVAL; + break; + } + ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); + event.param.ud.qp_num = rep->qpn; + event.param.ud.qkey = rep->qkey; + event.event = RDMA_CM_EVENT_ESTABLISHED; + event.status = 0; + break; + default: + log(LOG_ERR, "RDMA CMA: unexpected IB CM event: %d", + ib_event->event); + goto out; + } + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + /* Destroy the CM ID by returning a non-zero value. */ + id_priv->cm_id.ib = NULL; + cma_exch(id_priv, CMA_DESTROYING); + cma_enable_remove(id_priv); + rdma_destroy_id(&id_priv->id); + return ret; + } +out: + cma_enable_remove(id_priv); + return ret; +} + +static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_sidr_req_param req; + struct rdma_route *route; + int ret; + + req.private_data_len = sizeof(struct cma_hdr) + + conn_param->private_data_len; + req.private_data = malloc(req.private_data_len, M_DEVBUF, M_NOWAIT); + if (!req.private_data) + return (ENOMEM); + bzero((void *)req.private_data, req.private_data_len); + + if (conn_param->private_data && conn_param->private_data_len) + memcpy((caddr_t) req.private_data + sizeof(struct cma_hdr), + conn_param->private_data, conn_param->private_data_len); + + route = &id_priv->id.route; + ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); + if (ret) + goto out; + + id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, + cma_sidr_rep_handler, id_priv); + if (IS_ERR(id_priv->cm_id.ib)) { + ret = PTR_ERR(id_priv->cm_id.ib); + goto out; + } + + req.path = route->path_rec; + req.service_id = cma_get_service_id(id_priv->id.ps, + &route->addr.dst_addr); + req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); + req.max_cm_retries = CMA_MAX_CM_RETRIES; + + ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); + if (ret) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } +out: + free(req.private_data, M_DEVBUF); + return ret; +} + +static int cma_connect_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_req_param req; + struct rdma_route *route; + void *private_data; + int offset, ret; + + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv->id.ps); + req.private_data_len = offset + conn_param->private_data_len; + private_data = malloc(req.private_data_len, M_DEVBUF, M_NOWAIT); + if (!private_data) + return (ENOMEM); + bzero(private_data, req.private_data_len); + + if (conn_param->private_data && conn_param->private_data_len) + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); + + id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler, + id_priv); + if (IS_ERR(id_priv->cm_id.ib)) { + ret = PTR_ERR(id_priv->cm_id.ib); + goto out; + } + + route = 
&id_priv->id.route; + ret = cma_format_hdr(private_data, id_priv->id.ps, route); + if (ret) + goto out; + req.private_data = private_data; + + req.primary_path = &route->path_rec[0]; + if (route->num_paths == 2) + req.alternate_path = &route->path_rec[1]; + + req.service_id = cma_get_service_id(id_priv->id.ps, + &route->addr.dst_addr); + req.qp_num = id_priv->qp_num; + req.qp_type = IB_QPT_RC; + req.starting_psn = id_priv->seq_num; + req.responder_resources = conn_param->responder_resources; + req.initiator_depth = conn_param->initiator_depth; + req.flow_control = conn_param->flow_control; + req.retry_count = conn_param->retry_count; + req.rnr_retry_count = conn_param->rnr_retry_count; + req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.max_cm_retries = CMA_MAX_CM_RETRIES; + req.srq = id_priv->srq ? 1 : 0; + + ret = ib_send_cm_req(id_priv->cm_id.ib, &req); +out: + if (ret && !IS_ERR(id_priv->cm_id.ib)) { + ib_destroy_cm_id(id_priv->cm_id.ib); + id_priv->cm_id.ib = NULL; + } + + free(private_data, M_DEVBUF); + return ret; +} +#endif + +static int cma_connect_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_id *cm_id; + struct sockaddr_in* sin; + int ret; + struct iw_cm_conn_param iw_param; + + cm_id = iw_create_cm_id(id_priv->id.device, id_priv->so, + cma_iw_handler, id_priv); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + goto out; + } + + id_priv->cm_id.iw = cm_id; + + sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; + cm_id->local_addr = *sin; + + sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; + cm_id->remote_addr = *sin; + + ret = cma_modify_qp_rtr(&id_priv->id); + if (ret) + goto out; + + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + if (id_priv->id.qp) + iw_param.qpn = id_priv->qp_num; + else + iw_param.qpn = conn_param->qp_num; + ret = iw_cm_connect(cm_id, &iw_param); +out: + if (ret && !IS_ERR(cm_id)) { + iw_destroy_cm_id(cm_id); + id_priv->cm_id.iw = NULL; + } + return ret; +} + +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT)) + return (EINVAL); + + if (!id->qp) { + id_priv->qp_num = conn_param->qp_num; + id_priv->srq = conn_param->srq; + } + +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (cma_is_ud_ps(id->ps)) + ret = cma_resolve_ib_udp(id_priv, conn_param); + else + ret = cma_connect_ib(id_priv, conn_param); + break; + case RDMA_TRANSPORT_IWARP: +#endif + ret = cma_connect_iw(id_priv, conn_param); +#ifdef IB_SUPPORTED + break; + default: + ret = ENOSYS; + break; + } +#endif + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED); + return ret; +} + +#ifdef IB_SUPPORTED +static int cma_accept_ib(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct ib_cm_rep_param rep; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + if (id_priv->id.qp) { + ret = cma_modify_qp_rtr(&id_priv->id); + if (ret) + goto out; + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, &qp_attr, + &qp_attr_mask); + if (ret) + goto out; + + 
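+		/*
+		 * Finish the transition to RTS, bounding outstanding RDMA
+		 * reads/atomics by the initiator depth the caller accepted.
+		 */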
qp_attr.max_rd_atomic = conn_param->initiator_depth; + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); + if (ret) + goto out; + } + + memset(&rep, 0, sizeof rep); + rep.qp_num = id_priv->qp_num; + rep.starting_psn = id_priv->seq_num; + rep.private_data = conn_param->private_data; + rep.private_data_len = conn_param->private_data_len; + rep.responder_resources = conn_param->responder_resources; + rep.initiator_depth = conn_param->initiator_depth; + rep.target_ack_delay = CMA_CM_RESPONSE_TIMEOUT; + rep.failover_accepted = 0; + rep.flow_control = conn_param->flow_control; + rep.rnr_retry_count = conn_param->rnr_retry_count; + rep.srq = id_priv->srq ? 1 : 0; + + ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); +out: + return ret; +} +#endif + +static int cma_accept_iw(struct rdma_id_private *id_priv, + struct rdma_conn_param *conn_param) +{ + struct iw_cm_conn_param iw_param; + int ret; + + ret = cma_modify_qp_rtr(&id_priv->id); + if (ret) + return ret; + + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; + if (id_priv->id.qp) { + iw_param.qpn = id_priv->qp_num; + } else + iw_param.qpn = conn_param->qp_num; + + return iw_cm_accept(id_priv->cm_id.iw, &iw_param); +} + +#ifdef IB_SUPPORTED +static int cma_send_sidr_rep(struct rdma_id_private *id_priv, + enum ib_cm_sidr_status status, + const void *private_data, int private_data_len) +{ + struct ib_cm_sidr_rep_param rep; + + memset(&rep, 0, sizeof rep); + rep.status = status; + if (status == IB_SIDR_SUCCESS) { + rep.qp_num = id_priv->qp_num; + rep.qkey = id_priv->qkey; + } + rep.private_data = private_data; + rep.private_data_len = private_data_len; + + return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); +} +#endif + +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp(id_priv, CMA_CONNECT)) + return (EINVAL); + + if (!id->qp && conn_param) { + id_priv->qp_num = conn_param->qp_num; + id_priv->srq = conn_param->srq; + } + +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (cma_is_ud_ps(id->ps)) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->private_data, + conn_param->private_data_len); + else if (conn_param) + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); + break; + case RDMA_TRANSPORT_IWARP: +#endif + ret = cma_accept_iw(id_priv, conn_param); +#ifdef IB_SUPPORTED + break; + default: + ret = ENOSYS; + break; + } +#endif + + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(id); + rdma_reject(id, NULL, 0); + return ret; +} + +int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_has_cm_dev(id_priv)) + return (EINVAL); +#ifdef IB_SUPPORTED + switch (id->device->node_type) { + case RDMA_NODE_IB_CA: + ret = ib_cm_notify(id_priv->cm_id.ib, event); + break; + default: +#endif + ret = 0; +#ifdef IB_SUPPORTED + break; + } +#endif + return ret; +} + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_has_cm_dev(id_priv)) + return 
(EINVAL); + +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (cma_is_ud_ps(id->ps)) + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, + private_data, private_data_len); + else + ret = ib_send_cm_rej(id_priv->cm_id.ib, + IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, private_data, private_data_len); + break; + case RDMA_TRANSPORT_IWARP: +#endif + ret = iw_cm_reject(id_priv->cm_id.iw, + private_data, private_data_len); +#ifdef IB_SUPPORTED + break; + default: + ret = ENOSYS; + break; + } +#endif + return ret; +} + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_has_cm_dev(id_priv)) + return (EINVAL); + +#ifdef IB_SUPPORTED + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_modify_qp_err(id); + if (ret) + goto out; + /* Initiate or respond to a disconnect. */ + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) + ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + break; + case RDMA_TRANSPORT_IWARP: +#endif + ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); +#ifdef IB_SUPPORTED + break; + default: + ret = EINVAL; + break; + } +out: +#endif + return ret; +} + +#ifdef IB_SUPPORTED +static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc = multicast->context; + struct rdma_cm_event event; + int ret; + + id_priv = mc->id_priv; + if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) && + cma_disable_remove(id_priv, CMA_ADDR_RESOLVED)) + return 0; + + if (!status && id_priv->id.qp) + status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, + multicast->rec.mlid); + + memset(&event, 0, sizeof event); + event.status = status; + event.param.ud.private_data = mc->context; + if (!status) { + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; + ib_init_ah_from_mcmember(id_priv->id.device, + id_priv->id.port_num, &multicast->rec, + &event.param.ud.ah_attr); + event.param.ud.qp_num = 0xFFFFFF; + event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + } else + event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + + ret = id_priv->id.event_handler(&id_priv->id, &event); + if (ret) { + cma_exch(id_priv, CMA_DESTROYING); + cma_enable_remove(id_priv); + rdma_destroy_id(&id_priv->id); + return 0; + } + + cma_enable_remove(id_priv); + return 0; +} + +static void cma_set_mgid(struct rdma_id_private *id_priv, + struct sockaddr *addr, union ib_gid *mgid) +{ + unsigned char mc_map[MAX_ADDR_LEN]; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct sockaddr_in *sin = (struct sockaddr_in *) addr; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; + + if (cma_any_addr(addr)) { + memset(mgid, 0, sizeof *mgid); + } else if ((addr->sa_family == AF_INET6) && + ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFF10A01B) == + 0xFF10A01B)) { + /* IPv6 address is an SA assigned MGID. 
*/ + memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else { + ip_ib_mc_map(sin->sin_addr.s_addr, mc_map); + if (id_priv->id.ps == RDMA_PS_UDP) + mc_map[7] = 0x01; /* Use RDMA CM signature */ + mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8; + mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr); + *mgid = *(union ib_gid *) (mc_map + 4); + } +} + +static int cma_join_ib_multicast(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct ib_sa_mcmember_rec rec; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + ib_sa_comp_mask comp_mask; + int ret; + + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, + &rec.mgid, &rec); + if (ret) + return ret; + + cma_set_mgid(id_priv, &mc->addr, &rec.mgid); + if (id_priv->id.ps == RDMA_PS_UDP) + rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + ib_addr_get_sgid(dev_addr, &rec.port_gid); + rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); + rec.join_state = 1; + + comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | + IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; + + mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device, + id_priv->id.port_num, &rec, + comp_mask, M_NOWAIT, + cma_ib_mc_handler, mc); + if (IS_ERR(mc->multicast.ib)) + return PTR_ERR(mc->multicast.ib); + + return 0; +} + +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!cma_comp(id_priv, CMA_ADDR_BOUND) && + !cma_comp(id_priv, CMA_ADDR_RESOLVED)) + return (EINVAL); + + mc = malloc(sizeof *mc, M_DEVBUF, M_NOWAIT); + if (!mc) + return (ENOMEM); + + memcpy(&mc->addr, addr, ip_addr_size(addr)); + mc->context = context; + mc->id_priv = id_priv; + + mtx_lock(&id_priv->lock); + LIST_INSERT_HEAD(&id_priv->mc_list, mc, list); + mtx_unlock(&id_priv->lock); + + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_join_ib_multicast(id_priv, mc); + break; + default: + ret = ENOSYS; + break; + } + + if (ret) { + mtx_lock(&id_priv->lock); + list_del(&mc->list); + mtx_unlock(&id_priv->lock); + free(mc, M_DEVBUF); + } + return ret; +} + +void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct rdma_id_private *id_priv; + struct cma_multicast *mc; + + id_priv = container_of(id, struct rdma_id_private, id); + mtx_lock(&id_priv->lock); + LIST_FOREACH(mc, &id_priv->mc_list, list) { + if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { + list_del(&mc->list); + mtx_unlock(&id_priv->lock); + + if (id->qp) + ib_detach_mcast(id->qp, + &mc->multicast.ib->rec.mgid, + mc->multicast.ib->rec.mlid); + ib_sa_free_multicast(mc->multicast.ib, M_DEVBUF); + free(mc, M_DEVBUF); + return; + } + } + mtx_unlock(&id_priv->lock); +} +#endif + +static void cma_add_one(struct ib_device *device) +{ + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + + cma_dev = malloc(sizeof *cma_dev, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!cma_dev) + return; + + cma_dev->device = device; + + cv_init(&cma_dev->comp, "cma_device"); + mtx_init(&cma_dev->lock, "cma_device", NULL, MTX_DUPOK|MTX_DEF); + cma_dev->refcount = 1; + LIST_INIT(&cma_dev->id_list); + ib_set_client_data(device, &cma_client, cma_dev); + + mtx_lock(&lock); + 
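+	/*
+	 * Publish the device and replay it to every id listening on the
+	 * wildcard address, so existing listeners begin accepting
+	 * connections that arrive on this device as well.
+	 */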
TAILQ_INSERT_TAIL(&dev_list, cma_dev, list); + LIST_FOREACH(id_priv, &listen_any_list, list) + cma_listen_on_dev(id_priv, cma_dev); + mtx_unlock(&lock); +} + +static int cma_remove_id_dev(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event; + enum cma_state state; + + /* Record that we want to remove the device */ + state = cma_exch(id_priv, CMA_DEVICE_REMOVAL); + if (state == CMA_DESTROYING) + return 0; + + cma_cancel_operation(id_priv, state); + mtx_lock(&id_priv->lock); + PANIC_IF(id_priv->dev_remove < 0); + if (id_priv->dev_remove) + cv_wait(&id_priv->wait_remove, &id_priv->lock); + mtx_unlock(&id_priv->lock); + + /* Check for destruction from another callback. */ + if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL)) + return 0; + + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; + return id_priv->id.event_handler(&id_priv->id, &event); +} + +static void cma_process_remove(struct cma_device *cma_dev) +{ + struct rdma_id_private *id_priv; + int ret; + + mtx_lock(&lock); + while (!LIST_EMPTY(&cma_dev->id_list)) { + id_priv = LIST_FIRST(&cma_dev->id_list); + + if (cma_internal_listen(id_priv)) { + cma_destroy_listen(id_priv); + continue; + } + + LIST_REMOVE(id_priv, list); + mtx_lock(&id_priv->lock); + id_priv->refcount++; + mtx_unlock(&id_priv->lock); + mtx_unlock(&lock); + + ret = cma_remove_id_dev(id_priv); + cma_deref_id(id_priv); + if (ret) + rdma_destroy_id(&id_priv->id); + + mtx_lock(&lock); + } + mtx_unlock(&lock); + + cma_deref_dev(cma_dev); + mtx_lock(&cma_dev->lock); + PANIC_IF(cma_dev->refcount < 0); + if (cma_dev->refcount) + cv_wait(&cma_dev->comp, &cma_dev->lock); + mtx_unlock(&cma_dev->lock); +} + +static void cma_remove_one(struct ib_device *device) +{ + struct cma_device *cma_dev; + + cma_dev = ib_get_client_data(device, &cma_client); + if (!cma_dev) + return; + + mtx_lock(&lock); + TAILQ_REMOVE(&dev_list, cma_dev, list); + mtx_unlock(&lock); + + cma_process_remove(cma_dev); + free(cma_dev, M_DEVBUF); +} + +static int cma_init(void) +{ + int ret; + + LIST_INIT(&listen_any_list); + TAILQ_INIT(&dev_list); + mtx_init(&lock, "cma_device list", NULL, MTX_DEF); + + arc4rand(&next_port, sizeof next_port, 0); + next_port = ((unsigned int) next_port % + (ipport_lastauto - ipport_firstauto)) + + ipport_firstauto; + cma_wq = taskqueue_create("rdma_cm", M_NOWAIT, taskqueue_thread_enqueue, + &cma_wq); + + if (!cma_wq) + return (ENOMEM); + + taskqueue_start_threads(&cma_wq, 1, PI_NET, "cma_wq thread"); +#ifdef IB_SUPPORTED + ib_sa_register_client(&sa_client); +#endif + rdma_addr_register_client(&addr_client); + + ret = ib_register_client(&cma_client); + if (ret) + goto err; + return 0; + +err: + rdma_addr_unregister_client(&addr_client); +#ifdef IB_SUPPORTED + ib_sa_unregister_client(&sa_client); +#endif + taskqueue_free(cma_wq); + return ret; +} + +static void cma_cleanup(void) +{ + ib_unregister_client(&cma_client); + rdma_addr_unregister_client(&addr_client); +#ifdef IB_SUPPORTED + ib_sa_unregister_client(&sa_client); +#endif + taskqueue_free(cma_wq); + kvl_free(&sdp_ps); + kvl_free(&tcp_ps); + kvl_free(&udp_ps); + kvl_free(&ipoib_ps); +} + +static int +cma_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + printf("Loading rdma_cma.\n"); + cma_init(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("Unloading rdma_cma.\n"); + cma_cleanup(); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data = { + 
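+	/* name, event handler, extra data pointer */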
"rdma_cma", + cma_load, + 0 +}; + +MODULE_VERSION(rdma_cma, 1); +MODULE_DEPEND(rdma_cma, rdma_core, 1, 1, 1); +MODULE_DEPEND(rdma_cma, rdma_addr, 1, 1, 1); +MODULE_DEPEND(rdma_cma, rdma_iwcm, 1, 1, 1); +DECLARE_MODULE(rdma_cma, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/contrib/rdma/rdma_device.c b/sys/contrib/rdma/rdma_device.c new file mode 100644 index 000000000000..53cf31fb28d9 --- /dev/null +++ b/sys/contrib/rdma/rdma_device.c @@ -0,0 +1,776 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: device.c 1349 2004-12-16 21:09:43Z roland $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/condvar.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/socket.h> +#include <sys/module.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/priv.h> +#include <sys/syslog.h> + +#include <contrib/rdma/core_priv.h> + +struct ib_client_data { + TAILQ_ENTRY(ib_client_data) list; + struct ib_client *client; + void * data; +}; + +static TAILQ_HEAD(, ib_device) device_list; +static TAILQ_HEAD(client_list_s, ib_client) client_list; + +/* + * device_mutex protects access to both device_list and client_list. + * There's no real point to using multiple locks or something fancier + * like an rwsem: we always access both lists, and we're always + * modifying one list or the other list. In any case this is not a + * hot path so there's no point in trying to optimize. 
+ */ +static struct mtx device_mutex; + +static int ib_device_check_mandatory(struct ib_device *device) +{ +#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } +#define MANDATORY_TABLE_DEPTH 19 + static const struct { + size_t offset; + char *name; + } mandatory_table[] = { + IB_MANDATORY_FUNC(query_device), + IB_MANDATORY_FUNC(query_port), + IB_MANDATORY_FUNC(query_pkey), + IB_MANDATORY_FUNC(query_gid), + IB_MANDATORY_FUNC(alloc_pd), + IB_MANDATORY_FUNC(dealloc_pd), + IB_MANDATORY_FUNC(create_ah), + IB_MANDATORY_FUNC(destroy_ah), + IB_MANDATORY_FUNC(create_qp), + IB_MANDATORY_FUNC(modify_qp), + IB_MANDATORY_FUNC(destroy_qp), + IB_MANDATORY_FUNC(post_send), + IB_MANDATORY_FUNC(post_recv), + IB_MANDATORY_FUNC(create_cq), + IB_MANDATORY_FUNC(destroy_cq), + IB_MANDATORY_FUNC(poll_cq), + IB_MANDATORY_FUNC(req_notify_cq), + IB_MANDATORY_FUNC(get_dma_mr), + IB_MANDATORY_FUNC(dereg_mr) + }; + int i; + + for (i = 0; i < MANDATORY_TABLE_DEPTH; ++i) { + if (!*(void **) ((void *) ((unsigned long)device + mandatory_table[i].offset))) { + log(LOG_WARNING, "Device %s is missing mandatory function %s\n", + device->name, mandatory_table[i].name); + return (EINVAL); + } + } + + return 0; +} + +static struct ib_device *__ib_device_get_by_name(const char *name) +{ + struct ib_device *device; + + TAILQ_FOREACH(device, &device_list, core_list) + if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) + return device; + + return NULL; +} + + +static int alloc_name(char *name) +{ + long *inuse; + char buf[IB_DEVICE_NAME_MAX]; + struct ib_device *device; + int i; + + inuse = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (!inuse) + return (ENOMEM); + + TAILQ_FOREACH(device, &device_list, core_list) { + if (!sscanf(device->name, name, &i)) + continue; + if (i < 0 || i >= PAGE_SIZE * 8) + continue; + snprintf(buf, sizeof buf, name, i); + if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + setbit(inuse, i); + } + + i = find_first_zero_bit(inuse, PAGE_SIZE * 8); + free(inuse, M_DEVBUF); + snprintf(buf, sizeof buf, name, i); + + if (__ib_device_get_by_name(buf)) + return (ENFILE); + + strlcpy(name, buf, IB_DEVICE_NAME_MAX); + return 0; +} + +static int start_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; +} + + +static int end_port(struct ib_device *device) +{ + return (device->node_type == RDMA_NODE_IB_SWITCH) ? + 0 : device->phys_port_cnt; +} + +/** + * ib_alloc_device - allocate an IB device struct + * @size:size of structure to allocate + * + * Low-level drivers should use ib_alloc_device() to allocate &struct + * ib_device. @size is the size of the structure to be allocated, + * including any private data used by the low-level driver. + * ib_dealloc_device() must be used to free structures allocated with + * ib_alloc_device(). + */ +struct ib_device *ib_alloc_device(size_t size) +{ + void *dev; + + if (size < sizeof (struct ib_device)) + panic("size=%zd < sizeof (struct ib_device)=%zd)", + size, sizeof (struct ib_device)); + + dev = malloc(size, M_DEVBUF, M_NOWAIT); + if (dev) + bzero(dev, size); + return dev; +} + +/** + * ib_dealloc_device - free an IB device struct + * @device:structure to free + * + * Free a structure allocated with ib_alloc_device(). 
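+ *
+ * @device must either never have been registered, or have been
+ * unregistered with ib_unregister_device() first; the function
+ * panics otherwise.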
+ */ +void ib_dealloc_device(struct ib_device *device) +{ + if (device->reg_state == IB_DEV_UNINITIALIZED) { + free(device, M_DEVBUF); + return; + } + + if (device->reg_state != IB_DEV_UNREGISTERED) + panic("device->reg_state=%d != IB_DEV_UNREGISTERED)", + device->reg_state); +#ifdef notyet + ib_device_unregister_sysfs(device); +#endif +} + +static int add_client_context(struct ib_device *device, struct ib_client *client) +{ + struct ib_client_data *context; + + context = malloc(sizeof *context, M_DEVBUF, M_NOWAIT); + if (!context) { + log(LOG_WARNING, "Couldn't allocate client context for %s/%s\n", + device->name, client->name); + return (ENOMEM); + } + + context->client = client; + context->data = NULL; + + mtx_lock(&device->client_data_lock); + TAILQ_INSERT_TAIL(&device->client_data_list, context, list); + mtx_unlock(&device->client_data_lock); + + return 0; +} + +static int read_port_table_lengths(struct ib_device *device) +{ + struct ib_port_attr *tprops = NULL; + int num_ports, ret = ENOMEM; + u8 port_index; + + tprops = malloc(sizeof *tprops, M_DEVBUF, M_NOWAIT); + if (!tprops) + goto out; + + num_ports = end_port(device) - start_port(device) + 1; + + device->pkey_tbl_len = malloc(sizeof *device->pkey_tbl_len * num_ports, + M_DEVBUF, M_NOWAIT); + device->gid_tbl_len = malloc(sizeof *device->gid_tbl_len * num_ports, + M_DEVBUF, M_NOWAIT); + if (!device->pkey_tbl_len || !device->gid_tbl_len) + goto err; + + for (port_index = 0; port_index < num_ports; ++port_index) { + ret = ib_query_port(device, port_index + start_port(device), + tprops); + if (ret) + goto err; + device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; + device->gid_tbl_len[port_index] = tprops->gid_tbl_len; + } + + ret = 0; + goto out; + +err: + free(device->gid_tbl_len, M_DEVBUF); + free(device->pkey_tbl_len, M_DEVBUF); +out: + free(tprops, M_DEVBUF); + return ret; +} + +/** + * ib_register_device - Register an IB device with IB core + * @device:Device to register + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device(). 
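+ *
+ * As a rough illustration (mydrv and its softc layout are hypothetical;
+ * the embedded ib_device is the first member so the pointers alias):
+ *
+ *	sc = (struct mydrv_softc *)ib_alloc_device(sizeof *sc);
+ *	if (!sc)
+ *		return (ENOMEM);
+ *	strlcpy(sc->ibdev.name, "mydrv%d", IB_DEVICE_NAME_MAX);
+ *	... set the mandatory verbs (query_device, create_qp, ...) ...
+ *	if (ib_register_device(&sc->ibdev) != 0)
+ *		ib_dealloc_device(&sc->ibdev);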
+ */ +int ib_register_device(struct ib_device *device) +{ + int ret; + + mtx_lock(&device_mutex); + + if (strchr(device->name, '%')) { + ret = alloc_name(device->name); + if (ret) + goto out; + } + + if (ib_device_check_mandatory(device)) { + ret = EINVAL; + goto out; + } + + TAILQ_INIT(&device->event_handler_list); + TAILQ_INIT(&device->client_data_list); + mtx_init(&device->event_handler_lock, "ib event handler", NULL, + MTX_DUPOK|MTX_DEF); + mtx_init(&device->client_data_lock, "ib client data", NULL, + MTX_DUPOK|MTX_DEF); + + ret = read_port_table_lengths(device); + if (ret) { + log(LOG_WARNING, "Couldn't create table lengths cache for device %s\n", + device->name); + goto out; + } + +#ifdef notyet + ret = ib_device_register_sysfs(device); + if (ret) { + log(LOG_WARNING, "Couldn't register device %s with driver model\n", + device->name); + free(device->gid_tbl_len, M_DEVBUF); + free(device->pkey_tbl_len, M_DEVBUF); + goto out; + } +#endif + + TAILQ_INSERT_TAIL(&device_list, device, core_list); + + device->reg_state = IB_DEV_REGISTERED; + + { + struct ib_client *client; + + TAILQ_FOREACH(client, &client_list, list) + if (client->add && !add_client_context(device, client)) + client->add(device); + } + + out: + mtx_unlock(&device_mutex); + return ret; +} + +/** + * ib_unregister_device - Unregister an IB device + * @device:Device to unregister + * + * Unregister an IB device. All clients will receive a remove callback. + */ +void ib_unregister_device(struct ib_device *device) +{ + struct ib_client *client; + struct ib_client_data *context, *tmp; + + mtx_lock(&device_mutex); + + TAILQ_FOREACH_REVERSE(client, &client_list, client_list_s, list) + if (client->remove) + client->remove(device); + + TAILQ_REMOVE(&device_list, device, core_list); + + free(device->gid_tbl_len, M_DEVBUF); + free(device->pkey_tbl_len, M_DEVBUF); + + mtx_unlock(&device_mutex); + + mtx_lock(&device->client_data_lock); + TAILQ_FOREACH_SAFE(context, &device->client_data_list, list, tmp) + free(context, M_DEVBUF); + mtx_unlock(&device->client_data_lock); + + device->reg_state = IB_DEV_UNREGISTERED; +} + +/** + * ib_register_client - Register an IB client + * @client:Client to register + * + * Upper level users of the IB drivers can use ib_register_client() to + * register callbacks for IB device addition and removal. When an IB + * device is added, each registered client's add method will be called + * (in the order the clients were registered), and when a device is + * removed, each client's remove method will be called (in the reverse + * order that clients were registered). In addition, when + * ib_register_client() is called, the client will receive an add + * callback for all devices already registered. + */ +int ib_register_client(struct ib_client *client) +{ + struct ib_device *device; + + mtx_lock(&device_mutex); + + TAILQ_INSERT_TAIL(&client_list, client, list); + TAILQ_FOREACH(device, &device_list, core_list) + if (client->add && !add_client_context(device, client)) + client->add(device); + + mtx_unlock(&device_mutex); + + return 0; +} + +/** + * ib_unregister_client - Unregister an IB client + * @client:Client to unregister + * + * Upper level users use ib_unregister_client() to remove their client + * registration. When ib_unregister_client() is called, the client + * will receive a remove callback for each IB device still registered. 
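+ *
+ * A minimal client sketch (the mydrv names are hypothetical); the add
+ * callback runs for every device already registered at registration
+ * time, and the remove callback runs for every device still registered
+ * at unregistration time:
+ *
+ *	static void mydrv_add_one(struct ib_device *device);
+ *	static void mydrv_remove_one(struct ib_device *device);
+ *
+ *	static struct ib_client mydrv_client = {
+ *		.name   = "mydrv",
+ *		.add    = mydrv_add_one,
+ *		.remove = mydrv_remove_one
+ *	};
+ *
+ *	ib_register_client(&mydrv_client);
+ *	...
+ *	ib_unregister_client(&mydrv_client);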
+ */ +void ib_unregister_client(struct ib_client *client) +{ + struct ib_client_data *context, *tmp; + struct ib_device *device; + + mtx_lock(&device_mutex); + + TAILQ_FOREACH(device, &device_list, core_list) { + if (client->remove) + client->remove(device); + + mtx_lock(&device->client_data_lock); + TAILQ_FOREACH_SAFE(context, &device->client_data_list, list,tmp) + if (context->client == client) { + TAILQ_REMOVE(&device->client_data_list, context, + list); + free(context, M_DEVBUF); + } + mtx_unlock(&device->client_data_lock); + } + TAILQ_REMOVE(&client_list, client, list); + + mtx_unlock(&device_mutex); +} + +/** + * ib_get_client_data - Get IB client context + * @device:Device to get context for + * @client:Client to get context for + * + * ib_get_client_data() returns client context set with + * ib_set_client_data(). + */ +void *ib_get_client_data(struct ib_device *device, struct ib_client *client) +{ + struct ib_client_data *context; + void *ret = NULL; + + mtx_lock(&device->client_data_lock); + TAILQ_FOREACH(context, &device->client_data_list, list) + if (context->client == client) { + ret = context->data; + break; + } + mtx_unlock(&device->client_data_lock); + + return ret; +} + +/** + * ib_set_client_data - Set IB client context + * @device:Device to set context for + * @client:Client to set context for + * @data:Context to set + * + * ib_set_client_data() sets client context that can be retrieved with + * ib_get_client_data(). + */ +void ib_set_client_data(struct ib_device *device, struct ib_client *client, + void *data) +{ + struct ib_client_data *context; + + mtx_lock(&device->client_data_lock); + TAILQ_FOREACH(context, &device->client_data_list, list) + if (context->client == client) { + context->data = data; + goto out; + } + + log(LOG_WARNING, "No client context found for %s/%s\n", + device->name, client->name); + +out: + mtx_unlock(&device->client_data_lock); +} + +/** + * ib_register_event_handler - Register an IB event handler + * @event_handler:Handler to register + * + * ib_register_event_handler() registers an event handler that will be + * called back when asynchronous IB events occur (as defined in + * chapter 11 of the InfiniBand Architecture Specification). This + * callback may occur in interrupt context. + */ +int ib_register_event_handler (struct ib_event_handler *event_handler) +{ + mtx_lock(&event_handler->device->event_handler_lock); + TAILQ_INSERT_TAIL(&event_handler->device->event_handler_list, + event_handler, list); + mtx_unlock(&event_handler->device->event_handler_lock); + + return 0; +} + +/** + * ib_unregister_event_handler - Unregister an event handler + * @event_handler:Handler to unregister + * + * Unregister an event handler registered with + * ib_register_event_handler(). + */ +int ib_unregister_event_handler(struct ib_event_handler *event_handler) +{ + mtx_lock(&event_handler->device->event_handler_lock); + TAILQ_REMOVE(&event_handler->device->event_handler_list, event_handler, + list); + mtx_unlock(&event_handler->device->event_handler_lock); + + return 0; +} + +/** + * ib_dispatch_event - Dispatch an asynchronous event + * @event:Event to dispatch + * + * Low-level drivers must call ib_dispatch_event() to dispatch the + * event to all registered event handlers when an asynchronous event + * occurs. 
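+ *
+ * Provider-side sketch (event type and element fields as defined in
+ * ib_verbs.h; ibdev and port are placeholders for the driver's own
+ * state):
+ *
+ *	struct ib_event event;
+ *
+ *	event.device = ibdev;
+ *	event.event = IB_EVENT_PORT_ACTIVE;
+ *	event.element.port_num = port;
+ *	ib_dispatch_event(&event);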
+ */ +void ib_dispatch_event(struct ib_event *event) +{ + struct ib_event_handler *handler; + + mtx_lock(&event->device->event_handler_lock); + + TAILQ_FOREACH(handler, &event->device->event_handler_list, list) + handler->handler(handler, event); + + mtx_unlock(&event->device->event_handler_lock); +} + +/** + * ib_query_device - Query IB device attributes + * @device:Device to query + * @device_attr:Device attributes + * + * ib_query_device() returns the attributes of a device through the + * @device_attr pointer. + */ +int ib_query_device(struct ib_device *device, + struct ib_device_attr *device_attr) +{ + return device->query_device(device, device_attr); +} + +/** + * ib_query_port - Query IB port attributes + * @device:Device to query + * @port_num:Port number to query + * @port_attr:Port attributes + * + * ib_query_port() returns the attributes of a port through the + * @port_attr pointer. + */ +int ib_query_port(struct ib_device *device, + u8 port_num, + struct ib_port_attr *port_attr) +{ + if (port_num < start_port(device) || port_num > end_port(device)) + return (EINVAL); + + return device->query_port(device, port_num, port_attr); +} + +/** + * ib_query_gid - Get GID table entry + * @device:Device to query + * @port_num:Port number to query + * @index:GID table index to query + * @gid:Returned GID + * + * ib_query_gid() fetches the specified GID table entry. + */ +int ib_query_gid(struct ib_device *device, + u8 port_num, int index, union ib_gid *gid) +{ + return device->query_gid(device, port_num, index, gid); +} + +/** + * ib_query_pkey - Get P_Key table entry + * @device:Device to query + * @port_num:Port number to query + * @index:P_Key table index to query + * @pkey:Returned P_Key + * + * ib_query_pkey() fetches the specified P_Key table entry. + */ +int ib_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey) +{ + return device->query_pkey(device, port_num, index, pkey); +} + +/** + * ib_modify_device - Change IB device attributes + * @device:Device to modify + * @device_modify_mask:Mask of attributes to change + * @device_modify:New attribute values + * + * ib_modify_device() changes a device's attributes as specified by + * the @device_modify_mask and @device_modify structure. + */ +int ib_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + return device->modify_device(device, device_modify_mask, + device_modify); +} + +/** + * ib_modify_port - Modifies the attributes for the specified port. + * @device: The device to modify. + * @port_num: The number of the port to modify. + * @port_modify_mask: Mask used to specify which attributes of the port + * to change. + * @port_modify: New attribute values for the port. + * + * ib_modify_port() changes a port's attributes as specified by the + * @port_modify_mask and @port_modify structure. + */ +int ib_modify_port(struct ib_device *device, + u8 port_num, int port_modify_mask, + struct ib_port_modify *port_modify) +{ + if (port_num < start_port(device) || port_num > end_port(device)) + return (EINVAL); + + return device->modify_port(device, port_num, port_modify_mask, + port_modify); +} + +/** + * ib_find_gid - Returns the port number and GID table index where + * a specified GID value occurs. + * @device: The device to query. + * @gid: The GID value to search for. + * @port_num: The port number of the device where the GID value was found. + * @index: The index into the GID table where the GID was found. This + * parameter may be NULL. 
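+ *
+ * Usage sketch, assuming a previously obtained gid value:
+ *
+ *	u8 port_num;
+ *	u16 index;
+ *	int ret;
+ *
+ *	ret = ib_find_gid(device, &gid, &port_num, &index);
+ *	if (ret == 0)
+ *		printf("GID is entry %d of port %d\n", index, port_num);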
+ */ +int ib_find_gid(struct ib_device *device, union ib_gid *gid, + u8 *port_num, u16 *index) +{ + union ib_gid tmp_gid; + int ret, port, i; + + for (port = start_port(device); port <= end_port(device); ++port) { + for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { + ret = ib_query_gid(device, port, i, &tmp_gid); + if (ret) + return ret; + if (!memcmp(&tmp_gid, gid, sizeof *gid)) { + *port_num = port; + if (index) + *index = i; + return 0; + } + } + } + + return (ENOENT); +} + +/** + * ib_find_pkey - Returns the PKey table index where a specified + * PKey value occurs. + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the PKey table where the PKey was found. + */ +int ib_find_pkey(struct ib_device *device, + u8 port_num, u16 pkey, u16 *index) +{ + int ret, i; + u16 tmp_pkey; + + for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { + ret = ib_query_pkey(device, port_num, i, &tmp_pkey); + if (ret) + return ret; + + if (pkey == tmp_pkey) { + *index = i; + return 0; + } + } + + return (ENOENT); +} + +static int rdma_core_init(void) +{ + int ret; +#ifdef notyet + ret = ib_sysfs_setup(); + if (ret) + log(LOG_WARNING, "Couldn't create InfiniBand device class\n"); +#endif + + mtx_init(&device_mutex, "rdma_device mutex", NULL, MTX_DEF); + TAILQ_INIT(&client_list); + TAILQ_INIT(&device_list); + ret = ib_cache_setup(); + if (ret) { + log(LOG_WARNING, "Couldn't set up InfiniBand P_Key/GID cache\n"); +#ifdef notyet + ib_sysfs_cleanup(); +#endif + } + + return ret; +} + +static void rdma_core_cleanup(void) +{ + ib_cache_cleanup(); +#ifdef notyet + ib_sysfs_cleanup(); + /* Make sure that any pending umem accounting work is done. */ + flush_scheduled_work(); +#endif +} + +static int +rdma_core_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + printf("Loading rdma_core.\n"); + rdma_core_init(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("Unloading rdma_core.\n"); + rdma_core_cleanup(); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data = { + "rdma_core", + rdma_core_load, + 0 +}; + +MODULE_VERSION(rdma_core, 1); +DECLARE_MODULE(rdma_core, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/contrib/rdma/rdma_iwcm.c b/sys/contrib/rdma/rdma_iwcm.c new file mode 100644 index 000000000000..916abcd2dfd9 --- /dev/null +++ b/sys/contrib/rdma/rdma_iwcm.c @@ -0,0 +1,1086 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/module.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/priv.h> +#include <sys/syslog.h> +#include <sys/malloc.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> + +#include <contrib/rdma/iw_cm.h> + +enum iw_cm_state { + IW_CM_STATE_IDLE, /* unbound, inactive */ + IW_CM_STATE_LISTEN, /* listen waiting for connect */ + IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */ + IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */ + IW_CM_STATE_ESTABLISHED, /* established */ + IW_CM_STATE_CLOSING, /* disconnect */ + IW_CM_STATE_DESTROYING /* object being deleted */ +}; + +struct iwcm_id_private { + struct iw_cm_id id; + enum iw_cm_state state; + unsigned long flags; + struct ib_qp *qp; + void * destroy_comp; + void * connect_wait; + TAILQ_HEAD(, iwcm_work) work_list; + struct mtx lock; + volatile int refcount; + TAILQ_HEAD(, iwcm_work) work_free_list; +}; + +#define IWCM_F_CALLBACK_DESTROY 1 +#define IWCM_F_CONNECT_WAIT 2 + +static struct taskqueue *iwcm_wq; +struct iwcm_work { + struct task task; + struct iwcm_id_private *cm_id; + TAILQ_ENTRY(iwcm_work) list; + struct iw_cm_event event; + TAILQ_ENTRY(iwcm_work) free_list; +}; + +/* + * The following services provide a mechanism for pre-allocating iwcm_work + * elements. The design pre-allocates them based on the cm_id type: + * LISTENING IDS: Get enough elements preallocated to handle the + * listen backlog. + * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE + * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE + * + * Allocating them in connect and listen avoids having to deal + * with allocation failures on the event upcall from the provider (which + * is called in the interrupt context). + * + * One exception is when creating the cm_id for incoming connection requests. + * There are two cases: + * 1) in the event upcall, cm_event_handler(), for a listening cm_id. 
If + * the backlog is exceeded, then no more connection request events will + * be processed. cm_event_handler() returns ENOMEM in this case. It's up + * to the provider to reject the connection request. + * 2) in the connection request workqueue handler, cm_conn_req_handler(). + * If work elements cannot be allocated for the new connect request cm_id, + * then IWCM will call the provider reject method. This is ok since + * cm_conn_req_handler() runs in the workqueue thread context. + */ + +static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) +{ + struct iwcm_work *work; + + if (TAILQ_EMPTY(&cm_id_priv->work_free_list)) + return NULL; + work = TAILQ_FIRST(&cm_id_priv->work_free_list); + TAILQ_REMOVE(&cm_id_priv->work_free_list, work, free_list); + return work; +} + +static void put_work(struct iwcm_work *work) +{ + TAILQ_INSERT_HEAD(&work->cm_id->work_free_list, work, free_list); +} + +static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) +{ + struct iwcm_work *e, *tmp; + + TAILQ_FOREACH_SAFE(e, &cm_id_priv->work_free_list, free_list, tmp) + free(e, M_DEVBUF); +} + +static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) +{ + struct iwcm_work *work; + + PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_free_list)); + while (count--) { + work = malloc(sizeof(struct iwcm_work), M_DEVBUF, M_NOWAIT); + if (!work) { + dealloc_work_entries(cm_id_priv); + return (ENOMEM); + } + work->cm_id = cm_id_priv; + put_work(work); + } + return 0; +} + +/* + * Save private data from incoming connection requests to + * iw_cm_event, so the low-level driver doesn't have to. Adjust + * the event ptr to point to the local copy. + */ +static int copy_private_data(struct iw_cm_event *event) +{ + void *p; + + p = malloc(event->private_data_len, M_DEVBUF, M_NOWAIT); + if (!p) + return (ENOMEM); + bcopy(event->private_data, p, event->private_data_len); + event->private_data = p; + return 0; +} + +static void free_cm_id(struct iwcm_id_private *cm_id_priv) +{ + dealloc_work_entries(cm_id_priv); + free(cm_id_priv, M_DEVBUF); +} + +/* + * Release a reference on cm_id. If the last reference is being + * released, enable the waiting thread (in iw_destroy_cm_id) to + * get woken up, and return 1 if a thread is already waiting.
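+ *
+ * A provider that stores the cm_id pointer in its own state pins it
+ * through the add_ref/rem_ref methods installed by iw_create_cm_id()
+ * (sketch; ep is a hypothetical provider endpoint structure):
+ *
+ *	cm_id->add_ref(cm_id);
+ *	ep->cm_id = cm_id;
+ *	...
+ *	ep->cm_id = NULL;
+ *	cm_id->rem_ref(cm_id);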
+ */ +static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv) +{ + mtx_lock(&cm_id_priv->lock); + PANIC_IF(atomic_load_acq_int(&cm_id_priv->refcount)==0); + if (atomic_fetchadd_int(&cm_id_priv->refcount, -1) == 1) { + PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_list)); + wakeup(&cm_id_priv->destroy_comp); + mtx_unlock(&cm_id_priv->lock); + return 1; + } + mtx_unlock(&cm_id_priv->lock); + + return 0; +} + +static void add_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + mtx_lock(&cm_id_priv->lock); + atomic_add_int(&cm_id_priv->refcount, 1); + mtx_unlock(&cm_id_priv->lock); +} + +static void rem_ref(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + if (iwcm_deref_id(cm_id_priv) && + isset(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY)) { + PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_list)); + free_cm_id(cm_id_priv); + } +} + +static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); + +struct iw_cm_id *iw_create_cm_id(struct ib_device *device, + struct socket *so, + iw_cm_handler cm_handler, + void *context) +{ + struct iwcm_id_private *cm_id_priv; + + KASSERT(so, ("iw_create_cm_id called with NULL socket!")); + cm_id_priv = malloc(sizeof(*cm_id_priv), M_DEVBUF, M_NOWAIT); + if (!cm_id_priv) + return ERR_PTR(ENOMEM); + bzero(cm_id_priv, sizeof *cm_id_priv); + + cm_id_priv->state = IW_CM_STATE_IDLE; + cm_id_priv->id.device = device; + cm_id_priv->id.cm_handler = cm_handler; + cm_id_priv->id.context = context; + cm_id_priv->id.event_handler = cm_event_handler; + cm_id_priv->id.add_ref = add_ref; + cm_id_priv->id.rem_ref = rem_ref; + cm_id_priv->id.so = so; + mtx_init(&cm_id_priv->lock, "cm_id_priv", NULL, MTX_DUPOK|MTX_DEF); + atomic_store_rel_int(&cm_id_priv->refcount, 1); + TAILQ_INIT(&cm_id_priv->work_list); + TAILQ_INIT(&cm_id_priv->work_free_list); + + return &cm_id_priv->id; +} + + +static int iwcm_modify_qp_err(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + if (!qp) + return (EINVAL); + + qp_attr.qp_state = IB_QPS_ERR; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * This is really the RDMAC CLOSING state. It is most similar to the + * IB SQD QP state. + */ +static int iwcm_modify_qp_sqd(struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + + PANIC_IF(qp == NULL); + qp_attr.qp_state = IB_QPS_SQD; + return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); +} + +/* + * CM_ID <-- CLOSING + * + * Block if a passive or active connection is currently being processed. 
Then + * process the event as follows: + * - If we are ESTABLISHED, move to CLOSING and modify the QP state + * based on the abrupt flag + * - If the connection is already in the CLOSING or IDLE state, the peer is + * disconnecting concurrently with us and we've already seen the + * DISCONNECT event -- ignore the request and return 0 + * - Disconnect on a listening endpoint returns EINVAL + */ +int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt) +{ + struct iwcm_id_private *cm_id_priv; + int ret = 0; + struct ib_qp *qp = NULL; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* Wait if we're currently in a connect or accept downcall */ + mtx_lock(&cm_id_priv->lock); + if (isset(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT)) + msleep(&cm_id_priv->connect_wait, &cm_id_priv->lock, 0, "iwcm connect1", 0); + + switch (cm_id_priv->state) { + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_CLOSING; + + /* QP could be <nul> for user-mode client */ + if (cm_id_priv->qp) + qp = cm_id_priv->qp; + else + ret = EINVAL; + break; + case IW_CM_STATE_LISTEN: + ret = EINVAL; + break; + case IW_CM_STATE_CLOSING: + /* remote peer closed first */ + case IW_CM_STATE_IDLE: + /* accept or connect returned !0 */ + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called disconnect before/without calling accept after + * connect_request event delivered. + */ + break; + case IW_CM_STATE_CONN_SENT: + /* Can only get here if wait above fails */ + default: + panic("just cuz"); + } + mtx_unlock(&cm_id_priv->lock); + + if (qp) { + if (abrupt) + ret = iwcm_modify_qp_err(qp); + else + ret = iwcm_modify_qp_sqd(qp); + + /* + * If both sides are disconnecting the QP could + * already be in ERR or SQD states + */ + ret = 0; + } + + return ret; +} + +/* + * CM_ID <-- DESTROYING + * + * Clean up all resources associated with the connection and release + * the initial reference taken by iw_create_cm_id. + */ +static void destroy_cm_id(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + /* + * Wait if we're currently in a connect or accept downcall. A + * listening endpoint should never block here. + */ + mtx_lock(&cm_id_priv->lock); + if (isset(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT)) + msleep(&cm_id_priv->connect_wait, &cm_id_priv->lock, 0, "iwcm connect2", 0); + + switch (cm_id_priv->state) { + case IW_CM_STATE_LISTEN: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + mtx_unlock(&cm_id_priv->lock); + /* destroy the listening endpoint */ + ret = cm_id->device->iwcm->destroy_listen(cm_id); + mtx_lock(&cm_id_priv->lock); + break; + case IW_CM_STATE_ESTABLISHED: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + mtx_unlock(&cm_id_priv->lock); + /* Abrupt close of the connection */ + (void)iwcm_modify_qp_err(cm_id_priv->qp); + mtx_lock(&cm_id_priv->lock); + break; + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CLOSING: + cm_id_priv->state = IW_CM_STATE_DESTROYING; + break; + case IW_CM_STATE_CONN_RECV: + /* + * App called destroy before/without calling accept after + * receiving connection request event notification or + * returned non zero from the event callback function. + * In either case, must tell the provider to reject. 
+ */ + cm_id_priv->state = IW_CM_STATE_DESTROYING; + break; + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_DESTROYING: + default: + panic("just cuz"); + break; + } + if (cm_id_priv->qp) { + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->qp = NULL; + } + mtx_unlock(&cm_id_priv->lock); + + (void)iwcm_deref_id(cm_id_priv); +} + +/* + * This function is only called by the application thread and cannot + * be called by the event thread. The function will wait for all + * references to be released on the cm_id and then free the cm_id + * object. + */ +void iw_destroy_cm_id(struct iw_cm_id *cm_id) +{ + struct iwcm_id_private *cm_id_priv; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + PANIC_IF(isset(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY)); + + destroy_cm_id(cm_id); + + mtx_lock(&cm_id_priv->lock); + if (atomic_load_acq_int(&cm_id_priv->refcount)) + msleep(&cm_id_priv->destroy_comp, &cm_id_priv->lock, 0, "iwcm destroy", 0); + mtx_unlock(&cm_id_priv->lock); + + free_cm_id(cm_id_priv); +} + +/* + * CM_ID <-- LISTEN + * + * Start listening for connect requests. Generates one CONNECT_REQUEST + * event for each inbound connect request. + */ +int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + ret = alloc_work_entries(cm_id_priv, backlog); + if (ret) + return ret; + + mtx_lock(&cm_id_priv->lock); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + cm_id_priv->state = IW_CM_STATE_LISTEN; + mtx_unlock(&cm_id_priv->lock); + ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + if (ret) + cm_id_priv->state = IW_CM_STATE_IDLE; + mtx_lock(&cm_id_priv->lock); + break; + default: + ret = EINVAL; + } + mtx_unlock(&cm_id_priv->lock); + + return ret; +} + +/* + * CM_ID <-- IDLE + * + * Rejects an inbound connection request. No events are generated. + */ +int iw_cm_reject(struct iw_cm_id *cm_id, + const void *private_data, + u8 private_data_len) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + setbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + + mtx_lock(&cm_id_priv->lock); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + return (EINVAL); + } + cm_id_priv->state = IW_CM_STATE_IDLE; + mtx_unlock(&cm_id_priv->lock); + + ret = cm_id->device->iwcm->reject(cm_id, private_data, + private_data_len); + + mtx_lock(&cm_id_priv->lock); + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + + return ret; +} + +/* + * CM_ID <-- ESTABLISHED + * + * Accepts an inbound connection request and generates an ESTABLISHED + * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block + * until the ESTABLISHED event is received from the provider. 
+ */ +int iw_cm_accept(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + struct ib_qp *qp; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + setbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + + mtx_lock(&cm_id_priv->lock); + if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + + return (EINVAL); + } + /* Get the ib_qp given the QPN */ + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + mtx_unlock(&cm_id_priv->lock); + return (EINVAL); + } + cm_id->device->iwcm->add_ref(qp); + cm_id_priv->qp = qp; + mtx_unlock(&cm_id_priv->lock); + + ret = cm_id->device->iwcm->accept(cm_id, iw_param); + if (ret) { + /* An error on accept precludes provider events */ + PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_RECV); + cm_id_priv->state = IW_CM_STATE_IDLE; + mtx_lock(&cm_id_priv->lock); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + } + + return ret; +} + +/* + * Active Side: CM_ID <-- CONN_SENT + * + * If successful, results in the generation of a CONNECT_REPLY + * event. iw_cm_disconnect and iw_cm_destroy will block until the + * CONNECT_REPLY event is received from the provider. + */ +int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + struct ib_qp *qp; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + ret = alloc_work_entries(cm_id_priv, 4); + if (ret) + return ret; + + setbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + mtx_lock(&cm_id_priv->lock); + + if (cm_id_priv->state != IW_CM_STATE_IDLE) { + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + + return (EINVAL); + } + + /* Get the ib_qp given the QPN */ + qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + if (!qp) { + mtx_unlock(&cm_id_priv->lock); + return (EINVAL); + } + cm_id->device->iwcm->add_ref(qp); + cm_id_priv->qp = qp; + cm_id_priv->state = IW_CM_STATE_CONN_SENT; + mtx_unlock(&cm_id_priv->lock); + + ret = cm_id->device->iwcm->connect(cm_id, iw_param); + if (ret) { + mtx_lock(&cm_id_priv->lock); + if (cm_id_priv->qp) { + cm_id->device->iwcm->rem_ref(qp); + cm_id_priv->qp = NULL; + } + PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_SENT); + cm_id_priv->state = IW_CM_STATE_IDLE; + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + + } + + return ret; +} + +/* + * Passive Side: new CM_ID <-- CONN_RECV + * + * Handles an inbound connect request. The function creates a new + * iw_cm_id to represent the new connection and inherits the client + * callback function and other attributes from the listening parent. + * + * The work item contains a pointer to the listen_cm_id and the event. The + * listen_cm_id contains the client cm_handler, context and + * device. These are copied when the device is cloned. The event + * contains the new four tuple. + * + * An error on the child should not affect the parent, so this + * function does not return a value. 
+ */ +static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, + struct iw_cm_event *iw_event) +{ + struct iw_cm_id *cm_id; + struct iwcm_id_private *cm_id_priv; + int ret; + + /* + * The provider should never generate a connection request + * event with a bad status. + */ + PANIC_IF(iw_event->status); + + /* + * We could be destroying the listening id. If so, ignore this + * upcall. + */ + mtx_lock(&listen_id_priv->lock); + if (listen_id_priv->state != IW_CM_STATE_LISTEN) { + mtx_unlock(&listen_id_priv->lock); + goto out; + } + mtx_unlock(&listen_id_priv->lock); + + cm_id = iw_create_cm_id(listen_id_priv->id.device, + iw_event->so, + listen_id_priv->id.cm_handler, + listen_id_priv->id.context); + /* If the cm_id could not be created, ignore the request */ + if (IS_ERR(cm_id)) + goto out; + + cm_id->provider_data = iw_event->provider_data; + cm_id->local_addr = iw_event->local_addr; + cm_id->remote_addr = iw_event->remote_addr; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + cm_id_priv->state = IW_CM_STATE_CONN_RECV; + + ret = alloc_work_entries(cm_id_priv, 3); + if (ret) { + iw_cm_reject(cm_id, NULL, 0); + iw_destroy_cm_id(cm_id); + goto out; + } + + /* Call the client CM handler */ + ret = cm_id->cm_handler(cm_id, iw_event); + if (ret) { + iw_cm_reject(cm_id, NULL, 0); + setbit(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY); + + destroy_cm_id(cm_id); + if (atomic_load_acq_int(&cm_id_priv->refcount)==0) + free_cm_id(cm_id_priv); + } + +out: + if (iw_event->private_data_len) + free(iw_event->private_data, M_DEVBUF); +} + +/* + * Passive Side: CM_ID <-- ESTABLISHED + * + * The provider generated an ESTABLISHED event which means that + * the MPA negotiation has completed successfully and we are now in MPA + * FPDU mode. + * + * This event can only be received in the CONN_RECV state. If the + * remote peer closed, the ESTABLISHED event would be received followed + * by the CLOSE event. If the app closes, it will block until we wake + * it up after processing this event. + */ +static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + int ret; + + mtx_lock(&cm_id_priv->lock); + + /* + * We clear the CONNECT_WAIT bit here to allow the callback + * function to call iw_cm_disconnect. Calling iw_destroy_cm_id + * from a callback handler is not allowed. + */ + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_RECV); + cm_id_priv->state = IW_CM_STATE_ESTABLISHED; + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + + return ret; +} + +/* + * Active Side: CM_ID <-- ESTABLISHED + * + * The app has called connect and is waiting for the established event to + * post its requests to the server. This event will wake up anyone + * blocked in iw_cm_disconnect or iw_destroy_cm_id.
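+ *
+ * Active-side sketch (qpn, pdata and plen are placeholders); the
+ * outcome arrives as an IW_CM_EVENT_CONNECT_REPLY handled by
+ * cm_conn_rep_handler() below:
+ *
+ *	struct iw_cm_conn_param param;
+ *
+ *	param.qpn = qpn;
+ *	param.private_data = pdata;
+ *	param.private_data_len = plen;
+ *	ret = iw_cm_connect(cm_id, &param);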
+ */ +static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + int ret; + + mtx_lock(&cm_id_priv->lock); + /* + * Clear the connect wait bit so a callback function calling + * iw_cm_disconnect will not wait and deadlock this thread. + */ + clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT); + PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_SENT); + if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) { + cm_id_priv->id.local_addr = iw_event->local_addr; + cm_id_priv->id.remote_addr = iw_event->remote_addr; + cm_id_priv->state = IW_CM_STATE_ESTABLISHED; + } else { + /* REJECTED or RESET */ + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->qp = NULL; + cm_id_priv->state = IW_CM_STATE_IDLE; + } + mtx_unlock(&cm_id_priv->lock); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + + mtx_lock(&cm_id_priv->lock); + if (iw_event->private_data_len) + free(iw_event->private_data, M_DEVBUF); + + /* Wake up waiters on connect complete */ + wakeup(&cm_id_priv->connect_wait); + mtx_unlock(&cm_id_priv->lock); + + return ret; +} + +/* + * CM_ID <-- CLOSING + * + * If in the ESTABLISHED state, move to CLOSING. + */ +static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + + mtx_lock(&cm_id_priv->lock); + if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) + cm_id_priv->state = IW_CM_STATE_CLOSING; + mtx_unlock(&cm_id_priv->lock); +} + +/* + * CM_ID <-- IDLE + * + * If in the ESTABLISHED or CLOSING states, the QP will have been + * moved by the provider to the ERR state. Disassociate the CM_ID from + * the QP, move to IDLE, and remove the 'connected' reference. + * + * If in some other state, the cm_id was destroyed asynchronously. + * This is the last reference that will result in waking up + * the app thread blocked in iw_destroy_cm_id. + */ +static int cm_close_handler(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + int ret = 0; + mtx_lock(&cm_id_priv->lock); + + if (cm_id_priv->qp) { + cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->qp = NULL; + } + switch (cm_id_priv->state) { + case IW_CM_STATE_ESTABLISHED: + case IW_CM_STATE_CLOSING: + cm_id_priv->state = IW_CM_STATE_IDLE; + mtx_unlock(&cm_id_priv->lock); + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); + mtx_lock(&cm_id_priv->lock); + break; + case IW_CM_STATE_DESTROYING: + break; + default: + panic("just cuz"); + } + mtx_unlock(&cm_id_priv->lock); + + return ret; +} + +static int process_event(struct iwcm_id_private *cm_id_priv, + struct iw_cm_event *iw_event) +{ + int ret = 0; + + switch (iw_event->event) { + case IW_CM_EVENT_CONNECT_REQUEST: + cm_conn_req_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_CONNECT_REPLY: + ret = cm_conn_rep_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_ESTABLISHED: + ret = cm_conn_est_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_DISCONNECT: + cm_disconnect_handler(cm_id_priv, iw_event); + break; + case IW_CM_EVENT_CLOSE: + ret = cm_close_handler(cm_id_priv, iw_event); + break; + default: + panic("just cuz"); + } + + return ret; +} + +/* + * Process events on the work_list for the cm_id. If the callback + * function requests that the cm_id be deleted, a flag is set in the + * cm_id flags to indicate that when the last reference is + * removed, the cm_id is to be destroyed.
This is necessary to + * distinguish between an object that will be destroyed by the app + * thread asleep on the destroy_comp list vs. an object destroyed + * here synchronously when the last reference is removed. + */ +static void cm_work_handler(void *context, int pending) +{ + struct iwcm_work *work = context; + struct iw_cm_event levent; + struct iwcm_id_private *cm_id_priv = work->cm_id; + int empty; + int ret = 0; + + mtx_lock(&cm_id_priv->lock); + empty = TAILQ_EMPTY(&cm_id_priv->work_list); + while (!empty) { + work = TAILQ_FIRST(&cm_id_priv->work_list); + TAILQ_REMOVE(&cm_id_priv->work_list, work, list); + empty = TAILQ_EMPTY(&cm_id_priv->work_list); + levent = work->event; + put_work(work); + mtx_unlock(&cm_id_priv->lock); + + ret = process_event(cm_id_priv, &levent); + if (ret) { + setbit(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY); + destroy_cm_id(&cm_id_priv->id); + } + PANIC_IF(atomic_load_acq_int(&cm_id_priv->refcount)==0); + if (iwcm_deref_id(cm_id_priv)) { + if (isset(&cm_id_priv->flags, + IWCM_F_CALLBACK_DESTROY)) { + PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_list)); + free_cm_id(cm_id_priv); + } + return; + } + mtx_lock(&cm_id_priv->lock); + } + mtx_unlock(&cm_id_priv->lock); +} + +/* + * This function is called on interrupt context. Schedule events on + * the iwcm_wq thread to allow callback functions to downcall into + * the CM and/or block. Events are queued to a per-CM_ID + * work_list. If this is the first event on the work_list, the work + * element is also queued on the iwcm_wq thread. + * + * Each event holds a reference on the cm_id. Until the last posted + * event has been delivered and processed, the cm_id cannot be + * deleted. + * + * Returns: + * 0 - the event was handled. + * ENOMEM - the event was not handled due to lack of resources. 
+ */ +static int cm_event_handler(struct iw_cm_id *cm_id, + struct iw_cm_event *iw_event) +{ + struct iwcm_work *work; + struct iwcm_id_private *cm_id_priv; + int ret = 0; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + + mtx_lock(&cm_id_priv->lock); + work = get_work(cm_id_priv); + if (!work) { + ret = ENOMEM; + goto out; + } + + TASK_INIT(&work->task, 0, cm_work_handler, work); + work->cm_id = cm_id_priv; + work->event = *iw_event; + + if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || + work->event.event == IW_CM_EVENT_CONNECT_REPLY) && + work->event.private_data_len) { + ret = copy_private_data(&work->event); + if (ret) { + put_work(work); + goto out; + } + } + + atomic_add_acq_int(&cm_id_priv->refcount, 1); + if (TAILQ_EMPTY(&cm_id_priv->work_list)) { + TAILQ_INSERT_TAIL(&cm_id_priv->work_list, work, list); + taskqueue_enqueue(iwcm_wq, &work->task); + } else + TAILQ_INSERT_TAIL(&cm_id_priv->work_list, work, list); +out: + mtx_unlock(&cm_id_priv->lock); + return ret; +} + +static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + int ret; + + mtx_lock(&cm_id_priv->lock); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_READ; + ret = 0; + break; + default: + ret = EINVAL; + break; + } + mtx_unlock(&cm_id_priv->lock); + return ret; +} + +static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + int ret; + + mtx_lock(&cm_id_priv->lock); + switch (cm_id_priv->state) { + case IW_CM_STATE_IDLE: + case IW_CM_STATE_CONN_SENT: + case IW_CM_STATE_CONN_RECV: + case IW_CM_STATE_ESTABLISHED: + *qp_attr_mask = 0; + ret = 0; + break; + default: + ret = EINVAL; + break; + } + mtx_unlock(&cm_id_priv->lock); + return ret; +} + +int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, + struct ib_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct iwcm_id_private *cm_id_priv; + int ret; + + cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + switch (qp_attr->qp_state) { + case IB_QPS_INIT: + case IB_QPS_RTR: + ret = iwcm_init_qp_init_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + case IB_QPS_RTS: + ret = iwcm_init_qp_rts_attr(cm_id_priv, + qp_attr, qp_attr_mask); + break; + default: + ret = EINVAL; + break; + } + return ret; +} + +static int iw_cm_init(void) +{ + iwcm_wq = taskqueue_create("iw_cm_wq", M_NOWAIT, taskqueue_thread_enqueue, &iwcm_wq); + if (!iwcm_wq) + return (ENOMEM); + + taskqueue_start_threads(&iwcm_wq, 1, PI_NET, "iw_cm_wq thread"); + return 0; +} + +static void iw_cm_cleanup(void) +{ + taskqueue_free(iwcm_wq); +} + +static int +iw_cm_load(module_t mod, int cmd, void *arg) +{ + int err = 0; + + switch (cmd) { + case MOD_LOAD: + printf("Loading rdma_iwcm.\n"); + + iw_cm_init(); + break; + case MOD_QUIESCE: + break; + case MOD_UNLOAD: + printf("Unloading rdma_iwcm.\n"); + iw_cm_cleanup(); + break; + case MOD_SHUTDOWN: + break; + default: + err = EOPNOTSUPP; + break; + } + + return (err); +} + +static moduledata_t mod_data = { + "rdma_iwcm", + iw_cm_load, + 0 +}; + +MODULE_VERSION(rdma_iwcm, 1); +MODULE_DEPEND(rdma_iwcm, rdma_core, 1, 1, 1); +DECLARE_MODULE(rdma_iwcm, mod_data, SI_SUB_EXEC, SI_ORDER_ANY); diff --git a/sys/contrib/rdma/rdma_user_cm.h 
b/sys/contrib/rdma/rdma_user_cm.h new file mode 100644 index 000000000000..0ffa4e5b3c24 --- /dev/null +++ b/sys/contrib/rdma/rdma_user_cm.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $FreeBSD$ + */ + +#ifndef RDMA_USER_CM_H +#define RDMA_USER_CM_H + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_user_sa.h> + +#define RDMA_USER_CM_ABI_VERSION 4 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + RDMA_USER_CM_CMD_CREATE_ID, + RDMA_USER_CM_CMD_DESTROY_ID, + RDMA_USER_CM_CMD_BIND_ADDR, + RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_RESOLVE_ROUTE, + RDMA_USER_CM_CMD_QUERY_ROUTE, + RDMA_USER_CM_CMD_CONNECT, + RDMA_USER_CM_CMD_LISTEN, + RDMA_USER_CM_CMD_ACCEPT, + RDMA_USER_CM_CMD_REJECT, + RDMA_USER_CM_CMD_DISCONNECT, + RDMA_USER_CM_CMD_INIT_QP_ATTR, + RDMA_USER_CM_CMD_GET_EVENT, + RDMA_USER_CM_CMD_GET_OPTION, + RDMA_USER_CM_CMD_SET_OPTION, + RDMA_USER_CM_CMD_NOTIFY, + RDMA_USER_CM_CMD_JOIN_MCAST, + RDMA_USER_CM_CMD_LEAVE_MCAST +}; + +/* + * command ABI structures. 
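+ *
+ * In the ABI these structures mirror (the Linux rdma_ucm write
+ * interface), each request begins with struct rdma_ucm_cmd_hdr giving
+ * the opcode and the sizes of the inbound payload and of the expected
+ * response. A hypothetical userspace sketch for
+ * RDMA_USER_CM_CMD_CREATE_ID:
+ *
+ *	struct {
+ *		struct rdma_ucm_cmd_hdr hdr;
+ *		struct rdma_ucm_create_id cmd;
+ *	} req;
+ *
+ *	req.hdr.cmd = RDMA_USER_CM_CMD_CREATE_ID;
+ *	req.hdr.in = sizeof req.cmd;
+ *	req.hdr.out = sizeof(struct rdma_ucm_create_id_resp);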
+ */ +struct rdma_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct rdma_ucm_create_id { + __u64 uid; + __u64 response; + __u16 ps; + __u8 reserved[6]; +}; + +struct rdma_ucm_create_id_resp { + __u32 id; +}; + +struct rdma_ucm_destroy_id { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct rdma_ucm_bind_addr { + __u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct rdma_ucm_resolve_addr { + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct rdma_ucm_resolve_route { + __u32 id; + __u32 timeout_ms; +}; + +struct rdma_ucm_query_route { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_query_route_resp { + __u64 node_guid; + struct ib_user_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct rdma_ucm_conn_param { + __u32 qp_num; + __u32 reserved; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct rdma_ucm_ud_param { + __u32 qp_num; + __u32 qkey; + struct ib_uverbs_ah_attr ah_attr; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 reserved[7]; +}; + +struct rdma_ucm_connect { + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_listen { + __u32 id; + __u32 backlog; +}; + +struct rdma_ucm_accept { + __u64 uid; + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_reject { + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct rdma_ucm_disconnect { + __u32 id; +}; + +struct rdma_ucm_init_qp_attr { + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct rdma_ucm_notify { + __u32 id; + __u32 event; +}; + +struct rdma_ucm_join_mcast { + __u64 response; /* rdma_ucm_create_id_resp */ + __u64 uid; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct rdma_ucm_get_event { + __u64 response; +}; + +struct rdma_ucm_event_resp { + __u64 uid; + __u32 id; + __u32 event; + __u32 status; + union { + struct rdma_ucm_conn_param conn; + struct rdma_ucm_ud_param ud; + } param; +}; + +#endif /* RDMA_USER_CM_H */ diff --git a/sys/contrib/rdma/rdma_verbs.c b/sys/contrib/rdma/rdma_verbs.c new file mode 100644 index 000000000000..93821074b3c1 --- /dev/null +++ b/sys/contrib/rdma/rdma_verbs.c @@ -0,0 +1,822 @@ +/* + * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2004 Infinicon Corporation. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/libkern.h> +#include <sys/module.h> +#include <sys/endian.h> + +#include <contrib/rdma/ib_verbs.h> +#include <contrib/rdma/ib_cache.h> + +int ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + default: return -1; + } +} + +enum ib_rate mult_to_ib_rate(int mult) +{ + switch (mult) { + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} + +enum rdma_transport_type +rdma_node_get_transport(enum rdma_node_type node_type) +{ + switch (node_type) { + case RDMA_NODE_IB_CA: + case RDMA_NODE_IB_SWITCH: + case RDMA_NODE_IB_ROUTER: + return RDMA_TRANSPORT_IB; + case RDMA_NODE_RNIC: + return RDMA_TRANSPORT_IWARP; + default: + panic("bad condition"); + return 0; + } +} + +/* Protection domains */ + +struct ib_pd *ib_alloc_pd(struct ib_device *device) +{ + struct ib_pd *pd; + + pd = device->alloc_pd(device, NULL, NULL); + + if (!IS_ERR(pd)) { + pd->device = device; + pd->uobject = NULL; + atomic_store_rel_int(&pd->usecnt, 0); + } + + return pd; +} + +int ib_dealloc_pd(struct ib_pd *pd) +{ + if (atomic_load_acq_int(&pd->usecnt)) + return (EBUSY); + + return pd->device->dealloc_pd(pd); +} + +/* Address handles */ + +struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + struct ib_ah *ah; + + ah = pd->device->create_ah(pd, ah_attr); + + if (!IS_ERR(ah)) { + ah->device = pd->device; + ah->pd = pd; + ah->uobject = NULL; + atomic_add_acq_int(&pd->usecnt, 1); + } + + return ah; +} + +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc 
*wc, + struct ib_grh *grh, struct ib_ah_attr *ah_attr) +{ + u32 flow_class; + u16 gid_index; + int ret; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; + + if (wc->wc_flags & IB_WC_GRH) { + ah_attr->ah_flags = IB_AH_GRH; + ah_attr->grh.dgid = grh->sgid; + + ret = ib_find_cached_gid(device, &grh->dgid, &port_num, + &gid_index); + if (ret) + return ret; + + ah_attr->grh.sgid_index = (u8) gid_index; + flow_class = be32toh(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = 0xFF; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + } + return 0; +} + +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, + struct ib_grh *grh, u8 port_num) +{ + struct ib_ah_attr ah_attr; + int ret; + + ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + if (ret) + return ERR_PTR(ret); + + return ib_create_ah(pd, &ah_attr); +} + +int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->modify_ah ? + ah->device->modify_ah(ah, ah_attr) : + ENOSYS; +} + +int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + return ah->device->query_ah ? + ah->device->query_ah(ah, ah_attr) : + ENOSYS; +} + +int ib_destroy_ah(struct ib_ah *ah) +{ + struct ib_pd *pd; + int ret; + + pd = ah->pd; + ret = ah->device->destroy_ah(ah); + if (!ret) + atomic_subtract_acq_int(&pd->usecnt, 1); + + return ret; +} + +/* Shared receive queues */ + +struct ib_srq *ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr) +{ + struct ib_srq *srq; + + if (!pd->device->create_srq) + return ERR_PTR(ENOSYS); + + srq = pd->device->create_srq(pd, srq_init_attr, NULL); + + if (!IS_ERR(srq)) { + srq->device = pd->device; + srq->pd = pd; + srq->uobject = NULL; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + atomic_add_acq_int(&pd->usecnt, 1); + atomic_store_rel_int(&srq->usecnt, 0); + } + + return srq; +} + +int ib_modify_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask) +{ + return srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL); +} + +int ib_query_srq(struct ib_srq *srq, + struct ib_srq_attr *srq_attr) +{ + return srq->device->query_srq ? 
+ srq->device->query_srq(srq, srq_attr) : ENOSYS; +} + +int ib_destroy_srq(struct ib_srq *srq) +{ + struct ib_pd *pd; + int ret; + + if (atomic_load_acq_int(&srq->usecnt)) + return (EBUSY); + + pd = srq->pd; + + ret = srq->device->destroy_srq(srq); + if (!ret) + atomic_subtract_acq_int(&pd->usecnt, 1); + + return ret; +} + +/* Queue pairs */ + +struct ib_qp *ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *qp; + + qp = pd->device->create_qp(pd, qp_init_attr, NULL); + + if (!IS_ERR(qp)) { + qp->device = pd->device; + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->recv_cq = qp_init_attr->recv_cq; + qp->srq = qp_init_attr->srq; + qp->uobject = NULL; + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + qp->qp_type = qp_init_attr->qp_type; + atomic_add_acq_int(&pd->usecnt, 1); + atomic_add_acq_int(&qp_init_attr->send_cq->usecnt, 1); + atomic_add_acq_int(&qp_init_attr->recv_cq->usecnt, 1); + if (qp_init_attr->srq) + atomic_add_acq_int(&qp_init_attr->srq->usecnt, 1); + } + + return qp; +} + +static const struct { + int valid; + enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETY + 1]; + enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETY + 1]; +} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + }, + [IB_QPS_INIT] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_INIT] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_RC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + }, + [IB_QPS_RTR] = { + .valid = 1, + .req_param = { + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN), + [IB_QPT_RC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_RC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + } + }, + [IB_QPS_RTR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .req_param = { + [IB_QPT_UD] = IB_QP_SQ_PSN, + [IB_QPT_UC] = IB_QP_SQ_PSN, + [IB_QPT_RC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_SMI] = IB_QP_SQ_PSN, + [IB_QPT_GSI] = IB_QP_SQ_PSN, + }, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = 
(IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_RTS] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY + } + }, + }, + [IB_QPS_SQD] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + }, + [IB_QPS_SQD] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_AV | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_PATH_MIG_STATE), + [IB_QPT_RC] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), + [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | + IB_QP_QKEY), + } + } + }, + [IB_QPS_SQE] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, + [IB_QPS_RTS] = { + .valid = 1, + .opt_param = { + [IB_QPT_UD] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_UC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS), + [IB_QPT_SMI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + [IB_QPT_GSI] = (IB_QP_CUR_STATE | + IB_QP_QKEY), + } + } + }, + [IB_QPS_ERR] = { + [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 } + } +}; + +int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, + enum ib_qp_type type, enum ib_qp_attr_mask mask) +{ + enum ib_qp_attr_mask req_param, opt_param; + + if (cur_state < 0 || cur_state > IB_QPS_ERR || + next_state < 0 || next_state > IB_QPS_ERR) + return 0; + + if (mask & IB_QP_CUR_STATE && + cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && + cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) + return 0; + + if (!qp_state_table[cur_state][next_state].valid) + return 0; + + req_param = qp_state_table[cur_state][next_state].req_param[type]; + opt_param = qp_state_table[cur_state][next_state].opt_param[type]; + + if ((mask & req_param) != req_param) + return 0; + + if (mask & ~(req_param | opt_param | IB_QP_STATE)) + return 0; + + return 1; +} + +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL); +} + +int ib_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr 
*qp_init_attr) +{ + return qp->device->query_qp ? + qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) : + ENOSYS; +} + +int ib_destroy_qp(struct ib_qp *qp) +{ + struct ib_pd *pd; + struct ib_cq *scq, *rcq; + struct ib_srq *srq; + int ret; + + pd = qp->pd; + scq = qp->send_cq; + rcq = qp->recv_cq; + srq = qp->srq; + + ret = qp->device->destroy_qp(qp); + if (!ret) { + atomic_subtract_acq_int(&pd->usecnt, 1); + atomic_subtract_acq_int(&scq->usecnt, 1); + atomic_subtract_acq_int(&rcq->usecnt, 1); + if (srq) + atomic_subtract_acq_int(&srq->usecnt, 1); + } + + return ret; +} + +/* Completion queues */ + +struct ib_cq *ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, int cqe, int comp_vector) +{ + struct ib_cq *cq; + + cq = device->create_cq(device, cqe, comp_vector, NULL, NULL); + + if (!IS_ERR(cq)) { + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_store_rel_int(&cq->usecnt, 0); + } + + return cq; +} + +int ib_destroy_cq(struct ib_cq *cq) +{ + if (atomic_load_acq_int(&cq->usecnt)) + return (EBUSY); + + return cq->device->destroy_cq(cq); +} + +int ib_resize_cq(struct ib_cq *cq, int cqe) +{ + return cq->device->resize_cq ? + cq->device->resize_cq(cq, cqe, NULL) : ENOSYS; +} + +/* Memory regions */ + +struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) +{ + struct ib_mr *mr; + + mr = pd->device->get_dma_mr(pd, mr_access_flags); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_add_acq_int(&pd->usecnt, 1); + atomic_store_rel_int(&mr->usecnt, 0); + } + + return mr; +} + +struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + struct ib_mr *mr; + + mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, + mr_access_flags, iova_start); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_add_acq_int(&pd->usecnt, 1); + atomic_store_rel_int(&mr->usecnt, 0); + } + + return mr; +} + +int ib_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + struct ib_pd *old_pd; + int ret; + + if (!mr->device->rereg_phys_mr) + return (ENOSYS); + + if (atomic_load_acq_int(&mr->usecnt)) + return (EBUSY); + + old_pd = mr->pd; + + ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd, + phys_buf_array, num_phys_buf, + mr_access_flags, iova_start); + + if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) { + atomic_subtract_acq_int(&old_pd->usecnt, 1); + atomic_add_acq_int(&pd->usecnt, 1); + } + + return ret; +} + +int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) +{ + return mr->device->query_mr ? 
+ mr->device->query_mr(mr, mr_attr) : ENOSYS;
+}
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_load_acq_int(&mr->usecnt))
+ return (EBUSY);
+
+ pd = mr->pd;
+ ret = mr->device->dereg_mr(mr);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+{
+ struct ib_mw *mw;
+
+ if (!pd->device->alloc_mw)
+ return ERR_PTR(-ENOSYS);
+
+ mw = pd->device->alloc_mw(pd);
+ if (!IS_ERR(mw)) {
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = NULL;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ }
+
+ return mw;
+}
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = mw->pd;
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *fmr;
+
+ if (!pd->device->alloc_fmr)
+ return ERR_PTR(-ENOSYS);
+
+ fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ if (!IS_ERR(fmr)) {
+ fmr->device = pd->device;
+ fmr->pd = pd;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ }
+
+ return fmr;
+}
+
+int ib_unmap_fmr(struct ib_fmr_list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+
+ if (TAILQ_EMPTY(fmr_list))
+ return 0;
+
+ fmr = TAILQ_FIRST(fmr_list);
+ return fmr->device->unmap_fmr(fmr_list);
+}
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = fmr->pd;
+ ret = fmr->device->dealloc_fmr(fmr);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ if (!qp->device->attach_mcast)
+ return (ENOSYS);
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return (EINVAL);
+
+ return qp->device->attach_mcast(qp, gid, lid);
+}
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ if (!qp->device->detach_mcast)
+ return (ENOSYS);
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return (EINVAL);
+
+ return qp->device->detach_mcast(qp, gid, lid);
+}
diff --git a/sys/contrib/rdma/types.h b/sys/contrib/rdma/types.h
new file mode 100644
index 000000000000..33a1a62bd905
--- /dev/null
+++ b/sys/contrib/rdma/types.h
@@ -0,0 +1,144 @@
+/*
+ * $FreeBSD$
+ */
+#ifndef __RDMA_TYPES_H_
+#define __RDMA_TYPES_H_
+#include <sys/types.h>
+#include <sys/malloc.h>
+
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef uint8_t __u8;
+typedef uint16_t __u16;
+typedef uint32_t __u32;
+typedef uint64_t __u64;
+typedef uint8_t __be8;
+typedef uint16_t __be16;
+typedef uint32_t __be32;
+typedef uint64_t __be64;
+
+typedef int32_t __s32;
+
+
+#define LINUX_TYPES_DEFINED
+/*
+ * Linux-style error pointers: ERR_PTR() must be handed a negative
+ * errno, or IS_ERR() will not recognize the result as an error.
+ */
+#define ERR_PTR(err) ((void *)((long)(err)))
+#define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000))
+#define PTR_ERR(ptr) ((long)(ptr))
+
+#define PANIC_IF(exp) do { \
+ if (exp) \
+ panic("BUG func %s line %u: %s", __FUNCTION__, __LINE__, #exp); \
+} while (0)
+
+#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field)))
+
+/*
+ * Return the index of the first clear bit, or a value >= max when every
+ * bit below max is set.
+ */
+static __inline int
+find_first_zero_bit(volatile void *p, int max)
+{
+ int b;
+ volatile int *ptr = (volatile int *)p;
+
+ for (b = 0; b < max; b += 32) {
+ if (ptr[b >> 5] != ~0) {
+ for (;;) {
+ if ((ptr[b >> 5] & (1 << (b & 0x1f))) == 0)
+ return (b);
+ b++;
+ }
+ }
+ }
+
+ return (max);
+}
+
+struct kvl {
+ struct kvl *next;
+ unsigned int key;
+ void *value;
+};
+
+#define DEFINE_KVL(x) struct kvl x;
+
+/* Return the value stored under key, or NULL when the key is absent. */
+static __inline void *
+kvl_lookup(struct kvl *x, uint32_t key)
+{
+ struct kvl *i;
+
+ for (i = x->next; i; i = i->next)
+ if (i->key == key)
+ return (i->value);
+ return (NULL);
+}
+
+/*
+ * Store ptr under the first free key >= starting_id, reporting the
+ * key chosen through *id (in the spirit of Linux idr_get_new_above()).
+ */
+static __inline int
+kvl_alloc_above(struct kvl *idp, void *ptr, int starting_id, int *id)
+{
+ int newid = starting_id;
+ struct kvl *i;
+
+restart:
+ for (i = idp->next; i; i = i->next)
+ if (i->key == newid) {
+ newid++;
+ goto restart;
+ }
+
+ i = malloc(sizeof(struct kvl), M_TEMP, M_NOWAIT);
+ if (i == NULL)
+ return (-ENOMEM);
+ i->key = newid;
+ i->value = ptr;
+ i->next = idp->next;
+ idp->next = i;
+ *id = newid;
+ return (0);
+}
+
+/* Unlink and free the entry with the given key, if present. */
+static __inline void
+kvl_delete(struct kvl *idp, int id)
+{
+ struct kvl *i, *prev = NULL;
+
+ for (i = idp->next; i; prev = i, i = i->next)
+ if (i->key == id) {
+ if (!prev)
+ idp->next = i->next;
+ else
+ prev->next = i->next;
+ free(i, M_TEMP);
+ return;
+ }
+}
+
+/* Release every entry hanging off the list head. */
+static __inline void
+kvl_free(struct kvl *idp)
+{
+ struct kvl *i, *tmp;
+
+ for (i = idp->next; i; i = tmp) {
+ tmp = i->next;
+ free(i, M_TEMP);
+ }
+ idp->next = NULL;
+}
+
+#endif /* __RDMA_TYPES_H_ */
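
For readers coming from Linux, the kvl_* helpers above stand in for the idr interface that the ported core code relied on. A minimal sketch of how a consumer might drive them; the example_* names and the use case are illustrative assumptions, not code from this commit:

/*
 * Hypothetical consumer of the kvl_* helpers from types.h, used the
 * way the ported core code uses a Linux idr: map small integer ids
 * to objects and hand out the ids on demand.
 */
#include <contrib/rdma/types.h>

static DEFINE_KVL(example_ids)	/* zero-initialized head; entries hang off ->next */

static int
example_insert(void *obj, int *id)
{
	/* Claim the first free key at or above 1; *id reports the key chosen. */
	return (kvl_alloc_above(&example_ids, obj, 1, id));
}

static void *
example_find(int id)
{
	/* NULL when no entry holds this key. */
	return (kvl_lookup(&example_ids, id));
}

static void
example_remove(int id)
{
	kvl_delete(&example_ids, id);	/* unlinks and frees the node */
}

The helpers do no locking of their own, so a real consumer would have to serialize all calls with its own mutex; lookups and allocations are linear scans of the singly linked list, which is adequate for the small id spaces used here.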