author     Kip Macy <kmacy@FreeBSD.org>    2008-05-05 18:35:55 +0000
committer  Kip Macy <kmacy@FreeBSD.org>    2008-05-05 18:35:55 +0000
commit     e68ff398875b17e19f8a55cd0f2d3d502c45d821 (patch)
tree       5b86b050e1ca019fffc7e10f98517cb5f92d4686
parent     143b946188bf3f298f12bee29e7adcbca3215f33 (diff)
-rw-r--r--  sys/contrib/rdma/core_priv.h          |   55
-rw-r--r--  sys/contrib/rdma/ib_addr.h            |  164
-rw-r--r--  sys/contrib/rdma/ib_cache.h           |  120
-rw-r--r--  sys/contrib/rdma/ib_cm.h              |  593
-rw-r--r--  sys/contrib/rdma/ib_fmr_pool.h        |   97
-rw-r--r--  sys/contrib/rdma/ib_mad.h             |  656
-rw-r--r--  sys/contrib/rdma/ib_marshall.h        |   55
-rw-r--r--  sys/contrib/rdma/ib_pack.h            |  247
-rw-r--r--  sys/contrib/rdma/ib_sa.h              |  386
-rw-r--r--  sys/contrib/rdma/ib_smi.h             |  132
-rw-r--r--  sys/contrib/rdma/ib_umem.h            |   81
-rw-r--r--  sys/contrib/rdma/ib_user_cm.h         |  328
-rw-r--r--  sys/contrib/rdma/ib_user_mad.h        |  136
-rw-r--r--  sys/contrib/rdma/ib_user_sa.h         |   60
-rw-r--r--  sys/contrib/rdma/ib_user_verbs.h      |  688
-rw-r--r--  sys/contrib/rdma/ib_verbs.h           | 1854
-rw-r--r--  sys/contrib/rdma/iw_cm.h              |  266
-rw-r--r--  sys/contrib/rdma/krping/getopt.c      |   77
-rw-r--r--  sys/contrib/rdma/krping/getopt.h      |   21
-rw-r--r--  sys/contrib/rdma/krping/krping.c      | 1865
-rw-r--r--  sys/contrib/rdma/krping/krping.h      |  128
-rw-r--r--  sys/contrib/rdma/krping/krping_dev.c  |  180
-rw-r--r--  sys/contrib/rdma/rdma_addr.c          |  408
-rw-r--r--  sys/contrib/rdma/rdma_cache.c         |  411
-rw-r--r--  sys/contrib/rdma/rdma_cm.h            |  318
-rw-r--r--  sys/contrib/rdma/rdma_cm_ib.h         |   51
-rw-r--r--  sys/contrib/rdma/rdma_cma.c           | 2998
-rw-r--r--  sys/contrib/rdma/rdma_device.c        |  776
-rw-r--r--  sys/contrib/rdma/rdma_iwcm.c          | 1086
-rw-r--r--  sys/contrib/rdma/rdma_user_cm.h       |  215
-rw-r--r--  sys/contrib/rdma/rdma_verbs.c         |  822
-rw-r--r--  sys/contrib/rdma/types.h              |  121
32 files changed, 15395 insertions, 0 deletions
diff --git a/sys/contrib/rdma/core_priv.h b/sys/contrib/rdma/core_priv.h
new file mode 100644
index 000000000000..5d6c9d805e34
--- /dev/null
+++ b/sys/contrib/rdma/core_priv.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: core_priv.h 1349 2004-12-16 21:09:43Z roland $
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+#include <sys/mutex.h>
+#include <sys/lock.h>
+
+#include <contrib/rdma/ib_verbs.h>
+
+#ifdef notyet
+int ib_device_register_sysfs(struct ib_device *device);
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+int ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+#endif
+
+int ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+#endif /* _CORE_PRIV_H */
diff --git a/sys/contrib/rdma/ib_addr.h b/sys/contrib/rdma/ib_addr.h
new file mode 100644
index 000000000000..3df9949ad28d
--- /dev/null
+++ b/sys/contrib/rdma/ib_addr.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ * copy of which is available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#if !defined(IB_ADDR_H)
+#define IB_ADDR_H
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+
+#include <net/if.h>
+#include <net/ethernet.h>
+
+#include <contrib/rdma/ib_verbs.h>
+
+
+#define MAX_ADDR_LEN ETHER_ADDR_LEN /* XXX doesn't support IB! */
+
+struct rdma_addr_client {
+ int refcount;
+ struct cv comp;
+ struct mtx lock;
+};
+
+/**
+ * rdma_addr_register_client - Register an address client.
+ */
+void rdma_addr_register_client(struct rdma_addr_client *client);
+
+/**
+ * rdma_addr_unregister_client - Deregister an address client.
+ * @client: Client object to deregister.
+ */
+void rdma_addr_unregister_client(struct rdma_addr_client *client);
+
+struct rdma_dev_addr {
+ unsigned char src_dev_addr[MAX_ADDR_LEN];
+ unsigned char dst_dev_addr[MAX_ADDR_LEN];
+ unsigned char broadcast[MAX_ADDR_LEN];
+ enum rdma_node_type dev_type;
+};
+
+/**
+ * rdma_translate_ip - Translate a local IP address to an RDMA hardware
+ * address.
+ */
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr);
+
+/**
+ * rdma_resolve_ip - Resolve source and destination IP addresses to
+ * RDMA hardware addresses.
+ * @client: Address client associated with request.
+ * @src_addr: An optional source address to use in the resolution. If a
+ * source address is not provided, a usable address will be returned via
+ * the callback.
+ * @dst_addr: The destination address to resolve.
+ * @addr: A reference to a data location that will receive the resolved
+ * addresses. The data location must remain valid until the callback has
+ * been invoked.
+ * @timeout_ms: Amount of time to wait for the address resolution to complete.
+ * @callback: Call invoked once address resolution has completed, timed out,
+ * or been canceled. A status of 0 indicates success.
+ * @context: User-specified context associated with the call.
+ */
+int rdma_resolve_ip(struct rdma_addr_client *client,
+ struct sockaddr *src_addr, struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr);
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
+ const unsigned char *dst_dev_addr);
+
+static inline int ip_addr_size(struct sockaddr *addr)
+{
+ return addr->sa_family == AF_INET6 ?
+ sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in);
+}
+
+static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
+{
+ return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9];
+}
+
+static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey)
+{
+ dev_addr->broadcast[8] = pkey >> 8;
+ dev_addr->broadcast[9] = (unsigned char) pkey;
+}
+
+static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->broadcast + 4, sizeof *gid);
+}
+
+static inline void ib_addr_get_sgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->src_dev_addr + 4, sizeof *gid);
+}
+
+static inline void ib_addr_set_sgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(dev_addr->src_dev_addr + 4, gid, sizeof *gid);
+}
+
+static inline void ib_addr_get_dgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->dst_dev_addr + 4, sizeof *gid);
+}
+
+static inline void ib_addr_set_dgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(dev_addr->dst_dev_addr + 4, gid, sizeof *gid);
+}
+
+static inline void iw_addr_get_sgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->src_dev_addr, sizeof *gid);
+}
+
+static inline void iw_addr_get_dgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->dst_dev_addr, sizeof *gid);
+}
+
+#endif /* IB_ADDR_H */
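
Usage sketch (illustrative, not part of this commit) of the asynchronous
resolver declared above; the destination address, timeout, and callback body
are assumptions:

    static void
    resolve_cb(int status, struct sockaddr *src_addr,
        struct rdma_dev_addr *addr, void *context)
    {
            /* Runs once resolution completes, times out, or is canceled;
             * status == 0 means *addr now holds usable hardware addresses. */
    }

    struct rdma_addr_client client;
    struct rdma_dev_addr dev_addr;
    struct sockaddr_in dst;         /* filled in by the caller */
    int error;

    rdma_addr_register_client(&client);
    /* A NULL src_addr lets the resolver pick a usable source address,
     * which is then reported through the callback. */
    error = rdma_resolve_ip(&client, NULL, (struct sockaddr *)&dst,
        &dev_addr, 2000, resolve_cb, NULL);
    /* ... once all requests have completed or been canceled: */
    rdma_addr_unregister_client(&client);
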
diff --git a/sys/contrib/rdma/ib_cache.h b/sys/contrib/rdma/ib_cache.h
new file mode 100644
index 000000000000..419bea2f252b
--- /dev/null
+++ b/sys/contrib/rdma/ib_cache.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_cache.h 1349 2004-12-16 21:09:43Z roland $
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IB_CACHE_H
+#define _IB_CACHE_H
+
+#include <contrib/rdma/ib_verbs.h>
+
+/**
+ * ib_get_cached_gid - Returns a cached GID table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached GID table to query.
+ * @gid: The GID value found at the specified index.
+ *
+ * ib_get_cached_gid() fetches the specified GID table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid);
+
+/**
+ * ib_find_cached_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the cached GID table where the GID was found. This
+ * parameter may be NULL.
+ *
+ * ib_find_cached_gid() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+ union ib_gid *gid,
+ u8 *port_num,
+ u16 *index);
+
+/**
+ * ib_get_cached_pkey - Returns a cached PKey table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached PKey table to query.
+ * @pkey: The PKey value found at the specified index.
+ *
+ * ib_get_cached_pkey() fetches the specified PKey table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_pkey(struct ib_device *device_handle,
+ u8 port_num,
+ int index,
+ u16 *pkey);
+
+/**
+ * ib_find_cached_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index);
+
+/**
+ * ib_get_cached_lmc - Returns a cached lmc table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @lmc: The lmc value for the specified port for that device.
+ *
+ * ib_get_cached_lmc() fetches the specified lmc table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_lmc(struct ib_device *device,
+ u8 port_num,
+ u8 *lmc);
+
+#endif /* _IB_CACHE_H */
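
Usage sketch (illustrative, not part of this commit): reading port 1's cached
GID and PKey tables and mapping a PKey value back to its table index. The
port number and error handling are assumptions:

    union ib_gid gid;
    u16 pkey, index;

    /* Fetch GID 0 and PKey 0 of port 1 from the local software cache. */
    if (ib_get_cached_gid(device, 1, 0, &gid) ||
        ib_get_cached_pkey(device, 1, 0, &pkey))
            return (EINVAL);

    /* Look up the cached table index for a known PKey value. */
    if (ib_find_cached_pkey(device, 1, pkey, &index))
            return (ENOENT);
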
diff --git a/sys/contrib/rdma/ib_cm.h b/sys/contrib/rdma/ib_cm.h
new file mode 100644
index 000000000000..5fa918a949ec
--- /dev/null
+++ b/sys/contrib/rdma/ib_cm.h
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_cm.h 4311 2005-12-05 18:42:01Z sean.hefty $
+ *
+ * $FreeBSD$
+ */
+
+
+#if !defined(IB_CM_H)
+#define IB_CM_H
+
+#include <contrib/rdma/ib_mad.h>
+#include <contrib/rdma/ib_sa.h>
+
+enum ib_cm_state {
+ IB_CM_IDLE,
+ IB_CM_LISTEN,
+ IB_CM_REQ_SENT,
+ IB_CM_REQ_RCVD,
+ IB_CM_MRA_REQ_SENT,
+ IB_CM_MRA_REQ_RCVD,
+ IB_CM_REP_SENT,
+ IB_CM_REP_RCVD,
+ IB_CM_MRA_REP_SENT,
+ IB_CM_MRA_REP_RCVD,
+ IB_CM_ESTABLISHED,
+ IB_CM_DREQ_SENT,
+ IB_CM_DREQ_RCVD,
+ IB_CM_TIMEWAIT,
+ IB_CM_SIDR_REQ_SENT,
+ IB_CM_SIDR_REQ_RCVD
+};
+
+enum ib_cm_lap_state {
+ IB_CM_LAP_UNINIT,
+ IB_CM_LAP_IDLE,
+ IB_CM_LAP_SENT,
+ IB_CM_LAP_RCVD,
+ IB_CM_MRA_LAP_SENT,
+ IB_CM_MRA_LAP_RCVD,
+};
+
+enum ib_cm_event_type {
+ IB_CM_REQ_ERROR,
+ IB_CM_REQ_RECEIVED,
+ IB_CM_REP_ERROR,
+ IB_CM_REP_RECEIVED,
+ IB_CM_RTU_RECEIVED,
+ IB_CM_USER_ESTABLISHED,
+ IB_CM_DREQ_ERROR,
+ IB_CM_DREQ_RECEIVED,
+ IB_CM_DREP_RECEIVED,
+ IB_CM_TIMEWAIT_EXIT,
+ IB_CM_MRA_RECEIVED,
+ IB_CM_REJ_RECEIVED,
+ IB_CM_LAP_ERROR,
+ IB_CM_LAP_RECEIVED,
+ IB_CM_APR_RECEIVED,
+ IB_CM_SIDR_REQ_ERROR,
+ IB_CM_SIDR_REQ_RECEIVED,
+ IB_CM_SIDR_REP_RECEIVED
+};
+
+enum ib_cm_data_size {
+ IB_CM_REQ_PRIVATE_DATA_SIZE = 92,
+ IB_CM_MRA_PRIVATE_DATA_SIZE = 222,
+ IB_CM_REJ_PRIVATE_DATA_SIZE = 148,
+ IB_CM_REP_PRIVATE_DATA_SIZE = 196,
+ IB_CM_RTU_PRIVATE_DATA_SIZE = 224,
+ IB_CM_DREQ_PRIVATE_DATA_SIZE = 220,
+ IB_CM_DREP_PRIVATE_DATA_SIZE = 224,
+ IB_CM_REJ_ARI_LENGTH = 72,
+ IB_CM_LAP_PRIVATE_DATA_SIZE = 168,
+ IB_CM_APR_PRIVATE_DATA_SIZE = 148,
+ IB_CM_APR_INFO_LENGTH = 72,
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
+ IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
+ IB_CM_SIDR_REP_INFO_LENGTH = 72,
+ IB_CM_COMPARE_SIZE = 64
+};
+
+struct ib_cm_id;
+
+struct ib_cm_req_event_param {
+ struct ib_cm_id *listen_id;
+ u8 port;
+
+ struct ib_sa_path_rec *primary_path;
+ struct ib_sa_path_rec *alternate_path;
+
+ __be64 remote_ca_guid;
+ u32 remote_qkey;
+ u32 remote_qpn;
+ enum ib_qp_type qp_type;
+
+ u32 starting_psn;
+ u8 responder_resources;
+ u8 initiator_depth;
+ unsigned int local_cm_response_timeout:5;
+ unsigned int flow_control:1;
+ unsigned int remote_cm_response_timeout:5;
+ unsigned int retry_count:3;
+ unsigned int rnr_retry_count:3;
+ unsigned int srq:1;
+};
+
+struct ib_cm_rep_event_param {
+ __be64 remote_ca_guid;
+ u32 remote_qkey;
+ u32 remote_qpn;
+ u32 starting_psn;
+ u8 responder_resources;
+ u8 initiator_depth;
+ unsigned int target_ack_delay:5;
+ unsigned int failover_accepted:2;
+ unsigned int flow_control:1;
+ unsigned int rnr_retry_count:3;
+ unsigned int srq:1;
+};
+
+enum ib_cm_rej_reason {
+ IB_CM_REJ_NO_QP = 1,
+ IB_CM_REJ_NO_EEC = 2,
+ IB_CM_REJ_NO_RESOURCES = 3,
+ IB_CM_REJ_TIMEOUT = 4,
+ IB_CM_REJ_UNSUPPORTED = 5,
+ IB_CM_REJ_INVALID_COMM_ID = 6,
+ IB_CM_REJ_INVALID_COMM_INSTANCE = 7,
+ IB_CM_REJ_INVALID_SERVICE_ID = 8,
+ IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9,
+ IB_CM_REJ_STALE_CONN = 10,
+ IB_CM_REJ_RDC_NOT_EXIST = 11,
+ IB_CM_REJ_INVALID_GID = 12,
+ IB_CM_REJ_INVALID_LID = 13,
+ IB_CM_REJ_INVALID_SL = 14,
+ IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15,
+ IB_CM_REJ_INVALID_HOP_LIMIT = 16,
+ IB_CM_REJ_INVALID_PACKET_RATE = 17,
+ IB_CM_REJ_INVALID_ALT_GID = 18,
+ IB_CM_REJ_INVALID_ALT_LID = 19,
+ IB_CM_REJ_INVALID_ALT_SL = 20,
+ IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21,
+ IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22,
+ IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23,
+ IB_CM_REJ_PORT_CM_REDIRECT = 24,
+ IB_CM_REJ_PORT_REDIRECT = 25,
+ IB_CM_REJ_INVALID_MTU = 26,
+ IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27,
+ IB_CM_REJ_CONSUMER_DEFINED = 28,
+ IB_CM_REJ_INVALID_RNR_RETRY = 29,
+ IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30,
+ IB_CM_REJ_INVALID_CLASS_VERSION = 31,
+ IB_CM_REJ_INVALID_FLOW_LABEL = 32,
+ IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33
+};
+
+struct ib_cm_rej_event_param {
+ enum ib_cm_rej_reason reason;
+ void *ari;
+ u8 ari_length;
+};
+
+struct ib_cm_mra_event_param {
+ u8 service_timeout;
+};
+
+struct ib_cm_lap_event_param {
+ struct ib_sa_path_rec *alternate_path;
+};
+
+enum ib_cm_apr_status {
+ IB_CM_APR_SUCCESS,
+ IB_CM_APR_INVALID_COMM_ID,
+ IB_CM_APR_UNSUPPORTED,
+ IB_CM_APR_REJECT,
+ IB_CM_APR_REDIRECT,
+ IB_CM_APR_IS_CURRENT,
+ IB_CM_APR_INVALID_QPN_EECN,
+ IB_CM_APR_INVALID_LID,
+ IB_CM_APR_INVALID_GID,
+ IB_CM_APR_INVALID_FLOW_LABEL,
+ IB_CM_APR_INVALID_TCLASS,
+ IB_CM_APR_INVALID_HOP_LIMIT,
+ IB_CM_APR_INVALID_PACKET_RATE,
+ IB_CM_APR_INVALID_SL
+};
+
+struct ib_cm_apr_event_param {
+ enum ib_cm_apr_status ap_status;
+ void *apr_info;
+ u8 info_len;
+};
+
+struct ib_cm_sidr_req_event_param {
+ struct ib_cm_id *listen_id;
+ u8 port;
+ u16 pkey;
+};
+
+enum ib_cm_sidr_status {
+ IB_SIDR_SUCCESS,
+ IB_SIDR_UNSUPPORTED,
+ IB_SIDR_REJECT,
+ IB_SIDR_NO_QP,
+ IB_SIDR_REDIRECT,
+ IB_SIDR_UNSUPPORTED_VERSION
+};
+
+struct ib_cm_sidr_rep_event_param {
+ enum ib_cm_sidr_status status;
+ u32 qkey;
+ u32 qpn;
+ void *info;
+ u8 info_len;
+};
+
+struct ib_cm_event {
+ enum ib_cm_event_type event;
+ union {
+ struct ib_cm_req_event_param req_rcvd;
+ struct ib_cm_rep_event_param rep_rcvd;
+ /* No data for RTU received events. */
+ struct ib_cm_rej_event_param rej_rcvd;
+ struct ib_cm_mra_event_param mra_rcvd;
+ struct ib_cm_lap_event_param lap_rcvd;
+ struct ib_cm_apr_event_param apr_rcvd;
+ /* No data for DREQ/DREP received events. */
+ struct ib_cm_sidr_req_event_param sidr_req_rcvd;
+ struct ib_cm_sidr_rep_event_param sidr_rep_rcvd;
+ enum ib_wc_status send_status;
+ } param;
+
+ void *private_data;
+};
+
+/**
+ * ib_cm_handler - User-defined callback to process communication events.
+ * @cm_id: Communication identifier associated with the reported event.
+ * @event: Information about the communication event.
+ *
+ * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events
+ * generated as a result of listen requests result in the allocation of a
+ * new @cm_id. The new @cm_id is returned to the user through this callback.
+ * Clients are responsible for destroying the new @cm_id. For peer-to-peer
+ * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds
+ * to a user's existing communication identifier.
+ *
+ * Users may not call ib_destroy_cm_id while in the context of this callback;
+ * however, returning a non-zero value instructs the communication manager to
+ * destroy the @cm_id after the callback completes.
+ */
+typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event);
+
+struct ib_cm_id {
+ ib_cm_handler cm_handler;
+ void *context;
+ struct ib_device *device;
+ __be64 service_id;
+ __be64 service_mask;
+ enum ib_cm_state state; /* internal CM/debug use */
+ enum ib_cm_lap_state lap_state; /* internal CM/debug use */
+ __be32 local_id;
+ __be32 remote_id;
+ u32 remote_cm_qpn; /* 1 unless redirected */
+};
+
+/**
+ * ib_create_cm_id - Allocate a communication identifier.
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @context: User specified context associated with the communication
+ * identifier.
+ *
+ * Communication identifiers are used to track connection states, service
+ * ID resolution requests, and listen requests.
+ */
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context);
+
+/**
+ * ib_destroy_cm_id - Destroy a connection identifier.
+ * @cm_id: Connection identifier to destroy.
+ *
+ * This call blocks until the connection identifier is destroyed.
+ */
+void ib_destroy_cm_id(struct ib_cm_id *cm_id);
+
+#define IB_SERVICE_ID_AGN_MASK __constant_cpu_to_be64(0xFF00000000000000ULL)
+#define IB_CM_ASSIGN_SERVICE_ID __constant_cpu_to_be64(0x0200000000000000ULL)
+#define IB_CMA_SERVICE_ID __constant_cpu_to_be64(0x0000000001000000ULL)
+#define IB_CMA_SERVICE_ID_MASK __constant_cpu_to_be64(0xFFFFFFFFFF000000ULL)
+#define IB_SDP_SERVICE_ID __constant_cpu_to_be64(0x0000000000010000ULL)
+#define IB_SDP_SERVICE_ID_MASK __constant_cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
+
+struct ib_cm_compare_data {
+ u8 data[IB_CM_COMPARE_SIZE];
+ u8 mask[IB_CM_COMPARE_SIZE];
+};
+
+/**
+ * ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified in
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ * @compare_data: This parameter is optional. It specifies data that must
+ * appear in the private data of a connection request for the specified
+ * listen request.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+ struct ib_cm_compare_data *compare_data);
+
+struct ib_cm_req_param {
+ struct ib_sa_path_rec *primary_path;
+ struct ib_sa_path_rec *alternate_path;
+ __be64 service_id;
+ u32 qp_num;
+ enum ib_qp_type qp_type;
+ u32 starting_psn;
+ const void *private_data;
+ u8 private_data_len;
+ u8 peer_to_peer;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 remote_cm_response_timeout;
+ u8 flow_control;
+ u8 local_cm_response_timeout;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 max_cm_retries;
+ u8 srq;
+};
+
+/**
+ * ib_send_cm_req - Sends a connection request to the remote node.
+ * @cm_id: Connection identifier that will be associated with the
+ * connection request.
+ * @param: Connection request information needed to establish the
+ * connection.
+ */
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+ struct ib_cm_req_param *param);
+
+struct ib_cm_rep_param {
+ u32 qp_num;
+ u32 starting_psn;
+ const void *private_data;
+ u8 private_data_len;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 target_ack_delay;
+ u8 failover_accepted;
+ u8 flow_control;
+ u8 rnr_retry_count;
+ u8 srq;
+};
+
+/**
+ * ib_send_cm_rep - Sends a connection reply in response to a connection
+ * request.
+ * @cm_id: Connection identifier that will be associated with the
+ * connection request.
+ * @param: Connection reply information needed to establish the
+ * connection.
+ */
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_rep_param *param);
+
+/**
+ * ib_send_cm_rtu - Sends a connection ready to use message in response
+ * to a connection reply message.
+ * @cm_id: Connection identifier associated with the connection request.
+ * @private_data: Optional user-defined private data sent with the
+ * ready to use message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_dreq - Sends a disconnection request for an existing
+ * connection.
+ * @cm_id: Connection identifier associated with the connection being
+ * released.
+ * @private_data: Optional user-defined private data sent with the
+ * disconnection request message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_drep - Sends a disconnection reply to a disconnection request.
+ * @cm_id: Connection identifier associated with the connection being
+ * released.
+ * @private_data: Optional user-defined private data sent with the
+ * disconnection reply message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ *
+ * If the cm_id is in the correct state, the CM will transition the connection
+ * to the timewait state, even if an error occurs sending the DREP message.
+ */
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_cm_notify - Notifies the CM of an event reported to the consumer.
+ * @cm_id: Connection identifier to transition to established.
+ * @event: Type of event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events. Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ * QP before an RTU has been received.
+ * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over
+ * to the alternate path.
+ */
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event);
+
+/**
+ * ib_send_cm_rej - Sends a connection rejection message to the
+ * remote node.
+ * @cm_id: Connection identifier associated with the connection being
+ * rejected.
+ * @reason: Reason for the connection request rejection.
+ * @ari: Optional additional rejection information.
+ * @ari_length: Size of the additional rejection information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ * rejection message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
+ * message.
+ * @cm_id: Connection identifier associated with the connection message.
+ * @service_timeout: The maximum time required for the sender to reply to
+ * the connection message.
+ * @private_data: Optional user-defined private data sent with the
+ * message receipt acknowledgement.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+ u8 service_timeout,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_lap - Sends a load alternate path request.
+ * @cm_id: Connection identifier associated with the load alternate path
+ * message.
+ * @alternate_path: A path record that identifies the alternate path to
+ * load.
+ * @private_data: Optional user-defined private data sent with the
+ * load alternate path message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+ struct ib_sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning
+ * to a specified QP state.
+ * @cm_id: Communication identifier associated with the QP attributes to
+ * initialize.
+ * @qp_attr: On input, specifies the desired QP state. On output, the
+ * mandatory and desired optional attributes will be set in order to
+ * modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ * QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state. This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes. Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ */
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
+
+/**
+ * ib_send_cm_apr - Sends an alternate path response message in response to
+ * a load alternate path request.
+ * @cm_id: Connection identifier associated with the alternate path response.
+ * @status: Reply status sent with the alternate path response.
+ * @info: Optional additional information sent with the alternate path
+ * response.
+ * @info_length: Size of the additional information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ * alternate path response message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len);
+
+struct ib_cm_sidr_req_param {
+ struct ib_sa_path_rec *path;
+ __be64 service_id;
+ int timeout_ms;
+ const void *private_data;
+ u8 private_data_len;
+ u8 max_cm_retries;
+};
+
+/**
+ * ib_send_cm_sidr_req - Sends a service ID resolution request to the
+ * remote node.
+ * @cm_id: Communication identifier that will be associated with the
+ * service ID resolution request.
+ * @param: Service ID resolution request information.
+ */
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_req_param *param);
+
+struct ib_cm_sidr_rep_param {
+ u32 qp_num;
+ u32 qkey;
+ enum ib_cm_sidr_status status;
+ const void *info;
+ u8 info_length;
+ const void *private_data;
+ u8 private_data_len;
+};
+
+/**
+ * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the
+ * remote node.
+ * @cm_id: Communication identifier associated with the received service ID
+ * resolution request.
+ * @param: Service ID resolution reply information.
+ */
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_rep_param *param);
+
+#endif /* IB_CM_H */
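
Usage sketch (illustrative, not part of this commit) of the passive side:
create an identifier, listen, and handle the new cm_id the CM allocates for
each incoming REQ. The handler body is an assumption:

    static int
    cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
    {
            if (event->event == IB_CM_REQ_RECEIVED) {
                    /* cm_id is a new identifier allocated by the CM for
                     * this request; the consumer owns it and must
                     * eventually destroy it. */
            }
            /* Returning non-zero would make the CM destroy cm_id after
             * this callback completes. */
            return (0);
    }

    struct ib_cm_id *cm_id;
    int error;

    cm_id = ib_create_cm_id(device, cm_handler, NULL);
    error = ib_cm_listen(cm_id, IB_CM_ASSIGN_SERVICE_ID, 0, NULL);
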
diff --git a/sys/contrib/rdma/ib_fmr_pool.h b/sys/contrib/rdma/ib_fmr_pool.h
new file mode 100644
index 000000000000..55996b8228f5
--- /dev/null
+++ b/sys/contrib/rdma/ib_fmr_pool.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_fmr_pool.h 2730 2005-06-28 16:43:03Z sean.hefty $
+ *
+ * $FreeBSD$
+ */
+
+#if !defined(IB_FMR_POOL_H)
+#define IB_FMR_POOL_H
+
+#include <contrib/rdma/ib_verbs.h>
+
+struct ib_fmr_pool;
+
+/**
+ * struct ib_fmr_pool_param - Parameters for creating FMR pool
+ * @max_pages_per_fmr: Maximum number of pages per map request.
+ * @page_shift: Log2 of the size of the pages mapped by this FMR.
+ * @access: Access flags for FMRs in the pool.
+ * @pool_size: Number of FMRs to allocate for the pool.
+ * @dirty_watermark: Flush is triggered when @dirty_watermark dirty
+ *     FMRs are present.
+ * @flush_function: Callback called when unmapped FMRs are flushed and
+ *     more FMRs are possibly available for mapping.
+ * @flush_arg: Context passed to user's flush function.
+ * @cache: If set, FMRs may be reused after unmapping for identical map
+ *     requests.
+ */
+struct ib_fmr_pool_param {
+ int max_pages_per_fmr;
+ int page_shift;
+ enum ib_access_flags access;
+ int pool_size;
+ int dirty_watermark;
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void * arg);
+ void *flush_arg;
+ unsigned cache:1;
+};
+
+struct ib_pool_fmr {
+ struct ib_fmr *fmr;
+ struct ib_fmr_pool *pool;
+ struct list_head list;
+ struct hlist_node cache_node;
+ int ref_count;
+ int remap_count;
+ u64 io_virtual_address;
+ int page_list_len;
+ u64 page_list[0];
+};
+
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params);
+
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool);
+
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool);
+
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 io_virtual_address);
+
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr);
+
+#endif /* IB_FMR_POOL_H */
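
Usage sketch (illustrative, not part of this commit): creating a pool,
mapping a caller-supplied physical page list, and tearing down. The sizing
values and access flag are assumptions:

    struct ib_fmr_pool_param params = {
            .max_pages_per_fmr = 64,
            .page_shift        = 12,   /* 4KB pages */
            .access            = IB_ACCESS_LOCAL_WRITE,
            .pool_size         = 32,
            .dirty_watermark   = 8,    /* flush once 8 FMRs are dirty */
            .cache             = 1,    /* reuse FMRs for identical maps */
    };
    struct ib_fmr_pool *pool;
    struct ib_pool_fmr *fmr;

    pool = ib_create_fmr_pool(pd, &params);
    fmr = ib_fmr_pool_map_phys(pool, page_list, npages, io_addr);
    /* ... post work requests that reference fmr->fmr ... */
    ib_fmr_pool_unmap(fmr);
    ib_destroy_fmr_pool(pool);
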
diff --git a/sys/contrib/rdma/ib_mad.h b/sys/contrib/rdma/ib_mad.h
new file mode 100644
index 000000000000..7fabc97bc083
--- /dev/null
+++ b/sys/contrib/rdma/ib_mad.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_mad.h 5596 2006-03-03 01:00:07Z sean.hefty $
+ *
+ * $FreeBSD$
+ */
+
+#if !defined(IB_MAD_H)
+#define IB_MAD_H
+
+
+#include <contrib/rdma/ib_verbs.h>
+
+/* Management base version */
+#define IB_MGMT_BASE_VERSION 1
+
+/* Management classes */
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81
+#define IB_MGMT_CLASS_SUBN_ADM 0x03
+#define IB_MGMT_CLASS_PERF_MGMT 0x04
+#define IB_MGMT_CLASS_BM 0x05
+#define IB_MGMT_CLASS_DEVICE_MGMT 0x06
+#define IB_MGMT_CLASS_CM 0x07
+#define IB_MGMT_CLASS_SNMP 0x08
+#define IB_MGMT_CLASS_DEVICE_ADM 0x10
+#define IB_MGMT_CLASS_BOOT_MGMT 0x11
+#define IB_MGMT_CLASS_BIS 0x12
+#define IB_MGMT_CLASS_CONG_MGMT 0x21
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F
+
+#define IB_OPENIB_OUI (0x001405)
+
+/* Management methods */
+#define IB_MGMT_METHOD_GET 0x01
+#define IB_MGMT_METHOD_SET 0x02
+#define IB_MGMT_METHOD_GET_RESP 0x81
+#define IB_MGMT_METHOD_SEND 0x03
+#define IB_MGMT_METHOD_TRAP 0x05
+#define IB_MGMT_METHOD_REPORT 0x06
+#define IB_MGMT_METHOD_REPORT_RESP 0x86
+#define IB_MGMT_METHOD_TRAP_REPRESS 0x07
+
+#define IB_MGMT_METHOD_RESP 0x80
+#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1)
+
+#define IB_MGMT_MAX_METHODS 128
+
+/* RMPP information */
+#define IB_MGMT_RMPP_VERSION 1
+
+#define IB_MGMT_RMPP_TYPE_DATA 1
+#define IB_MGMT_RMPP_TYPE_ACK 2
+#define IB_MGMT_RMPP_TYPE_STOP 3
+#define IB_MGMT_RMPP_TYPE_ABORT 4
+
+#define IB_MGMT_RMPP_FLAG_ACTIVE 1
+#define IB_MGMT_RMPP_FLAG_FIRST (1<<1)
+#define IB_MGMT_RMPP_FLAG_LAST (1<<2)
+
+#define IB_MGMT_RMPP_NO_RESPTIME 0x1F
+
+#define IB_MGMT_RMPP_STATUS_SUCCESS 0
+#define IB_MGMT_RMPP_STATUS_RESX 1
+#define IB_MGMT_RMPP_STATUS_ABORT_MIN 118
+#define IB_MGMT_RMPP_STATUS_T2L 118
+#define IB_MGMT_RMPP_STATUS_BAD_LEN 119
+#define IB_MGMT_RMPP_STATUS_BAD_SEG 120
+#define IB_MGMT_RMPP_STATUS_BADT 121
+#define IB_MGMT_RMPP_STATUS_W2S 122
+#define IB_MGMT_RMPP_STATUS_S2B 123
+#define IB_MGMT_RMPP_STATUS_BAD_STATUS 124
+#define IB_MGMT_RMPP_STATUS_UNV 125
+#define IB_MGMT_RMPP_STATUS_TMR 126
+#define IB_MGMT_RMPP_STATUS_UNSPEC 127
+#define IB_MGMT_RMPP_STATUS_ABORT_MAX 127
+
+#define IB_QP0 0
+#define IB_QP1 __constant_htonl(1)
+#define IB_QP1_QKEY 0x80010000
+#define IB_QP_SET_QKEY 0x80000000
+
+enum {
+ IB_MGMT_MAD_HDR = 24,
+ IB_MGMT_MAD_DATA = 232,
+ IB_MGMT_RMPP_HDR = 36,
+ IB_MGMT_RMPP_DATA = 220,
+ IB_MGMT_VENDOR_HDR = 40,
+ IB_MGMT_VENDOR_DATA = 216,
+ IB_MGMT_SA_HDR = 56,
+ IB_MGMT_SA_DATA = 200,
+ IB_MGMT_DEVICE_HDR = 64,
+ IB_MGMT_DEVICE_DATA = 192,
+};
+
+struct ib_mad_hdr {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ __be16 class_specific;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+};
+
+struct ib_rmpp_hdr {
+ u8 rmpp_version;
+ u8 rmpp_type;
+ u8 rmpp_rtime_flags;
+ u8 rmpp_status;
+ __be32 seg_num;
+ __be32 paylen_newwin;
+};
+
+typedef u64 ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
+
+/*
+ * ib_sa_hdr and ib_sa_mad structures must be packed because they have
+ * 64-bit fields that are only 32-bit aligned. 64-bit architectures will
+ * lay them out wrong otherwise. (And unfortunately they are sent on
+ * the wire so we can't change the layout)
+ */
+struct ib_sa_hdr {
+ __be64 sm_key;
+ __be16 attr_offset;
+ __be16 reserved;
+ ib_sa_comp_mask comp_mask;
+} __attribute__ ((packed));
+
+struct ib_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[IB_MGMT_MAD_DATA];
+};
+
+struct ib_rmpp_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 data[IB_MGMT_RMPP_DATA];
+};
+
+struct ib_sa_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ struct ib_sa_hdr sa_hdr;
+ u8 data[IB_MGMT_SA_DATA];
+} __attribute__ ((packed));
+
+struct ib_vendor_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 reserved;
+ u8 oui[3];
+ u8 data[IB_MGMT_VENDOR_DATA];
+};
+
+struct ib_class_port_info
+{
+ u8 base_version;
+ u8 class_version;
+ __be16 capability_mask;
+ u8 reserved[3];
+ u8 resp_time_value;
+ u8 redirect_gid[16];
+ __be32 redirect_tcslfl;
+ __be16 redirect_lid;
+ __be16 redirect_pkey;
+ __be32 redirect_qp;
+ __be32 redirect_qkey;
+ u8 trap_gid[16];
+ __be32 trap_tcslfl;
+ __be16 trap_lid;
+ __be16 trap_pkey;
+ __be32 trap_hlqp;
+ __be32 trap_qkey;
+};
+
+/**
+ * ib_mad_send_buf - MAD data buffer and work request for sends.
+ * @next: A pointer used to chain together MADs for posting.
+ * @mad: References an allocated MAD data buffer for MADs that do not have
+ * RMPP active. For MADs using RMPP, references the common and management
+ * class specific headers.
+ * @mad_agent: MAD agent that allocated the buffer.
+ * @ah: The address handle to use when sending the MAD.
+ * @context: User-controlled context fields.
+ * @hdr_len: Indicates the size of the data header of the MAD. This length
+ * includes the common MAD, RMPP, and class specific headers.
+ * @data_len: Indicates the total size of user-transferred data.
+ * @seg_count: The number of RMPP segments allocated for this send.
+ * @seg_size: Size of each RMPP segment.
+ * @timeout_ms: Time to wait for a response.
+ * @retries: Number of times to retry a request for a response.
+ *
+ * Users are responsible for initializing the MAD buffer itself, with the
+ * exception of any RMPP header. Additional segment buffer space allocated
+ * beyond data_len is padding.
+ */
+struct ib_mad_send_buf {
+ struct ib_mad_send_buf *next;
+ void *mad;
+ struct ib_mad_agent *mad_agent;
+ struct ib_ah *ah;
+ void *context[2];
+ int hdr_len;
+ int data_len;
+ int seg_count;
+ int seg_size;
+ int timeout_ms;
+ int retries;
+};
+
+/**
+ * ib_response_mad - Returns whether the specified MAD has been generated in
+ * response to a sent request or trap.
+ */
+int ib_response_mad(struct ib_mad *mad);
+
+/**
+ * ib_get_rmpp_resptime - Returns the RMPP response time.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr)
+{
+ return rmpp_hdr->rmpp_rtime_flags >> 3;
+}
+
+/**
+ * ib_get_rmpp_flags - Returns the RMPP flags.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr)
+{
+ return rmpp_hdr->rmpp_rtime_flags & 0x7;
+}
+
+/**
+ * ib_set_rmpp_resptime - Sets the response time in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime)
+{
+ rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3);
+}
+
+/**
+ * ib_set_rmpp_flags - Sets the flags in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @flags: The flags to set.
+ */
+static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags)
+{
+ rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF1) |
+ (flags & 0x7);
+}
+
+struct ib_mad_agent;
+struct ib_mad_send_wc;
+struct ib_mad_recv_wc;
+
+/**
+ * ib_mad_send_handler - callback handler for a sent MAD.
+ * @mad_agent: MAD agent that sent the MAD.
+ * @mad_send_wc: Send work completion information on the sent MAD.
+ */
+typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_snoop_handler - Callback handler for snooping sent MADs.
+ * @mad_agent: MAD agent that snooped the MAD.
+ * @send_buf: Send buffer information on the sent MAD.
+ * @mad_send_wc: Work completion information on the sent MAD. Valid
+ * only for snooping that occurs on a send completion.
+ *
+ * Clients snooping MADs should not modify data referenced by the @send_buf
+ * or @mad_send_wc.
+ */
+typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_recv_handler - callback handler for a received MAD.
+ * @mad_agent: MAD agent requesting the received MAD.
+ * @mad_recv_wc: Received work completion information on the received MAD.
+ *
+ * MADs received in response to a send request operation will be handed to
+ * the user before the send operation completes. All data buffers given
+ * to registered agents through this routine are owned by the receiving
+ * client, except for snooping agents. Clients snooping MADs should not
+ * modify the data referenced by @mad_recv_wc.
+ */
+typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_mad_agent - Used to track MAD registration with the access layer.
+ * @device: Reference to device registration is on.
+ * @qp: Reference to QP used for sending and receiving MADs.
+ * @mr: Memory region for system memory usable for DMA.
+ * @recv_handler: Callback handler for a received MAD.
+ * @send_handler: Callback handler for a sent MAD.
+ * @snoop_handler: Callback handler for snooped sent MADs.
+ * @context: User-specified context associated with this registration.
+ * @hi_tid: Access layer assigned transaction ID for this client.
+ * Unsolicited MADs sent by this client will have the upper 32-bits
+ * of their TID set to this value.
+ * @port_num: Port number on which QP is registered
+ * @rmpp_version: If set, indicates the RMPP version used by this agent.
+ */
+struct ib_mad_agent {
+ struct ib_device *device;
+ struct ib_qp *qp;
+ struct ib_mr *mr;
+ ib_mad_recv_handler recv_handler;
+ ib_mad_send_handler send_handler;
+ ib_mad_snoop_handler snoop_handler;
+ void *context;
+ u32 hi_tid;
+ u8 port_num;
+ u8 rmpp_version;
+};
+
+/**
+ * ib_mad_send_wc - MAD send completion information.
+ * @send_buf: Send MAD data buffer associated with the send MAD request.
+ * @status: Completion status.
+ * @vendor_err: Optional vendor error information returned with a failed
+ * request.
+ */
+struct ib_mad_send_wc {
+ struct ib_mad_send_buf *send_buf;
+ enum ib_wc_status status;
+ u32 vendor_err;
+};
+
+/**
+ * ib_mad_recv_buf - received MAD buffer information.
+ * @list: Reference to next data buffer for a received RMPP MAD.
+ * @grh: References a data buffer containing the global route header.
+ * The data referenced by this buffer is only valid if the GRH is
+ * valid.
+ * @mad: References the start of the received MAD.
+ */
+struct ib_mad_recv_buf {
+ TAILQ_ENTRY(ib_mad_recv_buf) entry;
+ struct ib_grh *grh;
+ struct ib_mad *mad;
+};
+
+/**
+ * ib_mad_recv_wc - received MAD information.
+ * @wc: Completion information for the received data.
+ * @recv_buf: Specifies the location of the received data buffer(s).
+ * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers.
+ * @mad_len: The length of the received MAD, without duplicated headers.
+ *
+ * For a received response, the wr_id contains a pointer to the ib_mad_send_buf
+ * for the corresponding send request.
+ */
+struct ib_mad_recv_wc {
+ struct ib_wc *wc;
+ struct ib_mad_recv_buf recv_buf;
+ TAILQ_ENTRY(ib_mad_recv_wc) entry;
+ int mad_len;
+};
+
+/**
+ * ib_mad_reg_req - MAD registration request
+ * @mgmt_class: Indicates which management class of MADs should be received
+ * by the caller. This field is only required if the user wishes to
+ * receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version: Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ * @method_mask: The caller will receive unsolicited MADs for any method
+ * where @method_mask = 1.
+ */
+struct ib_mad_reg_req {
+ u8 mgmt_class;
+ u8 mgmt_class_version;
+ u8 oui[3];
+#ifdef notyet
+ DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS);
+#endif
+};
+
+/**
+ * ib_register_mad_agent - Register to send/receive MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP to access. Must be either
+ * IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_reg_req: Specifies which unsolicited MADs should be received
+ * by the caller. This parameter may be NULL if the caller only
+ * wishes to receive solicited responses.
+ * @rmpp_version: If set, indicates that the client will send
+ * and receive MADs that contain the RMPP header for the given version.
+ * If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ * request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ * MAD.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+enum ib_mad_snoop_flags {
+ /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/
+ /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/
+ IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2),
+ /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/
+ IB_MAD_SNOOP_RECVS = (1<<4)
+ /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/
+ /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/
+};
+
+/**
+ * ib_register_mad_snoop - Register to snoop sent and received MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP traffic to snoop. Must be either
+ * IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_snoop_flags: Specifies the points at which snooping occurs.
+ * @send_handler: The callback routine invoked for a snooped send.
+ * @recv_handler: The callback routine invoked for a snooped receive.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ int mad_snoop_flags,
+ ib_mad_snoop_handler snoop_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+/**
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services.
+ * @mad_agent: Corresponding MAD registration request to deregister.
+ *
+ * After invoking this routine, MAD services are no longer usable by the
+ * client on the associated QP.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent);
+
+/**
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client.
+ * @send_buf: Specifies the information needed to send the MAD(s).
+ * @bad_send_buf: Specifies the MAD on which an error was encountered. This
+ * parameter is optional if only a single MAD is posted.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
+ *
+ * If the MAD requires RMPP, the data buffer should contain a single copy
+ * of the common MAD, RMPP, and class specific headers, followed by the class
+ * defined data. If the class defined data would not divide evenly into
+ * RMPP segments, then space must be allocated at the end of the referenced
+ * buffer for any required padding. To indicate the amount of class defined
+ * data being transferred, the paylen_newwin field in the RMPP header should
+ * be set to the size of the class specific header plus the amount of class
+ * defined data being transferred. The paylen_newwin field should be
+ * specified in network-byte order.
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf);
+
+
+/**
+ * ib_free_recv_mad - Returns data buffers used to receive a MAD.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ *
+ * Clients receiving MADs through their ib_mad_recv_handler must call this
+ * routine to return the work completion buffers to the access layer.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf);
+
+/**
+ * ib_modify_mad - Modifies an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to modify.
+ * @timeout_ms: New timeout value for sent MAD.
+ *
+ * This call will reset the timeout value for a sent MAD to the specified
+ * value.
+ */
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf, u32 timeout_ms);
+
+/**
+ * ib_redirect_mad_qp - Registers a QP for MAD services.
+ * @qp: Reference to a QP that requires MAD services.
+ * @rmpp_version: If set, indicates that the client will send
+ * and receive MADs that contain the RMPP header for the given version.
+ * If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ * request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ * MAD.
+ * @context: User specified context associated with the registration.
+ *
+ * Use of this call allows clients to use MAD services, such as RMPP,
+ * on user-owned QPs. After calling this routine, users may send
+ * MADs on the specified QP by calling ib_post_send_mad.
+ */
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+/**
+ * ib_process_mad_wc - Processes a work completion associated with a
+ * MAD sent or received on a redirected QP.
+ * @mad_agent: Specifies the registered MAD service using the redirected QP.
+ * @wc: References a work completion associated with a sent or received
+ * MAD segment.
+ *
+ * This routine is used to complete or continue processing on a MAD request.
+ * If the work completion is associated with a send operation, calling
+ * this routine is required to continue an RMPP transfer or to wait for a
+ * corresponding response, if it is a request. If the work completion is
+ * associated with a receive operation, calling this routine is required to
+ * process an inbound or outbound RMPP transfer, or to match a response MAD
+ * with its corresponding request.
+ */
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ struct ib_wc *wc);
+
+/**
+ * ib_create_send_mad - Allocate and initialize a data buffer and work request
+ * for sending a MAD.
+ * @mad_agent: Specifies the registered MAD service to associate with the MAD.
+ * @remote_qpn: Specifies the QPN of the receiving node.
+ * @pkey_index: Specifies which PKey the MAD will be sent using. This field
+ * is valid only if the remote_qpn is QP 1.
+ * @rmpp_active: Indicates if the send will enable RMPP.
+ * @hdr_len: Indicates the size of the data header of the MAD. This length
+ * should include the common MAD header, RMPP header, plus any class
+ * specific header.
+ * @data_len: Indicates the size of any user-transferred data. The call will
+ * automatically adjust the allocated buffer size to account for any
+ * additional padding that may be necessary.
+ * @gfp_mask: GFP mask used for the memory allocation.
+ *
+ * This routine allocates a MAD for sending. The returned MAD send buffer
+ * will reference a data buffer usable for sending a MAD, along
+ * with an initialized work request structure. Users may modify the returned
+ * MAD data buffer before posting the send.
+ *
+ * The returned MAD header, class specific headers, and any padding will be
+ * cleared. Users are responsible for initializing the common MAD header,
+ * any class specific header, and MAD data area.
+ * If @rmpp_active is set, the RMPP header will be initialized for sending.
+ */
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ int gfp_mask);
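A hedged allocation sketch: the buffer-length enums (IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA), the msg->mad field, and ib_post_send_mad() are assumed from the earlier, unshown part of this header, and M_NOWAIT stands in for whatever allocation flag the port's gfp_mask parameter expects.

static int
send_one_mad(struct ib_mad_agent *agent, u32 remote_qpn)
{
	struct ib_mad_send_buf *msg;
	struct ib_mad *mad;
	int ret;

	msg = ib_create_send_mad(agent, remote_qpn, 0 /* pkey index */,
	    0 /* no RMPP */, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, M_NOWAIT);
	if (IS_ERR(msg))
		return (PTR_ERR(msg));

	mad = msg->mad;			/* header and padding arrive zeroed */
	mad->mad_hdr.base_version = 1;
	/* ... class, method, TID and attributes filled in by the caller ... */

	ret = ib_post_send_mad(msg, NULL);
	if (ret)
		ib_free_send_mad(msg);	/* only freed here on post failure */
	return (ret);
}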
+
+/**
+ * ib_is_mad_class_rmpp - returns whether given management class
+ * supports RMPP.
+ * @mgmt_class: management class
+ *
+ * This routine returns whether the management class supports RMPP.
+ */
+int ib_is_mad_class_rmpp(u8 mgmt_class);
+
+/**
+ * ib_get_mad_data_offset - returns the data offset for a given
+ * management class.
+ * @mgmt_class: management class
+ *
+ * This routine returns the data offset in the MAD for the management
+ * class requested.
+ */
+int ib_get_mad_data_offset(u8 mgmt_class);
+
+/**
+ * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg_num: number of segment to return
+ *
+ * This routine returns a pointer to the data buffer of an RMPP MAD.
+ * Users must provide synchronization to @send_buf around this call.
+ */
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num);
+
+/**
+ * ib_free_send_mad - Returns data buffers used to send a MAD.
+ * @send_buf: Previously allocated send data buffer.
+ */
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
+
+#endif /* IB_MAD_H */
diff --git a/sys/contrib/rdma/ib_marshall.h b/sys/contrib/rdma/ib_marshall.h
new file mode 100644
index 000000000000..60cd219efd04
--- /dev/null
+++ b/sys/contrib/rdma/ib_marshall.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#if !defined(IB_USER_MARSHALL_H)
+#define IB_USER_MARSHALL_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+ struct ib_qp_attr *src);
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+ struct ib_ah_attr *src);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct ib_sa_path_rec *src);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+ struct ib_user_path_rec *src);
+
+#endif /* IB_USER_MARSHALL_H */
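These helpers exist so the userspace wire layout stays stable regardless of kernel struct changes. A small illustrative use, assuming the caller owns both buffers:

/* Marshal a kernel SA path record into the fixed userspace layout
 * before handing it to copyout(); ib_copy_path_rec_from_user() undoes
 * the transformation on the way in. */
static void
report_path(struct ib_sa_path_rec *krec, struct ib_user_path_rec *urec)
{
	ib_copy_path_rec_to_user(urec, krec);
	/* urec now packs identically for 32- and 64-bit consumers. */
}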
diff --git a/sys/contrib/rdma/ib_pack.h b/sys/contrib/rdma/ib_pack.h
new file mode 100644
index 000000000000..206d1f1c0a63
--- /dev/null
+++ b/sys/contrib/rdma/ib_pack.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_pack.h 1349 2004-12-16 21:09:43Z roland $
+ *
+ * $FreeBSD$
+ */
+
+#ifndef IB_PACK_H
+#define IB_PACK_H
+
+#include <rdma/ib_verbs.h>
+
+enum {
+ IB_LRH_BYTES = 8,
+ IB_GRH_BYTES = 40,
+ IB_BTH_BYTES = 12,
+ IB_DETH_BYTES = 8
+};
+
+struct ib_field {
+ size_t struct_offset_bytes;
+ size_t struct_size_bytes;
+ int offset_words;
+ int offset_bits;
+ int size_bits;
+ char *field_name;
+};
+
+#define RESERVED \
+ .field_name = "reserved"
+
+/*
+ * This macro cleans up the definitions of constants for BTH opcodes.
+ * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY,
+ * which expands to IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, yielding the
+ * correct opcode value.
+ *
+ * In short, user code should use the constants defined using the
+ * macro rather than worrying about adding together other constants.
+ */
+#define IB_OPCODE(transport, op) \
+ IB_OPCODE_ ## transport ## _ ## op = \
+ IB_OPCODE_ ## transport + IB_OPCODE_ ## op
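A worked expansion, for concreteness:

/*
 * For example, IB_OPCODE(UD, SEND_ONLY) in the enum below expands to
 *
 *	IB_OPCODE_UD_SEND_ONLY = IB_OPCODE_UD + IB_OPCODE_SEND_ONLY
 *	                       = 0x60 + 0x04 = 0x64
 *
 * which is the BTH OpCode value the IBA spec assigns to a UD
 * "Send Only" packet.
 */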
+
+enum {
+ /* transport types -- just used to define real constants */
+ IB_OPCODE_RC = 0x00,
+ IB_OPCODE_UC = 0x20,
+ IB_OPCODE_RD = 0x40,
+ IB_OPCODE_UD = 0x60,
+
+ /* operations -- just used to define real constants */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+
+	/* real constants follow -- see the comment above about the
+	   IB_OPCODE() macro for more details */
+
+ /* RC */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+
+ /* UC */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+
+ /* UD */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+
+enum {
+ IB_LNH_RAW = 0,
+ IB_LNH_IP = 1,
+ IB_LNH_IBA_LOCAL = 2,
+ IB_LNH_IBA_GLOBAL = 3
+};
+
+struct ib_unpacked_lrh {
+ u8 virtual_lane;
+ u8 link_version;
+ u8 service_level;
+ u8 link_next_header;
+ __be16 destination_lid;
+ __be16 packet_length;
+ __be16 source_lid;
+};
+
+struct ib_unpacked_grh {
+ u8 ip_version;
+ u8 traffic_class;
+ __be32 flow_label;
+ __be16 payload_length;
+ u8 next_header;
+ u8 hop_limit;
+ union ib_gid source_gid;
+ union ib_gid destination_gid;
+};
+
+struct ib_unpacked_bth {
+ u8 opcode;
+ u8 solicited_event;
+ u8 mig_req;
+ u8 pad_count;
+ u8 transport_header_version;
+ __be16 pkey;
+ __be32 destination_qpn;
+ u8 ack_req;
+ __be32 psn;
+};
+
+struct ib_unpacked_deth {
+ __be32 qkey;
+ __be32 source_qpn;
+};
+
+struct ib_ud_header {
+ struct ib_unpacked_lrh lrh;
+ int grh_present;
+ struct ib_unpacked_grh grh;
+ struct ib_unpacked_bth bth;
+ struct ib_unpacked_deth deth;
+ int immediate_present;
+ __be32 immediate_data;
+};
+
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf);
+
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure);
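To make the descriptor-driven API concrete, here is a hedged sketch of a field table for the DETH defined above. The offsets follow the IBA layout (Q_Key in word 0, SrcQP in the low 24 bits of word 1), but the table is illustrative, not necessarily the one the core code ships.

static const struct ib_field deth_table[] = {
	{ offsetof(struct ib_unpacked_deth, qkey),
	  sizeof(__be32), 0, 0, 32, "Q_Key" },
	{ offsetof(struct ib_unpacked_deth, source_qpn),
	  sizeof(__be32), 1, 8, 24, "SrcQP" }
};

/* Round-trip a DETH through its wire form; buf must hold at least
 * IB_DETH_BYTES. */
static void
deth_roundtrip(struct ib_unpacked_deth *deth, void *buf)
{
	ib_pack(deth_table, 2, deth, buf);
	ib_unpack(deth_table, 2, buf, deth);
}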
+
+void ib_ud_header_init(int payload_bytes,
+ int grh_present,
+ struct ib_ud_header *header);
+
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf);
+
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header);
+
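And a hedged end-to-end sketch of the UD header helpers. The byte-order macros and placeholder LID/QPN values are assumptions, 0x80010000 is the well-known GSI Q_Key, and the pack routine is taken to return the header length in bytes.

static int
build_ud_header(void *buf, int payload_bytes)
{
	struct ib_ud_header hdr;

	ib_ud_header_init(payload_bytes, 0 /* no GRH */, &hdr);
	hdr.lrh.destination_lid = htons(0x1234);	/* placeholder DLID */
	hdr.bth.destination_qpn = htonl(0xabc);		/* placeholder QPN */
	hdr.deth.qkey = htonl(0x80010000);		/* GSI Q_Key */
	hdr.deth.source_qpn = htonl(1);

	return (ib_ud_header_pack(&hdr, buf));
}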
+#endif /* IB_PACK_H */
diff --git a/sys/contrib/rdma/ib_sa.h b/sys/contrib/rdma/ib_sa.h
new file mode 100644
index 000000000000..bf6a28a1c616
--- /dev/null
+++ b/sys/contrib/rdma/ib_sa.h
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_sa.h 2811 2005-07-06 18:11:43Z halr $
+ *
+ * $FreeBSD$
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <machine/atomic.h>
+
+#include <contrib/rdma/ib_verbs.h>
+#include <contrib/rdma/ib_mad.h>
+
+enum {
+ IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */
+
+ IB_SA_METHOD_GET_TABLE = 0x12,
+ IB_SA_METHOD_GET_TABLE_RESP = 0x92,
+ IB_SA_METHOD_DELETE = 0x15,
+ IB_SA_METHOD_DELETE_RESP = 0x95,
+ IB_SA_METHOD_GET_MULTI = 0x14,
+ IB_SA_METHOD_GET_MULTI_RESP = 0x94,
+ IB_SA_METHOD_GET_TRACE_TBL = 0x13
+};
+
+enum {
+ IB_SA_ATTR_CLASS_PORTINFO = 0x01,
+ IB_SA_ATTR_NOTICE = 0x02,
+ IB_SA_ATTR_INFORM_INFO = 0x03,
+ IB_SA_ATTR_NODE_REC = 0x11,
+ IB_SA_ATTR_PORT_INFO_REC = 0x12,
+ IB_SA_ATTR_SL2VL_REC = 0x13,
+ IB_SA_ATTR_SWITCH_REC = 0x14,
+ IB_SA_ATTR_LINEAR_FDB_REC = 0x15,
+ IB_SA_ATTR_RANDOM_FDB_REC = 0x16,
+ IB_SA_ATTR_MCAST_FDB_REC = 0x17,
+ IB_SA_ATTR_SM_INFO_REC = 0x18,
+ IB_SA_ATTR_LINK_REC = 0x20,
+ IB_SA_ATTR_GUID_INFO_REC = 0x30,
+ IB_SA_ATTR_SERVICE_REC = 0x31,
+ IB_SA_ATTR_PARTITION_REC = 0x33,
+ IB_SA_ATTR_PATH_REC = 0x35,
+ IB_SA_ATTR_VL_ARB_REC = 0x36,
+ IB_SA_ATTR_MC_MEMBER_REC = 0x38,
+ IB_SA_ATTR_TRACE_REC = 0x39,
+ IB_SA_ATTR_MULTI_PATH_REC = 0x3a,
+ IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b,
+ IB_SA_ATTR_INFORM_INFO_REC = 0xf3
+};
+
+enum ib_sa_selector {
+ IB_SA_GT = 0,
+ IB_SA_LT = 1,
+ IB_SA_EQ = 2,
+ /*
+ * The meaning of "best" depends on the attribute: for
+ * example, for MTU best will return the largest available
+ * MTU, while for packet life time, best will return the
+ * smallest available life time.
+ */
+ IB_SA_BEST = 3
+};
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec." No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+/* reserved: 0 */
+/* reserved: 1 */
+#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6)
+/* reserved: 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13)
+/* reserved: 14 */
+#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22)
+
+struct ib_sa_path_rec {
+ /* reserved */
+ /* reserved */
+ union ib_gid dgid;
+ union ib_gid sgid;
+ __be16 dlid;
+ __be16 slid;
+ int raw_traffic;
+ /* reserved */
+ __be32 flow_label;
+ u8 hop_limit;
+ u8 traffic_class;
+ int reversible;
+ u8 numb_path;
+ __be16 pkey;
+ /* reserved */
+ u8 sl;
+ u8 mtu_selector;
+ u8 mtu;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 preference;
+};
+
+#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+ union ib_gid mgid;
+ union ib_gid port_gid;
+ __be32 qkey;
+ __be16 mlid;
+ u8 mtu_selector;
+ u8 mtu;
+ u8 traffic_class;
+ __be16 pkey;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 sl;
+ __be32 flow_label;
+ u8 hop_limit;
+ u8 scope;
+ u8 join_state;
+ int proxy_join;
+};
+
+/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */
+#define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0)
+#define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1)
+#define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2)
+/* reserved: 3 */
+#define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4)
+#define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5)
+#define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36)
+
+#define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF
+
+struct ib_sa_service_rec {
+ u64 id;
+ union ib_gid gid;
+ __be16 pkey;
+ /* reserved */
+ u32 lease;
+ u8 key[16];
+ u8 name[64];
+ u8 data8[16];
+ u16 data16[8];
+ u32 data32[4];
+ u64 data64[2];
+};
+
+struct ib_sa_client {
+ volatile int users;
+#ifdef notyet
+ struct completion comp;
+#endif
+};
+
+/**
+ * ib_sa_register_client - Register an SA client.
+ */
+void ib_sa_register_client(struct ib_sa_client *client);
+
+/**
+ * ib_sa_unregister_client - Deregister an SA client.
+ * @client: Client object to deregister.
+ */
+void ib_sa_unregister_client(struct ib_sa_client *client);
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **query);
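A hedged usage sketch: query the SA for a path keyed by DGID/SGID/PKEY through the component mask. The client is assumed to have been registered with ib_sa_register_client(), and M_NOWAIT again stands in for the port's allocation flag.

static void
path_done(int status, struct ib_sa_path_rec *resp, void *context)
{
	if (status)
		printf("path query failed: %d\n", status);
	/* else resp describes a usable path; copy it before returning */
}

static int
query_path(struct ib_sa_client *client, struct ib_device *dev, u8 port,
    union ib_gid *dgid, union ib_gid *sgid, __be16 pkey)
{
	struct ib_sa_path_rec rec;
	struct ib_sa_query *query;

	memset(&rec, 0, sizeof(rec));
	rec.dgid = *dgid;
	rec.sgid = *sgid;
	rec.pkey = pkey;

	/* Returns a query id (>= 0) usable with ib_sa_cancel_query(). */
	return (ib_sa_path_rec_get(client, dev, port, &rec,
	    IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY,
	    2000 /* ms */, M_NOWAIT, path_done, NULL, &query));
}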
+
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, int gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_service_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+struct ib_sa_multicast {
+ struct ib_sa_mcmember_rec rec;
+ ib_sa_comp_mask comp_mask;
+ int (*callback)(int status,
+ struct ib_sa_multicast *multicast);
+ void *context;
+};
+
+/**
+ * ib_sa_join_multicast - Initiates a join request to the specified multicast
+ * group.
+ * @client: SA client
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @rec: SA multicast member record specifying group attributes.
+ * @comp_mask: Component mask indicating which group attributes of %rec are
+ * valid.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the join operation completes.
+ * @context: User specified context stored with the ib_sa_multicast structure.
+ *
+ * This call initiates a multicast join request with the SA for the specified
+ * multicast group. If the join operation is started successfully, it returns
+ * an ib_sa_multicast structure that is used to track the multicast operation.
+ * Users must free this structure by calling ib_sa_free_multicast, even if
+ * the join operation later fails (i.e., the callback status is non-zero).
+ *
+ * If the join operation fails, status will be non-zero, with the following
+ * failures possible:
+ * ETIMEDOUT: The request timed out.
+ * EIO: An error occurred sending the query.
+ * EINVAL: The MCMemberRecord values differed from the existing group's.
+ * ENETRESET: Indicates that a fatal error has occurred on the multicast
+ * group, and the user must rejoin the group to continue using it.
+ */
+struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask, int gfp_mask,
+ int (*callback)(int status,
+ struct ib_sa_multicast
+ *multicast),
+ void *context);
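A hedged join sketch matching the contract described above: the callback's return value decides whether the tracking structure survives, and the returned pointer must eventually go to ib_sa_free_multicast().

static int
mcast_done(int status, struct ib_sa_multicast *mc)
{
	if (status)
		printf("multicast join failed: %d\n", status);
	return (0);	/* non-zero here would tear the membership down */
}

static struct ib_sa_multicast *
join_group(struct ib_sa_client *client, struct ib_device *dev, u8 port,
    struct ib_sa_mcmember_rec *rec)
{
	return (ib_sa_join_multicast(client, dev, port, rec,
	    IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
	    IB_SA_MCMEMBER_REC_JOIN_STATE, M_NOWAIT, mcast_done, NULL));
}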
+
+/**
+ * ib_sa_free_multicast - Frees the multicast tracking structure, and
+ * releases any reference on the multicast group.
+ * @multicast: Multicast tracking structure allocated by ib_sa_join_multicast.
+ *
+ * This call blocks until the multicast identifier is destroyed. It may
+ * not be called from within the multicast callback; however, returning a non-
+ * zero value from the callback will result in destroying the multicast
+ * tracking structure.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
+
+/**
+ * ib_sa_get_mcmember_rec - Looks up a multicast member record by its MGID
+ * and returns it if found.
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @mgid: MGID of multicast group.
+ * @rec: Location to copy SA multicast member record.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+ union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize address handle attributes based on
+ * an SA multicast member record.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_path - Initialize address handle attributes based on an SA
+ * path record.
+ */
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ struct ib_ah_attr *ah_attr);
+
+#endif /* IB_SA_H */
diff --git a/sys/contrib/rdma/ib_smi.h b/sys/contrib/rdma/ib_smi.h
new file mode 100644
index 000000000000..0e4b1e940d8e
--- /dev/null
+++ b/sys/contrib/rdma/ib_smi.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_smi.h 1389 2004-12-27 22:56:47Z roland $
+ *
+ * $FreeBSD$
+ */
+
+#if !defined(IB_SMI_H)
+#define IB_SMI_H
+
+#include <rdma/ib_mad.h>
+
+#define IB_SMP_DATA_SIZE 64
+#define IB_SMP_MAX_PATH_HOPS 64
+
+struct ib_smp {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ u8 hop_ptr;
+ u8 hop_cnt;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+ __be64 mkey;
+ __be16 dr_slid;
+ __be16 dr_dlid;
+ u8 reserved[28];
+ u8 data[IB_SMP_DATA_SIZE];
+ u8 initial_path[IB_SMP_MAX_PATH_HOPS];
+ u8 return_path[IB_SMP_MAX_PATH_HOPS];
+} __attribute__ ((packed));
+
+#define IB_SMP_DIRECTION __constant_htons(0x8000)
+
+/* Subnet management attributes */
+#define IB_SMP_ATTR_NOTICE __constant_htons(0x0002)
+#define IB_SMP_ATTR_NODE_DESC __constant_htons(0x0010)
+#define IB_SMP_ATTR_NODE_INFO __constant_htons(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO __constant_htons(0x0012)
+#define IB_SMP_ATTR_GUID_INFO __constant_htons(0x0014)
+#define IB_SMP_ATTR_PORT_INFO __constant_htons(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE __constant_htons(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE __constant_htons(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE __constant_htons(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE __constant_htons(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE __constant_htons(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE __constant_htons(0x001B)
+#define IB_SMP_ATTR_SM_INFO __constant_htons(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG __constant_htons(0x0030)
+#define IB_SMP_ATTR_LED_INFO __constant_htons(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK __constant_htons(0xFF00)
+
+struct ib_port_info {
+ __be64 mkey;
+ __be64 gid_prefix;
+ __be16 lid;
+ __be16 sm_lid;
+ __be32 cap_mask;
+ __be16 diag_code;
+ __be16 mkey_lease_period;
+ u8 local_port_num;
+ u8 link_width_enabled;
+ u8 link_width_supported;
+ u8 link_width_active;
+ u8 linkspeed_portstate; /* 4 bits, 4 bits */
+ u8 portphysstate_linkdown; /* 4 bits, 4 bits */
+ u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */
+ u8 linkspeedactive_enabled; /* 4 bits, 4 bits */
+ u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */
+ u8 vlcap_inittype; /* 4 bits, 4 bits */
+ u8 vl_high_limit;
+ u8 vl_arb_high_cap;
+ u8 vl_arb_low_cap;
+ u8 inittypereply_mtucap; /* 4 bits, 4 bits */
+ u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */
+ u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */
+ __be16 mkey_violations;
+ __be16 pkey_violations;
+ __be16 qkey_violations;
+ u8 guid_cap;
+ u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */
+ u8 resv_resptimevalue; /* 3 bits, 5 bits */
+ u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */
+ __be16 max_credit_hint;
+ u8 resv;
+ u8 link_roundtrip_latency[3];
+};
+
+static inline u8
+ib_get_smp_direction(struct ib_smp *smp)
+{
+ return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
+}
+
+#endif /* IB_SMI_H */
diff --git a/sys/contrib/rdma/ib_umem.h b/sys/contrib/rdma/ib_umem.h
new file mode 100644
index 000000000000..50dd5dac867d
--- /dev/null
+++ b/sys/contrib/rdma/ib_umem.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef IB_UMEM_H
+#define IB_UMEM_H
+
+struct ib_ucontext;
+
+struct ib_umem_chunk {
+ TAILQ_ENTRY(ib_umem_chunk) entry;
+ int nents;
+ int nmap;
+ struct rdma_scatterlist page_list[0];
+};
+
+struct ib_umem {
+ struct ib_ucontext *context;
+ size_t length;
+ int offset;
+ int page_size;
+ int writable;
+ TAILQ_HEAD(, ib_umem_chunk) chunk_list;
+#ifdef notyet
+ struct work_struct work;
+ struct mm_struct *mm;
+#endif
+ unsigned long diff;
+};
+
+#ifdef CONFIG_INFINIBAND_USER_MEM
+
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access);
+void ib_umem_release(struct ib_umem *umem);
+int ib_umem_page_count(struct ib_umem *umem);
+
+#else /* CONFIG_INFINIBAND_USER_MEM */
+
+
+static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
+ unsigned long addr, size_t size,
+ int access) {
+ return ERR_PTR(EINVAL);
+}
+static inline void ib_umem_release(struct ib_umem *umem) { }
+static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; }
+
+#endif /* CONFIG_INFINIBAND_USER_MEM */
+
+#endif /* IB_UMEM_H */
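A hedged driver-side sketch of the pin/count/release cycle, assuming CONFIG_INFINIBAND_USER_MEM is configured and the usual error-pointer helpers are available:

static int
pin_user_buffer(struct ib_ucontext *ctx, unsigned long uaddr, size_t len,
    int access)
{
	struct ib_umem *umem;
	int npages;

	umem = ib_umem_get(ctx, uaddr, len, access);
	if (IS_ERR(umem))
		return (PTR_ERR(umem));

	npages = ib_umem_page_count(umem);
	/* ... walk umem->chunk_list to build the HCA's page list ... */

	ib_umem_release(umem);
	return (npages);
}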
diff --git a/sys/contrib/rdma/ib_user_cm.h b/sys/contrib/rdma/ib_user_cm.h
new file mode 100644
index 000000000000..e8815d51cf5b
--- /dev/null
+++ b/sys/contrib/rdma/ib_user_cm.h
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_user_cm.h 4019 2005-11-11 00:33:09Z sean.hefty $
+ *
+ * $FreeBSD$
+ */
+
+#ifndef IB_USER_CM_H
+#define IB_USER_CM_H
+
+#include <rdma/ib_user_sa.h>
+
+#define IB_USER_CM_ABI_VERSION 5
+
+enum {
+ IB_USER_CM_CMD_CREATE_ID,
+ IB_USER_CM_CMD_DESTROY_ID,
+ IB_USER_CM_CMD_ATTR_ID,
+
+ IB_USER_CM_CMD_LISTEN,
+ IB_USER_CM_CMD_NOTIFY,
+
+ IB_USER_CM_CMD_SEND_REQ,
+ IB_USER_CM_CMD_SEND_REP,
+ IB_USER_CM_CMD_SEND_RTU,
+ IB_USER_CM_CMD_SEND_DREQ,
+ IB_USER_CM_CMD_SEND_DREP,
+ IB_USER_CM_CMD_SEND_REJ,
+ IB_USER_CM_CMD_SEND_MRA,
+ IB_USER_CM_CMD_SEND_LAP,
+ IB_USER_CM_CMD_SEND_APR,
+ IB_USER_CM_CMD_SEND_SIDR_REQ,
+ IB_USER_CM_CMD_SEND_SIDR_REP,
+
+ IB_USER_CM_CMD_EVENT,
+ IB_USER_CM_CMD_INIT_QP_ATTR,
+};
+/*
+ * command ABI structures.
+ */
+struct ib_ucm_cmd_hdr {
+ __u32 cmd;
+ __u16 in;
+ __u16 out;
+};
+
+struct ib_ucm_create_id {
+ __u64 uid;
+ __u64 response;
+};
+
+struct ib_ucm_create_id_resp {
+ __u32 id;
+};
+
+struct ib_ucm_destroy_id {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct ib_ucm_destroy_id_resp {
+ __u32 events_reported;
+};
+
+struct ib_ucm_attr_id {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct ib_ucm_attr_id_resp {
+ __be64 service_id;
+ __be64 service_mask;
+ __be32 local_id;
+ __be32 remote_id;
+};
+
+struct ib_ucm_init_qp_attr {
+ __u64 response;
+ __u32 id;
+ __u32 qp_state;
+};
+
+struct ib_ucm_listen {
+ __be64 service_id;
+ __be64 service_mask;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct ib_ucm_notify {
+ __u32 id;
+ __u32 event;
+};
+
+struct ib_ucm_private_data {
+ __u64 data;
+ __u32 id;
+ __u8 len;
+ __u8 reserved[3];
+};
+
+struct ib_ucm_req {
+ __u32 id;
+ __u32 qpn;
+ __u32 qp_type;
+ __u32 psn;
+ __be64 sid;
+ __u64 data;
+ __u64 primary_path;
+ __u64 alternate_path;
+ __u8 len;
+ __u8 peer_to_peer;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 remote_cm_response_timeout;
+ __u8 flow_control;
+ __u8 local_cm_response_timeout;
+ __u8 retry_count;
+ __u8 rnr_retry_count;
+ __u8 max_cm_retries;
+ __u8 srq;
+ __u8 reserved[5];
+};
+
+struct ib_ucm_rep {
+ __u64 uid;
+ __u64 data;
+ __u32 id;
+ __u32 qpn;
+ __u32 psn;
+ __u8 len;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 target_ack_delay;
+ __u8 failover_accepted;
+ __u8 flow_control;
+ __u8 rnr_retry_count;
+ __u8 srq;
+ __u8 reserved[4];
+};
+
+struct ib_ucm_info {
+ __u32 id;
+ __u32 status;
+ __u64 info;
+ __u64 data;
+ __u8 info_len;
+ __u8 data_len;
+ __u8 reserved[6];
+};
+
+struct ib_ucm_mra {
+ __u64 data;
+ __u32 id;
+ __u8 len;
+ __u8 timeout;
+ __u8 reserved[2];
+};
+
+struct ib_ucm_lap {
+ __u64 path;
+ __u64 data;
+ __u32 id;
+ __u8 len;
+ __u8 reserved[3];
+};
+
+struct ib_ucm_sidr_req {
+ __u32 id;
+ __u32 timeout;
+ __be64 sid;
+ __u64 data;
+ __u64 path;
+ __u16 reserved_pkey;
+ __u8 len;
+ __u8 max_cm_retries;
+ __u8 reserved[4];
+};
+
+struct ib_ucm_sidr_rep {
+ __u32 id;
+ __u32 qpn;
+ __u32 qkey;
+ __u32 status;
+ __u64 info;
+ __u64 data;
+ __u8 info_len;
+ __u8 data_len;
+ __u8 reserved[6];
+};
+/*
+ * event notification ABI structures.
+ */
+struct ib_ucm_event_get {
+ __u64 response;
+ __u64 data;
+ __u64 info;
+ __u8 data_len;
+ __u8 info_len;
+ __u8 reserved[6];
+};
+
+struct ib_ucm_req_event_resp {
+ struct ib_user_path_rec primary_path;
+ struct ib_user_path_rec alternate_path;
+ __be64 remote_ca_guid;
+ __u32 remote_qkey;
+ __u32 remote_qpn;
+ __u32 qp_type;
+ __u32 starting_psn;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 local_cm_response_timeout;
+ __u8 flow_control;
+ __u8 remote_cm_response_timeout;
+ __u8 retry_count;
+ __u8 rnr_retry_count;
+ __u8 srq;
+ __u8 port;
+ __u8 reserved[7];
+};
+
+struct ib_ucm_rep_event_resp {
+ __be64 remote_ca_guid;
+ __u32 remote_qkey;
+ __u32 remote_qpn;
+ __u32 starting_psn;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 target_ack_delay;
+ __u8 failover_accepted;
+ __u8 flow_control;
+ __u8 rnr_retry_count;
+ __u8 srq;
+ __u8 reserved[5];
+};
+
+struct ib_ucm_rej_event_resp {
+ __u32 reason;
+ /* ari in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_mra_event_resp {
+ __u8 timeout;
+ __u8 reserved[3];
+};
+
+struct ib_ucm_lap_event_resp {
+ struct ib_user_path_rec path;
+};
+
+struct ib_ucm_apr_event_resp {
+ __u32 status;
+ /* apr info in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_sidr_req_event_resp {
+ __u16 pkey;
+ __u8 port;
+ __u8 reserved;
+};
+
+struct ib_ucm_sidr_rep_event_resp {
+ __u32 status;
+ __u32 qkey;
+ __u32 qpn;
+ /* info in ib_ucm_event_get info field. */
+};
+
+#define IB_UCM_PRES_DATA 0x01
+#define IB_UCM_PRES_INFO 0x02
+#define IB_UCM_PRES_PRIMARY 0x04
+#define IB_UCM_PRES_ALTERNATE 0x08
+
+struct ib_ucm_event_resp {
+ __u64 uid;
+ __u32 id;
+ __u32 event;
+ __u32 present;
+ __u32 reserved;
+ union {
+ struct ib_ucm_req_event_resp req_resp;
+ struct ib_ucm_rep_event_resp rep_resp;
+ struct ib_ucm_rej_event_resp rej_resp;
+ struct ib_ucm_mra_event_resp mra_resp;
+ struct ib_ucm_lap_event_resp lap_resp;
+ struct ib_ucm_apr_event_resp apr_resp;
+
+ struct ib_ucm_sidr_req_event_resp sidr_req_resp;
+ struct ib_ucm_sidr_rep_event_resp sidr_rep_resp;
+
+ __u32 send_status;
+ } u;
+};
+
+#endif /* IB_USER_CM_H */
diff --git a/sys/contrib/rdma/ib_user_mad.h b/sys/contrib/rdma/ib_user_mad.h
new file mode 100644
index 000000000000..ec1d50987c88
--- /dev/null
+++ b/sys/contrib/rdma/ib_user_mad.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_user_mad.h 2814 2005-07-06 19:14:09Z halr $
+ *
+ * $FreeBSD$
+ */
+
+#ifndef IB_USER_MAD_H
+#define IB_USER_MAD_H
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_MAD_ABI_VERSION 5
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ */
+
+/**
+ * ib_user_mad_hdr - MAD packet header
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ * received (transaction ID in data[] will be set to TID of original
+ * request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ */
+struct ib_user_mad_hdr {
+ __u32 id;
+ __u32 status;
+ __u32 timeout_ms;
+ __u32 retries;
+ __u32 length;
+ __be32 qpn;
+ __be32 qkey;
+ __be16 lid;
+ __u8 sl;
+ __u8 path_bits;
+ __u8 grh_present;
+ __u8 gid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 gid[16];
+ __be32 flow_label;
+};
+
+/**
+ * ib_user_mad - MAD packet
+ * @hdr - MAD packet header
+ * @data - Contents of MAD
+ *
+ */
+struct ib_user_mad {
+ struct ib_user_mad_hdr hdr;
+ __u64 data[0];
+};
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ *   whose corresponding bit in @method_mask is set.
+ * @mgmt_class - Indicates which management class of MADs should be received
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs; otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ * @rmpp_version: If set, indicates the RMPP version used.
+ *
+ */
+struct ib_user_mad_reg_req {
+ __u32 id;
+ __u32 method_mask[4];
+ __u8 qpn;
+ __u8 mgmt_class;
+ __u8 mgmt_class_version;
+ __u8 oui[3];
+ __u8 rmpp_version;
+};
+
+#define IB_IOCTL_MAGIC 0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \
+ struct ib_user_mad_reg_req)
+
+#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32)
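A hedged userspace sketch of agent registration: the device node name is an assumption (the FreeBSD port may expose it differently), and the class/method values shown (SA class 0x03, SubnAdmGetTable method 0x12) are illustrative.

#include <sys/ioctl.h>
#include <fcntl.h>
#include <string.h>

static int
register_sa_agent(void)
{
	struct ib_user_mad_reg_req req;
	int fd;

	fd = open("/dev/umad0", O_RDWR);	/* assumed node name */
	if (fd < 0)
		return (-1);

	memset(&req, 0, sizeof(req));
	req.qpn = 1;				/* GSI */
	req.mgmt_class = 0x03;			/* SA management class */
	req.mgmt_class_version = 2;
	req.method_mask[0] = 1U << 0x12;	/* unsolicited GetTable MADs */

	if (ioctl(fd, IB_USER_MAD_REGISTER_AGENT, &req) < 0)
		return (-1);
	return (fd);	/* read(2) on fd now yields struct ib_user_mad */
}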
+
+#endif /* IB_USER_MAD_H */
diff --git a/sys/contrib/rdma/ib_user_sa.h b/sys/contrib/rdma/ib_user_sa.h
new file mode 100644
index 000000000000..ddb76ed5c9c2
--- /dev/null
+++ b/sys/contrib/rdma/ib_user_sa.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef IB_USER_SA_H
+#define IB_USER_SA_H
+
+struct ib_user_path_rec {
+ __u8 dgid[16];
+ __u8 sgid[16];
+ __be16 dlid;
+ __be16 slid;
+ __u32 raw_traffic;
+ __be32 flow_label;
+ __u32 reversible;
+ __u32 mtu;
+ __be16 pkey;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 numb_path;
+ __u8 sl;
+ __u8 mtu_selector;
+ __u8 rate_selector;
+ __u8 rate;
+ __u8 packet_life_time_selector;
+ __u8 packet_life_time;
+ __u8 preference;
+};
+
+#endif /* IB_USER_SA_H */
diff --git a/sys/contrib/rdma/ib_user_verbs.h b/sys/contrib/rdma/ib_user_verbs.h
new file mode 100644
index 000000000000..faa966c20920
--- /dev/null
+++ b/sys/contrib/rdma/ib_user_verbs.h
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_user_verbs.h 4019 2005-11-11 00:33:09Z sean.hefty $
+ *
+ * $FreeBSD$
+ */
+
+#ifndef IB_USER_VERBS_H
+#define IB_USER_VERBS_H
+
+#include <contrib/rdma/types.h>
+
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_VERBS_ABI_VERSION 6
+
+enum {
+ IB_USER_VERBS_CMD_GET_CONTEXT,
+ IB_USER_VERBS_CMD_QUERY_DEVICE,
+ IB_USER_VERBS_CMD_QUERY_PORT,
+ IB_USER_VERBS_CMD_ALLOC_PD,
+ IB_USER_VERBS_CMD_DEALLOC_PD,
+ IB_USER_VERBS_CMD_CREATE_AH,
+ IB_USER_VERBS_CMD_MODIFY_AH,
+ IB_USER_VERBS_CMD_QUERY_AH,
+ IB_USER_VERBS_CMD_DESTROY_AH,
+ IB_USER_VERBS_CMD_REG_MR,
+ IB_USER_VERBS_CMD_REG_SMR,
+ IB_USER_VERBS_CMD_REREG_MR,
+ IB_USER_VERBS_CMD_QUERY_MR,
+ IB_USER_VERBS_CMD_DEREG_MR,
+ IB_USER_VERBS_CMD_ALLOC_MW,
+ IB_USER_VERBS_CMD_BIND_MW,
+ IB_USER_VERBS_CMD_DEALLOC_MW,
+ IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+ IB_USER_VERBS_CMD_CREATE_CQ,
+ IB_USER_VERBS_CMD_RESIZE_CQ,
+ IB_USER_VERBS_CMD_DESTROY_CQ,
+ IB_USER_VERBS_CMD_POLL_CQ,
+ IB_USER_VERBS_CMD_PEEK_CQ,
+ IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+ IB_USER_VERBS_CMD_CREATE_QP,
+ IB_USER_VERBS_CMD_QUERY_QP,
+ IB_USER_VERBS_CMD_MODIFY_QP,
+ IB_USER_VERBS_CMD_DESTROY_QP,
+ IB_USER_VERBS_CMD_POST_SEND,
+ IB_USER_VERBS_CMD_POST_RECV,
+ IB_USER_VERBS_CMD_ATTACH_MCAST,
+ IB_USER_VERBS_CMD_DETACH_MCAST,
+ IB_USER_VERBS_CMD_CREATE_SRQ,
+ IB_USER_VERBS_CMD_MODIFY_SRQ,
+ IB_USER_VERBS_CMD_QUERY_SRQ,
+ IB_USER_VERBS_CMD_DESTROY_SRQ,
+ IB_USER_VERBS_CMD_POST_SRQ_RECV
+};
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * Specifically:
+ * - Do not use pointer types -- pass pointers in __u64 instead.
+ * - Make sure that any structure larger than 4 bytes is padded to a
+ * multiple of 8 bytes. Otherwise the structure size will be
+ * different between 32-bit and 64-bit architectures.
+ */
+
+struct ib_uverbs_async_event_desc {
+ __u64 element;
+ __u32 event_type; /* enum ib_event_type */
+ __u32 reserved;
+};
+
+struct ib_uverbs_comp_event_desc {
+ __u64 cq_handle;
+};
+
+/*
+ * All commands from userspace should start with a __u32 command field
+ * followed by __u16 in_words and out_words fields (which give the
+ * length of the command block and response buffer if any in 32-bit
+ * words). The kernel driver will read these fields first and read
+ * the rest of the command struct based on these values.
+ */
+
+struct ib_uverbs_cmd_hdr {
+ __u32 command;
+ __u16 in_words;
+ __u16 out_words;
+};
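A hedged userspace sketch of the write(2) protocol this header describes, using ALLOC_PD. The uverbs device fd is assumed to be open already; note that the word counts are in 32-bit units, per the comment above.

#include <unistd.h>

static int
alloc_pd(int uverbs_fd, __u32 *pd_handle)
{
	struct ib_uverbs_alloc_pd_resp resp;
	struct {
		struct ib_uverbs_cmd_hdr hdr;
		struct ib_uverbs_alloc_pd cmd;
	} req;

	req.hdr.command = IB_USER_VERBS_CMD_ALLOC_PD;
	req.hdr.in_words = sizeof(req) / 4;	/* 32-bit words, per above */
	req.hdr.out_words = sizeof(resp) / 4;
	req.cmd.response = (__u64)(unsigned long)&resp;

	if (write(uverbs_fd, &req, sizeof(req)) != (ssize_t)sizeof(req))
		return (-1);
	*pd_handle = resp.pd_handle;
	return (0);
}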
+
+struct ib_uverbs_get_context {
+ __u64 response;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_get_context_resp {
+ __u32 async_fd;
+ __u32 num_comp_vectors;
+};
+
+struct ib_uverbs_query_device {
+ __u64 response;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_device_resp {
+ __u64 fw_ver;
+ __be64 node_guid;
+ __be64 sys_image_guid;
+ __u64 max_mr_size;
+ __u64 page_size_cap;
+ __u32 vendor_id;
+ __u32 vendor_part_id;
+ __u32 hw_ver;
+ __u32 max_qp;
+ __u32 max_qp_wr;
+ __u32 device_cap_flags;
+ __u32 max_sge;
+ __u32 max_sge_rd;
+ __u32 max_cq;
+ __u32 max_cqe;
+ __u32 max_mr;
+ __u32 max_pd;
+ __u32 max_qp_rd_atom;
+ __u32 max_ee_rd_atom;
+ __u32 max_res_rd_atom;
+ __u32 max_qp_init_rd_atom;
+ __u32 max_ee_init_rd_atom;
+ __u32 atomic_cap;
+ __u32 max_ee;
+ __u32 max_rdd;
+ __u32 max_mw;
+ __u32 max_raw_ipv6_qp;
+ __u32 max_raw_ethy_qp;
+ __u32 max_mcast_grp;
+ __u32 max_mcast_qp_attach;
+ __u32 max_total_mcast_qp_attach;
+ __u32 max_ah;
+ __u32 max_fmr;
+ __u32 max_map_per_fmr;
+ __u32 max_srq;
+ __u32 max_srq_wr;
+ __u32 max_srq_sge;
+ __u16 max_pkeys;
+ __u8 local_ca_ack_delay;
+ __u8 phys_port_cnt;
+ __u8 reserved[4];
+};
+
+struct ib_uverbs_query_port {
+ __u64 response;
+ __u8 port_num;
+ __u8 reserved[7];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_port_resp {
+ __u32 port_cap_flags;
+ __u32 max_msg_sz;
+ __u32 bad_pkey_cntr;
+ __u32 qkey_viol_cntr;
+ __u32 gid_tbl_len;
+ __u16 pkey_tbl_len;
+ __u16 lid;
+ __u16 sm_lid;
+ __u8 state;
+ __u8 max_mtu;
+ __u8 active_mtu;
+ __u8 lmc;
+ __u8 max_vl_num;
+ __u8 sm_sl;
+ __u8 subnet_timeout;
+ __u8 init_type_reply;
+ __u8 active_width;
+ __u8 active_speed;
+ __u8 phys_state;
+ __u8 reserved[3];
+};
+
+struct ib_uverbs_alloc_pd {
+ __u64 response;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_alloc_pd_resp {
+ __u32 pd_handle;
+};
+
+struct ib_uverbs_dealloc_pd {
+ __u32 pd_handle;
+};
+
+struct ib_uverbs_reg_mr {
+ __u64 response;
+ __u64 start;
+ __u64 length;
+ __u64 hca_va;
+ __u32 pd_handle;
+ __u32 access_flags;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_reg_mr_resp {
+ __u32 mr_handle;
+ __u32 lkey;
+ __u32 rkey;
+};
+
+struct ib_uverbs_dereg_mr {
+ __u32 mr_handle;
+};
+
+struct ib_uverbs_create_comp_channel {
+ __u64 response;
+};
+
+struct ib_uverbs_create_comp_channel_resp {
+ __u32 fd;
+};
+
+struct ib_uverbs_create_cq {
+ __u64 response;
+ __u64 user_handle;
+ __u32 cqe;
+ __u32 comp_vector;
+ __s32 comp_channel;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_cq_resp {
+ __u32 cq_handle;
+ __u32 cqe;
+};
+
+struct ib_uverbs_resize_cq {
+ __u64 response;
+ __u32 cq_handle;
+ __u32 cqe;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_resize_cq_resp {
+ __u32 cqe;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_poll_cq {
+ __u64 response;
+ __u32 cq_handle;
+ __u32 ne;
+};
+
+struct ib_uverbs_wc {
+ __u64 wr_id;
+ __u32 status;
+ __u32 opcode;
+ __u32 vendor_err;
+ __u32 byte_len;
+ __u32 imm_data;
+ __u32 qp_num;
+ __u32 src_qp;
+ __u32 wc_flags;
+ __u16 pkey_index;
+ __u16 slid;
+ __u8 sl;
+ __u8 dlid_path_bits;
+ __u8 port_num;
+ __u8 reserved;
+};
+
+struct ib_uverbs_poll_cq_resp {
+ __u32 count;
+ __u32 reserved;
+ struct ib_uverbs_wc wc[0];
+};
+
+struct ib_uverbs_req_notify_cq {
+ __u32 cq_handle;
+ __u32 solicited_only;
+};
+
+struct ib_uverbs_destroy_cq {
+ __u64 response;
+ __u32 cq_handle;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_cq_resp {
+ __u32 comp_events_reported;
+ __u32 async_events_reported;
+};
+
+struct ib_uverbs_global_route {
+ __u8 dgid[16];
+ __u32 flow_label;
+ __u8 sgid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 reserved;
+};
+
+struct ib_uverbs_ah_attr {
+ struct ib_uverbs_global_route grh;
+ __u16 dlid;
+ __u8 sl;
+ __u8 src_path_bits;
+ __u8 static_rate;
+ __u8 is_global;
+ __u8 port_num;
+ __u8 reserved;
+};
+
+struct ib_uverbs_qp_attr {
+ __u32 qp_attr_mask;
+ __u32 qp_state;
+ __u32 cur_qp_state;
+ __u32 path_mtu;
+ __u32 path_mig_state;
+ __u32 qkey;
+ __u32 rq_psn;
+ __u32 sq_psn;
+ __u32 dest_qp_num;
+ __u32 qp_access_flags;
+
+ struct ib_uverbs_ah_attr ah_attr;
+ struct ib_uverbs_ah_attr alt_ah_attr;
+
+ /* ib_qp_cap */
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+
+ __u16 pkey_index;
+ __u16 alt_pkey_index;
+ __u8 en_sqd_async_notify;
+ __u8 sq_draining;
+ __u8 max_rd_atomic;
+ __u8 max_dest_rd_atomic;
+ __u8 min_rnr_timer;
+ __u8 port_num;
+ __u8 timeout;
+ __u8 retry_cnt;
+ __u8 rnr_retry;
+ __u8 alt_port_num;
+ __u8 alt_timeout;
+ __u8 reserved[5];
+};
+
+struct ib_uverbs_create_qp {
+ __u64 response;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 send_cq_handle;
+ __u32 recv_cq_handle;
+ __u32 srq_handle;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u8 sq_sig_all;
+ __u8 qp_type;
+ __u8 is_srq;
+ __u8 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_qp_resp {
+ __u32 qp_handle;
+ __u32 qpn;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u32 reserved;
+};
+
+/*
+ * This struct needs to remain a multiple of 8 bytes to keep the
+ * alignment of the modify QP parameters.
+ */
+struct ib_uverbs_qp_dest {
+ __u8 dgid[16];
+ __u32 flow_label;
+ __u16 dlid;
+ __u16 reserved;
+ __u8 sgid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 sl;
+ __u8 src_path_bits;
+ __u8 static_rate;
+ __u8 is_global;
+ __u8 port_num;
+};
+
+struct ib_uverbs_query_qp {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 attr_mask;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_qp_resp {
+ struct ib_uverbs_qp_dest dest;
+ struct ib_uverbs_qp_dest alt_dest;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u32 qkey;
+ __u32 rq_psn;
+ __u32 sq_psn;
+ __u32 dest_qp_num;
+ __u32 qp_access_flags;
+ __u16 pkey_index;
+ __u16 alt_pkey_index;
+ __u8 qp_state;
+ __u8 cur_qp_state;
+ __u8 path_mtu;
+ __u8 path_mig_state;
+ __u8 sq_draining;
+ __u8 max_rd_atomic;
+ __u8 max_dest_rd_atomic;
+ __u8 min_rnr_timer;
+ __u8 port_num;
+ __u8 timeout;
+ __u8 retry_cnt;
+ __u8 rnr_retry;
+ __u8 alt_port_num;
+ __u8 alt_timeout;
+ __u8 sq_sig_all;
+ __u8 reserved[5];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp {
+ struct ib_uverbs_qp_dest dest;
+ struct ib_uverbs_qp_dest alt_dest;
+ __u32 qp_handle;
+ __u32 attr_mask;
+ __u32 qkey;
+ __u32 rq_psn;
+ __u32 sq_psn;
+ __u32 dest_qp_num;
+ __u32 qp_access_flags;
+ __u16 pkey_index;
+ __u16 alt_pkey_index;
+ __u8 qp_state;
+ __u8 cur_qp_state;
+ __u8 path_mtu;
+ __u8 path_mig_state;
+ __u8 en_sqd_async_notify;
+ __u8 max_rd_atomic;
+ __u8 max_dest_rd_atomic;
+ __u8 min_rnr_timer;
+ __u8 port_num;
+ __u8 timeout;
+ __u8 retry_cnt;
+ __u8 rnr_retry;
+ __u8 alt_port_num;
+ __u8 alt_timeout;
+ __u8 reserved[2];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp_resp {
+};
+
+struct ib_uverbs_destroy_qp {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_qp_resp {
+ __u32 events_reported;
+};
+
+/*
+ * The ib_uverbs_sge structure isn't used anywhere, since we assume
+ * the ib_sge structure is packed the same way on 32-bit and 64-bit
+ * architectures in both kernel and user space. It's just here to
+ * document the ABI.
+ */
+struct ib_uverbs_sge {
+ __u64 addr;
+ __u32 length;
+ __u32 lkey;
+};
+
+struct ib_uverbs_send_wr {
+ __u64 wr_id;
+ __u32 num_sge;
+ __u32 opcode;
+ __u32 send_flags;
+ __u32 imm_data;
+ union {
+ struct {
+ __u64 remote_addr;
+ __u32 rkey;
+ __u32 reserved;
+ } rdma;
+ struct {
+ __u64 remote_addr;
+ __u64 compare_add;
+ __u64 swap;
+ __u32 rkey;
+ __u32 reserved;
+ } atomic;
+ struct {
+ __u32 ah;
+ __u32 remote_qpn;
+ __u32 remote_qkey;
+ __u32 reserved;
+ } ud;
+ } wr;
+};
+
+struct ib_uverbs_post_send {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 wr_count;
+ __u32 sge_count;
+ __u32 wqe_size;
+ struct ib_uverbs_send_wr send_wr[0];
+};
+
+struct ib_uverbs_post_send_resp {
+ __u32 bad_wr;
+};
+
+struct ib_uverbs_recv_wr {
+ __u64 wr_id;
+ __u32 num_sge;
+ __u32 reserved;
+};
+
+struct ib_uverbs_post_recv {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 wr_count;
+ __u32 sge_count;
+ __u32 wqe_size;
+ struct ib_uverbs_recv_wr recv_wr[0];
+};
+
+struct ib_uverbs_post_recv_resp {
+ __u32 bad_wr;
+};
+
+struct ib_uverbs_post_srq_recv {
+ __u64 response;
+ __u32 srq_handle;
+ __u32 wr_count;
+ __u32 sge_count;
+ __u32 wqe_size;
+ struct ib_uverbs_recv_wr recv[0];
+};
+
+struct ib_uverbs_post_srq_recv_resp {
+ __u32 bad_wr;
+};
+
+struct ib_uverbs_create_ah {
+ __u64 response;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 reserved;
+ struct ib_uverbs_ah_attr attr;
+};
+
+struct ib_uverbs_create_ah_resp {
+ __u32 ah_handle;
+};
+
+struct ib_uverbs_destroy_ah {
+ __u32 ah_handle;
+};
+
+struct ib_uverbs_attach_mcast {
+ __u8 gid[16];
+ __u32 qp_handle;
+ __u16 mlid;
+ __u16 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_detach_mcast {
+ __u8 gid[16];
+ __u32 qp_handle;
+ __u16 mlid;
+ __u16 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_srq {
+ __u64 response;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 srq_limit;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_srq_resp {
+ __u32 srq_handle;
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 reserved;
+};
+
+struct ib_uverbs_modify_srq {
+ __u32 srq_handle;
+ __u32 attr_mask;
+ __u32 max_wr;
+ __u32 srq_limit;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq {
+ __u64 response;
+ __u32 srq_handle;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq_resp {
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 srq_limit;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq {
+ __u64 response;
+ __u32 srq_handle;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq_resp {
+ __u32 events_reported;
+};
+
+#endif /* IB_USER_VERBS_H */
diff --git a/sys/contrib/rdma/ib_verbs.h b/sys/contrib/rdma/ib_verbs.h
new file mode 100644
index 000000000000..5d98ef7e0e41
--- /dev/null
+++ b/sys/contrib/rdma/ib_verbs.h
@@ -0,0 +1,1854 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_verbs.h 1349 2004-12-16 21:09:43Z roland $
+ *
+ * $FreeBSD$
+ */
+
+
+#if !defined(IB_VERBS_H)
+#define IB_VERBS_H
+
+#include <contrib/rdma/types.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+struct rdma_scatterlist {
+ void *page;
+ unsigned int length;
+ unsigned int offset;
+};
+struct vm_object;
+
+union ib_gid {
+ u8 raw[16];
+ struct {
+ __be64 subnet_prefix;
+ __be64 interface_id;
+ } global;
+};
+
+enum rdma_node_type {
+ /* IB values map to NodeInfo:NodeType. */
+ RDMA_NODE_IB_CA = 1,
+ RDMA_NODE_IB_SWITCH,
+ RDMA_NODE_IB_ROUTER,
+ RDMA_NODE_RNIC
+};
+
+enum rdma_transport_type {
+ RDMA_TRANSPORT_IB,
+ RDMA_TRANSPORT_IWARP
+};
+
+enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type);
+
+enum ib_device_cap_flags {
+ IB_DEVICE_RESIZE_MAX_WR = 1,
+ IB_DEVICE_BAD_PKEY_CNTR = (1<<1),
+ IB_DEVICE_BAD_QKEY_CNTR = (1<<2),
+ IB_DEVICE_RAW_MULTI = (1<<3),
+ IB_DEVICE_AUTO_PATH_MIG = (1<<4),
+ IB_DEVICE_CHANGE_PHY_PORT = (1<<5),
+ IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6),
+ IB_DEVICE_CURR_QP_STATE_MOD = (1<<7),
+ IB_DEVICE_SHUTDOWN_PORT = (1<<8),
+ IB_DEVICE_INIT_TYPE = (1<<9),
+ IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10),
+ IB_DEVICE_SYS_IMAGE_GUID = (1<<11),
+ IB_DEVICE_RC_RNR_NAK_GEN = (1<<12),
+ IB_DEVICE_SRQ_RESIZE = (1<<13),
+ IB_DEVICE_N_NOTIFY_CQ = (1<<14),
+ IB_DEVICE_ZERO_STAG = (1<<15),
+ IB_DEVICE_SEND_W_INV = (1<<16),
+ IB_DEVICE_MEM_WINDOW = (1<<17)
+};
+
+enum ib_atomic_cap {
+ IB_ATOMIC_NONE,
+ IB_ATOMIC_HCA,
+ IB_ATOMIC_GLOB
+};
+
+struct ib_device_attr {
+ u64 fw_ver;
+ __be64 sys_image_guid;
+ u64 max_mr_size;
+ u64 page_size_cap;
+ u32 vendor_id;
+ u32 vendor_part_id;
+ u32 hw_ver;
+ int max_qp;
+ int max_qp_wr;
+ int device_cap_flags;
+ int max_sge;
+ int max_sge_rd;
+ int max_cq;
+ int max_cqe;
+ int max_mr;
+ int max_pd;
+ int max_qp_rd_atom;
+ int max_ee_rd_atom;
+ int max_res_rd_atom;
+ int max_qp_init_rd_atom;
+ int max_ee_init_rd_atom;
+ enum ib_atomic_cap atomic_cap;
+ int max_ee;
+ int max_rdd;
+ int max_mw;
+ int max_raw_ipv6_qp;
+ int max_raw_ethy_qp;
+ int max_mcast_grp;
+ int max_mcast_qp_attach;
+ int max_total_mcast_qp_attach;
+ int max_ah;
+ int max_fmr;
+ int max_map_per_fmr;
+ int max_srq;
+ int max_srq_wr;
+ int max_srq_sge;
+ u16 max_pkeys;
+ u8 local_ca_ack_delay;
+};
+
+enum ib_mtu {
+ IB_MTU_256 = 1,
+ IB_MTU_512 = 2,
+ IB_MTU_1024 = 3,
+ IB_MTU_2048 = 4,
+ IB_MTU_4096 = 5
+};
+
+static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
+{
+ switch (mtu) {
+ case IB_MTU_256: return 256;
+ case IB_MTU_512: return 512;
+ case IB_MTU_1024: return 1024;
+ case IB_MTU_2048: return 2048;
+ case IB_MTU_4096: return 4096;
+ default: return -1;
+ }
+}
+
+enum ib_port_state {
+ IB_PORT_NOP = 0,
+ IB_PORT_DOWN = 1,
+ IB_PORT_INIT = 2,
+ IB_PORT_ARMED = 3,
+ IB_PORT_ACTIVE = 4,
+ IB_PORT_ACTIVE_DEFER = 5
+};
+
+enum ib_port_cap_flags {
+ IB_PORT_SM = 1 << 1,
+ IB_PORT_NOTICE_SUP = 1 << 2,
+ IB_PORT_TRAP_SUP = 1 << 3,
+ IB_PORT_OPT_IPD_SUP = 1 << 4,
+ IB_PORT_AUTO_MIGR_SUP = 1 << 5,
+ IB_PORT_SL_MAP_SUP = 1 << 6,
+ IB_PORT_MKEY_NVRAM = 1 << 7,
+ IB_PORT_PKEY_NVRAM = 1 << 8,
+ IB_PORT_LED_INFO_SUP = 1 << 9,
+ IB_PORT_SM_DISABLED = 1 << 10,
+ IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11,
+ IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12,
+ IB_PORT_CM_SUP = 1 << 16,
+ IB_PORT_SNMP_TUNNEL_SUP = 1 << 17,
+ IB_PORT_REINIT_SUP = 1 << 18,
+ IB_PORT_DEVICE_MGMT_SUP = 1 << 19,
+ IB_PORT_VENDOR_CLASS_SUP = 1 << 20,
+ IB_PORT_DR_NOTICE_SUP = 1 << 21,
+ IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22,
+ IB_PORT_BOOT_MGMT_SUP = 1 << 23,
+ IB_PORT_LINK_LATENCY_SUP = 1 << 24,
+ IB_PORT_CLIENT_REG_SUP = 1 << 25
+};
+
+enum ib_port_width {
+ IB_WIDTH_1X = 1,
+ IB_WIDTH_4X = 2,
+ IB_WIDTH_8X = 4,
+ IB_WIDTH_12X = 8
+};
+
+static inline int ib_width_enum_to_int(enum ib_port_width width)
+{
+ switch (width) {
+ case IB_WIDTH_1X: return 1;
+ case IB_WIDTH_4X: return 4;
+ case IB_WIDTH_8X: return 8;
+ case IB_WIDTH_12X: return 12;
+ default: return -1;
+ }
+}
+
+struct ib_port_attr {
+ enum ib_port_state state;
+ enum ib_mtu max_mtu;
+ enum ib_mtu active_mtu;
+ int gid_tbl_len;
+ u32 port_cap_flags;
+ u32 max_msg_sz;
+ u32 bad_pkey_cntr;
+ u32 qkey_viol_cntr;
+ u16 pkey_tbl_len;
+ u16 lid;
+ u16 sm_lid;
+ u8 lmc;
+ u8 max_vl_num;
+ u8 sm_sl;
+ u8 subnet_timeout;
+ u8 init_type_reply;
+ u8 active_width;
+ u8 active_speed;
+ u8 phys_state;
+};
+
+enum ib_device_modify_flags {
+ IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0,
+ IB_DEVICE_MODIFY_NODE_DESC = 1 << 1
+};
+
+struct ib_device_modify {
+ u64 sys_image_guid;
+ char node_desc[64];
+};
+
+enum ib_port_modify_flags {
+ IB_PORT_SHUTDOWN = 1,
+ IB_PORT_INIT_TYPE = (1<<2),
+ IB_PORT_RESET_QKEY_CNTR = (1<<3)
+};
+
+struct ib_port_modify {
+ u32 set_port_cap_mask;
+ u32 clr_port_cap_mask;
+ u8 init_type;
+};
+
+enum ib_event_type {
+ IB_EVENT_CQ_ERR,
+ IB_EVENT_QP_FATAL,
+ IB_EVENT_QP_REQ_ERR,
+ IB_EVENT_QP_ACCESS_ERR,
+ IB_EVENT_COMM_EST,
+ IB_EVENT_SQ_DRAINED,
+ IB_EVENT_PATH_MIG,
+ IB_EVENT_PATH_MIG_ERR,
+ IB_EVENT_DEVICE_FATAL,
+ IB_EVENT_PORT_ACTIVE,
+ IB_EVENT_PORT_ERR,
+ IB_EVENT_LID_CHANGE,
+ IB_EVENT_PKEY_CHANGE,
+ IB_EVENT_SM_CHANGE,
+ IB_EVENT_SRQ_ERR,
+ IB_EVENT_SRQ_LIMIT_REACHED,
+ IB_EVENT_QP_LAST_WQE_REACHED,
+ IB_EVENT_CLIENT_REREGISTER
+};
+
+enum dma_data_direction {
+ DMA_BIDIRECTIONAL = 0,
+ DMA_TO_DEVICE = 1,
+ DMA_FROM_DEVICE = 2,
+ DMA_NONE = 3,
+};
+
+struct ib_event {
+ struct ib_device *device;
+ union {
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct ib_srq *srq;
+ u8 port_num;
+ } element;
+ enum ib_event_type event;
+};
+
+struct ib_event_handler {
+ struct ib_device *device;
+ void (*handler)(struct ib_event_handler *, struct ib_event *);
+ TAILQ_ENTRY(ib_event_handler) list;
+};
+
+#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \
+ do { \
+ (_ptr)->device = _device; \
+ (_ptr)->handler = _handler; \
+ } while (0)
+
+struct ib_global_route {
+ union ib_gid dgid;
+ u32 flow_label;
+ u8 sgid_index;
+ u8 hop_limit;
+ u8 traffic_class;
+};
+
+struct ib_grh {
+ __be32 version_tclass_flow;
+ __be16 paylen;
+ u8 next_hdr;
+ u8 hop_limit;
+ union ib_gid sgid;
+ union ib_gid dgid;
+};
+
+enum {
+ IB_MULTICAST_QPN = 0xffffff
+};
+
+#define IB_LID_PERMISSIVE __constant_htons(0xFFFF)
+
+enum ib_ah_flags {
+ IB_AH_GRH = 1
+};
+
+enum ib_rate {
+ IB_RATE_PORT_CURRENT = 0,
+ IB_RATE_2_5_GBPS = 2,
+ IB_RATE_5_GBPS = 5,
+ IB_RATE_10_GBPS = 3,
+ IB_RATE_20_GBPS = 6,
+ IB_RATE_30_GBPS = 4,
+ IB_RATE_40_GBPS = 7,
+ IB_RATE_60_GBPS = 8,
+ IB_RATE_80_GBPS = 9,
+ IB_RATE_120_GBPS = 10
+};
+
+/**
+ * ib_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+int ib_rate_to_mult(enum ib_rate rate);
+
+/**
+ * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
+ * enum.
+ * @mult: multiple to convert.
+ */
+enum ib_rate mult_to_ib_rate(int mult);
+
+struct ib_ah_attr {
+ struct ib_global_route grh;
+ u16 dlid;
+ u8 sl;
+ u8 src_path_bits;
+ u8 static_rate;
+ u8 ah_flags;
+ u8 port_num;
+};
+
+enum ib_wc_status {
+ IB_WC_SUCCESS,
+ IB_WC_LOC_LEN_ERR,
+ IB_WC_LOC_QP_OP_ERR,
+ IB_WC_LOC_EEC_OP_ERR,
+ IB_WC_LOC_PROT_ERR,
+ IB_WC_WR_FLUSH_ERR,
+ IB_WC_MW_BIND_ERR,
+ IB_WC_BAD_RESP_ERR,
+ IB_WC_LOC_ACCESS_ERR,
+ IB_WC_REM_INV_REQ_ERR,
+ IB_WC_REM_ACCESS_ERR,
+ IB_WC_REM_OP_ERR,
+ IB_WC_RETRY_EXC_ERR,
+ IB_WC_RNR_RETRY_EXC_ERR,
+ IB_WC_LOC_RDD_VIOL_ERR,
+ IB_WC_REM_INV_RD_REQ_ERR,
+ IB_WC_REM_ABORT_ERR,
+ IB_WC_INV_EECN_ERR,
+ IB_WC_INV_EEC_STATE_ERR,
+ IB_WC_FATAL_ERR,
+ IB_WC_RESP_TIMEOUT_ERR,
+ IB_WC_GENERAL_ERR
+};
+
+enum ib_wc_opcode {
+ IB_WC_SEND,
+ IB_WC_RDMA_WRITE,
+ IB_WC_RDMA_READ,
+ IB_WC_COMP_SWAP,
+ IB_WC_FETCH_ADD,
+ IB_WC_BIND_MW,
+/*
+ * Set value of IB_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IB_WC_RECV).
+ */
+ IB_WC_RECV = 1 << 7,
+ IB_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ib_wc_flags {
+ IB_WC_GRH = 1,
+ IB_WC_WITH_IMM = (1<<1)
+};
+
+struct ib_wc {
+ u64 wr_id;
+ enum ib_wc_status status;
+ enum ib_wc_opcode opcode;
+ u32 vendor_err;
+ u32 byte_len;
+ struct ib_qp *qp;
+ __be32 imm_data;
+ u32 src_qp;
+ int wc_flags;
+ u16 pkey_index;
+ u16 slid;
+ u8 sl;
+ u8 dlid_path_bits;
+ u8 port_num; /* valid only for DR SMPs on switches */
+};
+
+enum ib_cq_notify_flags {
+ IB_CQ_SOLICITED = 1 << 0,
+ IB_CQ_NEXT_COMP = 1 << 1,
+ IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP,
+ IB_CQ_REPORT_MISSED_EVENTS = 1 << 2,
+};
+
+enum ib_srq_attr_mask {
+ IB_SRQ_MAX_WR = 1 << 0,
+ IB_SRQ_LIMIT = 1 << 1,
+};
+
+struct ib_srq_attr {
+ u32 max_wr;
+ u32 max_sge;
+ u32 srq_limit;
+};
+
+struct ib_srq_init_attr {
+ void (*event_handler)(struct ib_event *, void *);
+ void *srq_context;
+ struct ib_srq_attr attr;
+};
+
+struct ib_qp_cap {
+ u32 max_send_wr;
+ u32 max_recv_wr;
+ u32 max_send_sge;
+ u32 max_recv_sge;
+ u32 max_inline_data;
+};
+
+enum ib_sig_type {
+ IB_SIGNAL_ALL_WR,
+ IB_SIGNAL_REQ_WR
+};
+
+enum ib_qp_type {
+ /*
+ * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+ * here (and in that order) since the MAD layer uses them as
+ * indices into a 2-entry table.
+ */
+ IB_QPT_SMI,
+ IB_QPT_GSI,
+
+ IB_QPT_RC,
+ IB_QPT_UC,
+ IB_QPT_UD,
+ IB_QPT_RAW_IPV6,
+ IB_QPT_RAW_ETY
+};
+
+struct ib_qp_init_attr {
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_srq *srq;
+ struct ib_qp_cap cap;
+ enum ib_sig_type sq_sig_type;
+ enum ib_qp_type qp_type;
+ u8 port_num; /* special QP types only */
+};
+
+enum ib_rnr_timeout {
+ IB_RNR_TIMER_655_36 = 0,
+ IB_RNR_TIMER_000_01 = 1,
+ IB_RNR_TIMER_000_02 = 2,
+ IB_RNR_TIMER_000_03 = 3,
+ IB_RNR_TIMER_000_04 = 4,
+ IB_RNR_TIMER_000_06 = 5,
+ IB_RNR_TIMER_000_08 = 6,
+ IB_RNR_TIMER_000_12 = 7,
+ IB_RNR_TIMER_000_16 = 8,
+ IB_RNR_TIMER_000_24 = 9,
+ IB_RNR_TIMER_000_32 = 10,
+ IB_RNR_TIMER_000_48 = 11,
+ IB_RNR_TIMER_000_64 = 12,
+ IB_RNR_TIMER_000_96 = 13,
+ IB_RNR_TIMER_001_28 = 14,
+ IB_RNR_TIMER_001_92 = 15,
+ IB_RNR_TIMER_002_56 = 16,
+ IB_RNR_TIMER_003_84 = 17,
+ IB_RNR_TIMER_005_12 = 18,
+ IB_RNR_TIMER_007_68 = 19,
+ IB_RNR_TIMER_010_24 = 20,
+ IB_RNR_TIMER_015_36 = 21,
+ IB_RNR_TIMER_020_48 = 22,
+ IB_RNR_TIMER_030_72 = 23,
+ IB_RNR_TIMER_040_96 = 24,
+ IB_RNR_TIMER_061_44 = 25,
+ IB_RNR_TIMER_081_92 = 26,
+ IB_RNR_TIMER_122_88 = 27,
+ IB_RNR_TIMER_163_84 = 28,
+ IB_RNR_TIMER_245_76 = 29,
+ IB_RNR_TIMER_327_68 = 30,
+ IB_RNR_TIMER_491_52 = 31
+};
+
+enum ib_qp_attr_mask {
+ IB_QP_STATE = 1,
+ IB_QP_CUR_STATE = (1<<1),
+ IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2),
+ IB_QP_ACCESS_FLAGS = (1<<3),
+ IB_QP_PKEY_INDEX = (1<<4),
+ IB_QP_PORT = (1<<5),
+ IB_QP_QKEY = (1<<6),
+ IB_QP_AV = (1<<7),
+ IB_QP_PATH_MTU = (1<<8),
+ IB_QP_TIMEOUT = (1<<9),
+ IB_QP_RETRY_CNT = (1<<10),
+ IB_QP_RNR_RETRY = (1<<11),
+ IB_QP_RQ_PSN = (1<<12),
+ IB_QP_MAX_QP_RD_ATOMIC = (1<<13),
+ IB_QP_ALT_PATH = (1<<14),
+ IB_QP_MIN_RNR_TIMER = (1<<15),
+ IB_QP_SQ_PSN = (1<<16),
+ IB_QP_MAX_DEST_RD_ATOMIC = (1<<17),
+ IB_QP_PATH_MIG_STATE = (1<<18),
+ IB_QP_CAP = (1<<19),
+ IB_QP_DEST_QPN = (1<<20)
+};
+
+enum ib_qp_state {
+ IB_QPS_RESET,
+ IB_QPS_INIT,
+ IB_QPS_RTR,
+ IB_QPS_RTS,
+ IB_QPS_SQD,
+ IB_QPS_SQE,
+ IB_QPS_ERR
+};
+
+enum ib_mig_state {
+ IB_MIG_MIGRATED,
+ IB_MIG_REARM,
+ IB_MIG_ARMED
+};
+
+struct ib_qp_attr {
+ enum ib_qp_state qp_state;
+ enum ib_qp_state cur_qp_state;
+ enum ib_mtu path_mtu;
+ enum ib_mig_state path_mig_state;
+ u32 qkey;
+ u32 rq_psn;
+ u32 sq_psn;
+ u32 dest_qp_num;
+ int qp_access_flags;
+ struct ib_qp_cap cap;
+ struct ib_ah_attr ah_attr;
+ struct ib_ah_attr alt_ah_attr;
+ u16 pkey_index;
+ u16 alt_pkey_index;
+ u8 en_sqd_async_notify;
+ u8 sq_draining;
+ u8 max_rd_atomic;
+ u8 max_dest_rd_atomic;
+ u8 min_rnr_timer;
+ u8 port_num;
+ u8 timeout;
+ u8 retry_cnt;
+ u8 rnr_retry;
+ u8 alt_port_num;
+ u8 alt_timeout;
+};
+
+enum ib_wr_opcode {
+ IB_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND,
+ IB_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD
+};
+
+enum ib_send_flags {
+ IB_SEND_FENCE = 1,
+ IB_SEND_SIGNALED = (1<<1),
+ IB_SEND_SOLICITED = (1<<2),
+ IB_SEND_INLINE = (1<<3)
+};
+
+struct ib_sge {
+ u64 addr;
+ u32 length;
+ u32 lkey;
+};
+
+struct ib_send_wr {
+ struct ib_send_wr *next;
+ u64 wr_id;
+ struct ib_sge *sg_list;
+ int num_sge;
+ enum ib_wr_opcode opcode;
+ int send_flags;
+ __be32 imm_data;
+ union {
+ struct {
+ u64 remote_addr;
+ u32 rkey;
+ } rdma;
+ struct {
+ u64 remote_addr;
+ u64 compare_add;
+ u64 swap;
+ u32 rkey;
+ } atomic;
+ struct {
+ struct ib_ah *ah;
+ u32 remote_qpn;
+ u32 remote_qkey;
+ u16 pkey_index; /* valid for GSI only */
+ u8 port_num; /* valid for DR SMPs on switch only */
+ } ud;
+ } wr;
+};
+
+struct ib_recv_wr {
+ struct ib_recv_wr *next;
+ u64 wr_id;
+ struct ib_sge *sg_list;
+ int num_sge;
+};
+
+enum ib_access_flags {
+ IB_ACCESS_LOCAL_WRITE = 1,
+ IB_ACCESS_REMOTE_WRITE = (1<<1),
+ IB_ACCESS_REMOTE_READ = (1<<2),
+ IB_ACCESS_REMOTE_ATOMIC = (1<<3),
+ IB_ACCESS_MW_BIND = (1<<4)
+};
+
+struct ib_phys_buf {
+ u64 addr;
+ u64 size;
+};
+
+struct ib_mr_attr {
+ struct ib_pd *pd;
+ u64 device_virt_addr;
+ u64 size;
+ int mr_access_flags;
+ u32 lkey;
+ u32 rkey;
+};
+
+enum ib_mr_rereg_flags {
+ IB_MR_REREG_TRANS = 1,
+ IB_MR_REREG_PD = (1<<1),
+ IB_MR_REREG_ACCESS = (1<<2)
+};
+
+struct ib_mw_bind {
+ struct ib_mr *mr;
+ u64 wr_id;
+ u64 addr;
+ u32 length;
+ int send_flags;
+ int mw_access_flags;
+};
+
+struct ib_fmr_attr {
+ int max_pages;
+ int max_maps;
+ u8 page_shift;
+};
+
+/*
+ * XXX can this really be on 7 different lists at once?
+ */
+struct ib_ucontext {
+ struct ib_device *device;
+ TAILQ_ENTRY(ib_ucontext) pd_list;
+ TAILQ_ENTRY(ib_ucontext) mr_list;
+ TAILQ_ENTRY(ib_ucontext) mw_list;
+ TAILQ_ENTRY(ib_ucontext) cq_list;
+ TAILQ_ENTRY(ib_ucontext) qp_list;
+ TAILQ_ENTRY(ib_ucontext) srq_list;
+ TAILQ_ENTRY(ib_ucontext) ah_list;
+ int closing;
+};
+
+struct ib_uobject {
+ u64 user_handle; /* handle given to us by userspace */
+ struct ib_ucontext *context; /* associated user context */
+ void *object; /* containing object */
+ TAILQ_ENTRY(ib_uobject) entry; /* link to context's list */
+ u32 id; /* index into kernel idr */
+ volatile uint32_t ref;
+ struct mtx lock; /* protects .live */
+ int live;
+};
+
+struct ib_udata {
+ void *inbuf;
+ void *outbuf;
+ size_t inlen;
+ size_t outlen;
+};
+
+#define IB_UMEM_MAX_PAGE_CHUNK \
+ ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \
+ ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \
+ (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
+
+struct ib_pd {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ volatile int usecnt; /* count all resources */
+};
+
+struct ib_ah {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+};
+
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
+
+struct ib_cq {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ ib_comp_handler comp_handler;
+ void (*event_handler)(struct ib_event *, void *);
+ void * cq_context;
+ int cqe;
+ volatile int usecnt; /* count number of work queues */
+};
+
+struct ib_srq {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+ void (*event_handler)(struct ib_event *, void *);
+ void *srq_context;
+ volatile int usecnt;
+};
+
+struct ib_qp {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_srq *srq;
+ struct ib_uobject *uobject;
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ u32 qp_num;
+ enum ib_qp_type qp_type;
+};
+
+struct ib_mr {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+ u32 lkey;
+ u32 rkey;
+ volatile int usecnt; /* count number of MWs */
+};
+
+struct ib_mw {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+ u32 rkey;
+};
+
+
+struct ib_fmr {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ TAILQ_ENTRY(ib_fmr) entry;
+ u32 lkey;
+ u32 rkey;
+};
+
+TAILQ_HEAD(ib_fmr_list_head, ib_fmr);
+
+struct ib_mad;
+struct ib_grh;
+
+enum ib_process_mad_flags {
+ IB_MAD_IGNORE_MKEY = 1,
+ IB_MAD_IGNORE_BKEY = 2,
+ IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY
+};
+
+enum ib_mad_result {
+ IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */
+ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */
+ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */
+ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */
+};
+
+#define IB_DEVICE_NAME_MAX 64
+
+struct ib_cache {
+ struct mtx lock;
+ struct ib_event_handler event_handler;
+ struct ib_pkey_cache **pkey_cache;
+ struct ib_gid_cache **gid_cache;
+ u8 *lmc_cache;
+};
+
+struct ib_dma_mapping_ops {
+ int (*mapping_error)(struct ib_device *dev,
+ u64 dma_addr);
+ u64 (*map_single)(struct ib_device *dev,
+ void *ptr, size_t size,
+ enum dma_data_direction direction);
+ void (*unmap_single)(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction);
+ u64 (*map_page)(struct ib_device *dev,
+ void *page, unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction);
+ void (*unmap_page)(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction);
+ int (*map_sg)(struct ib_device *dev,
+ struct rdma_scatterlist *sg, int nents,
+ enum dma_data_direction direction);
+ void (*unmap_sg)(struct ib_device *dev,
+ struct rdma_scatterlist *sg, int nents,
+ enum dma_data_direction direction);
+ u64 (*dma_address)(struct ib_device *dev,
+ struct rdma_scatterlist *sg);
+ unsigned int (*dma_len)(struct ib_device *dev,
+ struct rdma_scatterlist *sg);
+ void (*sync_single_for_cpu)(struct ib_device *dev,
+ u64 dma_handle,
+ size_t size,
+ enum dma_data_direction dir);
+ void (*sync_single_for_device)(struct ib_device *dev,
+ u64 dma_handle,
+ size_t size,
+ enum dma_data_direction dir);
+ void *(*alloc_coherent)(struct ib_device *dev,
+ size_t size,
+ u64 *dma_handle,
+ int flag);
+ void (*free_coherent)(struct ib_device *dev,
+ size_t size, void *cpu_addr,
+ u64 dma_handle);
+};
+
+struct iw_cm_verbs;
+
+struct ib_device {
+ struct device *dma_device;
+
+ char name[IB_DEVICE_NAME_MAX];
+
+ TAILQ_HEAD(, ib_event_handler) event_handler_list;
+ struct mtx event_handler_lock;
+
+ TAILQ_ENTRY(ib_device) core_list;
+ TAILQ_HEAD(, ib_client_data) client_data_list;
+ struct mtx client_data_lock;
+
+ struct ib_cache cache;
+ int *pkey_tbl_len;
+ int *gid_tbl_len;
+
+ u32 flags;
+
+ int num_comp_vectors;
+
+ struct iw_cm_verbs *iwcm;
+
+ int (*query_device)(struct ib_device *device,
+ struct ib_device_attr *device_attr);
+ int (*query_port)(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr);
+ int (*query_gid)(struct ib_device *device,
+ u8 port_num, int index,
+ union ib_gid *gid);
+ int (*query_pkey)(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey);
+ int (*modify_device)(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+ int (*modify_port)(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify);
+ struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device,
+ struct ib_udata *udata);
+ int (*dealloc_ucontext)(struct ib_ucontext *context);
+ int (*mmap)(struct ib_ucontext *context,
+ struct vm_object *vma);
+ struct ib_pd * (*alloc_pd)(struct ib_device *device,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+ int (*dealloc_pd)(struct ib_pd *pd);
+ struct ib_ah * (*create_ah)(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr);
+ int (*modify_ah)(struct ib_ah *ah,
+ struct ib_ah_attr *ah_attr);
+ int (*query_ah)(struct ib_ah *ah,
+ struct ib_ah_attr *ah_attr);
+ int (*destroy_ah)(struct ib_ah *ah);
+ struct ib_srq * (*create_srq)(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+ int (*modify_srq)(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask,
+ struct ib_udata *udata);
+ int (*query_srq)(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr);
+ int (*destroy_srq)(struct ib_srq *srq);
+ int (*post_srq_recv)(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+ struct ib_qp * (*create_qp)(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata);
+ int (*modify_qp)(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata);
+ int (*query_qp)(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+ int (*destroy_qp)(struct ib_qp *qp);
+ int (*post_send)(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr);
+ int (*post_recv)(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+ struct ib_cq * (*create_cq)(struct ib_device *device, int cqe,
+ int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+ int (*destroy_cq)(struct ib_cq *cq);
+ int (*resize_cq)(struct ib_cq *cq, int cqe,
+ struct ib_udata *udata);
+ int (*poll_cq)(struct ib_cq *cq, int num_entries,
+ struct ib_wc *wc);
+ int (*peek_cq)(struct ib_cq *cq, int wc_cnt);
+ int (*req_notify_cq)(struct ib_cq *cq,
+ enum ib_cq_notify_flags flags);
+ int (*req_ncomp_notif)(struct ib_cq *cq,
+ int wc_cnt);
+ struct ib_mr * (*get_dma_mr)(struct ib_pd *pd,
+ int mr_access_flags);
+ struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+ struct ib_mr * (*reg_user_mr)(struct ib_pd *pd,
+ u64 start, u64 length,
+ u64 virt_addr,
+ int mr_access_flags,
+ struct ib_udata *udata);
+ int (*query_mr)(struct ib_mr *mr,
+ struct ib_mr_attr *mr_attr);
+ int (*dereg_mr)(struct ib_mr *mr);
+ int (*rereg_phys_mr)(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+ struct ib_mw * (*alloc_mw)(struct ib_pd *pd);
+ int (*bind_mw)(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+ int (*dealloc_mw)(struct ib_mw *mw);
+ struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+ int (*map_phys_fmr)(struct ib_fmr *fmr,
+ u64 *page_list, int list_len,
+ u64 iova);
+ int (*unmap_fmr)(struct ib_fmr_list_head *fmr_list);
+ int (*dealloc_fmr)(struct ib_fmr *fmr);
+ int (*attach_mcast)(struct ib_qp *qp,
+ union ib_gid *gid,
+ u16 lid);
+ int (*detach_mcast)(struct ib_qp *qp,
+ union ib_gid *gid,
+ u16 lid);
+ int (*process_mad)(struct ib_device *device,
+ int process_mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad);
+
+ struct ib_dma_mapping_ops *dma_ops;
+
+ struct module *owner;
+#ifdef notyet
+ struct class_device class_dev;
+ struct kobject ports_parent;
+ struct list_head port_list;
+#endif
+ enum {
+ IB_DEV_UNINITIALIZED,
+ IB_DEV_REGISTERED,
+ IB_DEV_UNREGISTERED
+ } reg_state;
+
+ u64 uverbs_cmd_mask;
+ int uverbs_abi_ver;
+
+ char node_desc[64];
+ __be64 node_guid;
+ u8 node_type;
+ u8 phys_port_cnt;
+};
+
+struct ib_client {
+ char *name;
+ void (*add) (struct ib_device *);
+ void (*remove)(struct ib_device *);
+ TAILQ_ENTRY(ib_client) list;
+};
+
+struct ib_device *ib_alloc_device(size_t size);
+void ib_dealloc_device(struct ib_device *device);
+
+int ib_register_device (struct ib_device *device);
+void ib_unregister_device(struct ib_device *device);
+
+int ib_register_client (struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data);
+
+static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
+{
+ return copyin(udata->inbuf, dest, len);
+}
+
+static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
+{
+ return copyout(src, udata->outbuf, len);
+}
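+
+/*
+ * Illustrative sketch of how a driver verb might use the two udata
+ * copy helpers above (kept under #if 0, never compiled).  The command
+ * and response structure names and the cqn field are hypothetical,
+ * not part of any real driver ABI.
+ */
+#if 0
+struct mydrv_create_cq_cmd {
+	__u64 buf_addr;
+};
+
+struct mydrv_create_cq_resp {
+	__u32 cqn;
+};
+
+static int
+mydrv_exchange_udata(struct ib_udata *udata, u32 cqn)
+{
+	struct mydrv_create_cq_cmd cmd;
+	struct mydrv_create_cq_resp resp;
+	int err;
+
+	/* copyin() the command the consumer passed down... */
+	err = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+	if (err)
+		return (err);
+
+	/* ...and copyout() the driver's reply. */
+	resp.cqn = cqn;
+	return (ib_copy_to_udata(udata, &resp, sizeof(resp)));
+}
+#endif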
+
+/**
+ * ib_modify_qp_is_ok - Check that the supplied attribute mask
+ * contains all required attributes and no attributes not allowed for
+ * the given QP state transition.
+ * @cur_state: Current QP state
+ * @next_state: Next QP state
+ * @type: QP type
+ * @mask: Mask of supplied QP attributes
+ *
+ * This function is a helper function that a low-level driver's
+ * modify_qp method can use to validate the consumer's input. It
+ * checks that cur_state and next_state are valid QP states, that a
+ * transition from cur_state to next_state is allowed by the IB spec,
+ * and that the attribute mask supplied is allowed for the transition.
+ */
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask);
+
+int ib_register_event_handler (struct ib_event_handler *event_handler);
+int ib_unregister_event_handler(struct ib_event_handler *event_handler);
+void ib_dispatch_event(struct ib_event *event);
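+
+/*
+ * Illustrative sketch (under #if 0): registering for asynchronous
+ * device events with INIT_IB_EVENT_HANDLER above.  The handler body
+ * is a placeholder; a real consumer would act on event->event.
+ */
+#if 0
+static void
+example_event_handler(struct ib_event_handler *handler,
+    struct ib_event *event)
+{
+	if (event->event == IB_EVENT_PORT_ACTIVE) {
+		/* e.g. re-query the port attributes here */
+	}
+}
+
+static struct ib_event_handler example_handler;
+
+static void
+example_watch_device(struct ib_device *device)
+{
+	INIT_IB_EVENT_HANDLER(&example_handler, device,
+	    example_event_handler);
+	(void)ib_register_event_handler(&example_handler);
+}
+#endif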
+
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr);
+
+int ib_query_port(struct ib_device *device,
+ u8 port_num, struct ib_port_attr *port_attr);
+
+int ib_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid);
+
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey);
+
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify);
+
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index);
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device);
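+
+/*
+ * Illustrative sketch (under #if 0): a consumer typically allocates
+ * one PD per device and creates all of its CQs, QPs and MRs against
+ * it.  Checking for NULL on failure is an assumption about the error
+ * convention, not something this header guarantees.
+ */
+#if 0
+static struct ib_pd *
+example_setup_pd(struct ib_device *device)
+{
+	struct ib_pd *pd;
+
+	pd = ib_alloc_pd(device);
+	if (pd == NULL)
+		return (NULL);		/* allocation failed (assumed) */
+	/* ... create CQs/QPs/MRs under this PD ... */
+	return (pd);
+}
+#endif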
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ */
+int ib_dealloc_pd(struct ib_pd *pd);
+
+/**
+ * ib_create_ah - Creates an address handle for the given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_wc - Initializes address handle attributes from a
+ * work completion.
+ * @device: Device on which the received message arrived.
+ * @port_num: Port on which the received message arrived.
+ * @wc: Work completion associated with the received message.
+ * @grh: References the received global route header. This parameter is
+ * ignored unless the work completion indicates that the GRH is valid.
+ * @ah_attr: Returned attributes that can be used when creating an address
+ * handle for replying to the message.
+ */
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_create_ah_from_wc - Creates an address handle associated with the
+ * sender of the specified work completion.
+ * @pd: The protection domain associated with the address handle.
+ * @wc: Work completion information associated with a received message.
+ * @grh: References the received global route header. This parameter is
+ * ignored unless the work completion indicates that the GRH is valid.
+ * @port_num: The outbound port number to associate with the address.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+ struct ib_grh *grh, u8 port_num);
+
+/**
+ * ib_modify_ah - Modifies the address vector associated with an address
+ * handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ * address handle.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_query_ah - Queries the address vector associated with an address
+ * handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ * handle.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_destroy_ah - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ */
+int ib_destroy_ah(struct ib_ah *ah);
+
+/**
+ * ib_create_srq - Creates an SRQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ * SRQ. If SRQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created SRQ.
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read to determine the
+ * requested size of the SRQ, and set to the actual values allocated
+ * on return. If ib_create_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr);
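+
+/*
+ * Illustrative sketch (under #if 0): filling in ib_srq_init_attr.
+ * The capacities are arbitrary example values; on success they are
+ * updated to what was actually allocated, per the comment above.
+ */
+#if 0
+static struct ib_srq *
+example_create_srq(struct ib_pd *pd,
+    void (*handler)(struct ib_event *, void *), void *ctx)
+{
+	struct ib_srq_init_attr init_attr;
+
+	bzero(&init_attr, sizeof(init_attr));
+	init_attr.event_handler = handler;
+	init_attr.srq_context = ctx;
+	init_attr.attr.max_wr = 128;	/* arbitrary request */
+	init_attr.attr.max_sge = 4;
+	init_attr.attr.srq_limit = 0;	/* no limit event armed */
+	return (ib_create_srq(pd, &init_attr));
+}
+#endif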
+
+/**
+ * ib_modify_srq - Modifies the attributes for the specified SRQ.
+ * @srq: The SRQ to modify.
+ * @srq_attr: On input, specifies the SRQ attributes to modify. On output,
+ * the current values of selected SRQ attributes are returned.
+ * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
+ * are being modified.
+ *
+ * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
+ * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
+ * the number of receives queued drops below the limit.
+ */
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask);
+
+/**
+ * ib_query_srq - Returns the attribute list and current values for the
+ * specified SRQ.
+ * @srq: The SRQ to query.
+ * @srq_attr: The attributes of the specified SRQ.
+ */
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr);
+
+/**
+ * ib_destroy_srq - Destroys the specified SRQ.
+ * @srq: The SRQ to destroy.
+ */
+int ib_destroy_srq(struct ib_srq *srq);
+
+/**
+ * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
+ * @srq: The SRQ to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_srq_recv(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr);
+}
+
+/**
+ * ib_create_qp - Creates a QP associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the
+ * QP. If QP creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created QP.
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
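+
+/*
+ * Illustrative sketch (under #if 0): minimal ib_qp_init_attr setup
+ * for an RC QP sharing one CQ for sends and receives.  All of the
+ * capacities are arbitrary example values.
+ */
+#if 0
+static struct ib_qp *
+example_create_rc_qp(struct ib_pd *pd, struct ib_cq *cq)
+{
+	struct ib_qp_init_attr init_attr;
+
+	bzero(&init_attr, sizeof(init_attr));
+	init_attr.send_cq = cq;
+	init_attr.recv_cq = cq;
+	init_attr.cap.max_send_wr = 64;
+	init_attr.cap.max_recv_wr = 64;
+	init_attr.cap.max_send_sge = 1;
+	init_attr.cap.max_recv_sge = 1;
+	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	init_attr.qp_type = IB_QPT_RC;
+	return (ib_create_qp(pd, &init_attr));
+}
+#endif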
+
+/**
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ * transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask);
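+
+/*
+ * Illustrative sketch (under #if 0): the RESET->INIT transition for
+ * an RC QP.  The attribute mask shown is what the IB spec requires
+ * for this transition; ib_modify_qp_is_ok() above is the authority.
+ */
+#if 0
+static int
+example_qp_to_init(struct ib_qp *qp, u8 port)
+{
+	struct ib_qp_attr attr;
+
+	bzero(&attr, sizeof(attr));
+	attr.qp_state = IB_QPS_INIT;
+	attr.pkey_index = 0;
+	attr.port_num = port;
+	attr.qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+	return (ib_modify_qp(qp, &attr,
+	    IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT |
+	    IB_QP_ACCESS_FLAGS));
+}
+#endif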
+
+/**
+ * ib_query_qp - Returns the attribute list and current values for the
+ * specified QP.
+ * @qp: The QP to query.
+ * @qp_attr: The attributes of the specified QP.
+ * @qp_attr_mask: A bit-mask used to select specific attributes to query.
+ * @qp_init_attr: Additional attributes of the selected QP.
+ *
+ * The qp_attr_mask may be used to limit the query to gathering only the
+ * selected attributes.
+ */
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ */
+int ib_destroy_qp(struct ib_qp *qp);
+
+/**
+ * ib_post_send - Posts a list of work requests to the send queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @send_wr: A list of work requests to post on the send queue.
+ * @bad_send_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_send(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr)
+{
+ return qp->device->post_send(qp, send_wr, bad_send_wr);
+}
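+
+/*
+ * Illustrative sketch (under #if 0): posting one signaled SEND with a
+ * single scatter/gather element.  The address and lkey are assumed to
+ * come from a previously registered memory region.
+ */
+#if 0
+static int
+example_post_send(struct ib_qp *qp, u64 addr, u32 len, u32 lkey)
+{
+	struct ib_sge sge;
+	struct ib_send_wr wr, *bad_wr;
+
+	sge.addr = addr;		/* within a registered MR */
+	sge.length = len;
+	sge.lkey = lkey;
+
+	bzero(&wr, sizeof(wr));
+	wr.wr_id = 1;			/* echoed back in ib_wc.wr_id */
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+	return (ib_post_send(qp, &wr, &bad_wr));
+}
+#endif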
+
+/**
+ * ib_post_recv - Posts a list of work requests to the receive queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_recv(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
+}
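+
+/*
+ * Illustrative sketch (under #if 0): the receive side of the send
+ * example above -- one recv work request with a single SGE.
+ */
+#if 0
+static int
+example_post_recv(struct ib_qp *qp, struct ib_sge *sge)
+{
+	struct ib_recv_wr wr, *bad_wr;
+
+	bzero(&wr, sizeof(wr));
+	wr.wr_id = 2;
+	wr.sg_list = sge;
+	wr.num_sge = 1;
+	return (ib_post_recv(qp, &wr, &bad_wr));
+}
+#endif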
+
+/**
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ * completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ * asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ * the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ * @comp_vector: Completion vector used to signal completion events.
+ * Must be >= 0 and < context->num_comp_vectors.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe, int comp_vector);
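+
+/*
+ * Illustrative sketch (under #if 0): creating a CQ with a completion
+ * callback and no async event handler.  Vector 0 is valid whenever
+ * the device reports num_comp_vectors > 0; 256 CQEs is arbitrary.
+ */
+#if 0
+static void
+example_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	/* typically: schedule a poll of this CQ */
+}
+
+static struct ib_cq *
+example_create_cq(struct ib_device *device, void *ctx)
+{
+	return (ib_create_cq(device, example_comp_handler,
+	    NULL /* no event handler */, ctx,
+	    256 /* minimum CQE count */, 0 /* comp_vector */));
+}
+#endif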
+
+/**
+ * ib_resize_cq - Modifies the capacity of the CQ.
+ * @cq: The CQ to resize.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe);
+
+/**
+ * ib_destroy_cq - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ */
+int ib_destroy_cq(struct ib_cq *cq);
+
+/**
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ * will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions. If the return value
+ * is < 0, an error occurred. If the return value is >= 0, it is the
+ * number of completions returned. If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
+ struct ib_wc *wc)
+{
+ return cq->device->poll_cq(cq, num_entries, wc);
+}
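+
+/*
+ * Illustrative sketch (under #if 0): draining a CQ.  Per the comment
+ * above, a non-negative return smaller than num_entries means the CQ
+ * has been emptied.
+ */
+#if 0
+static void
+example_drain_cq(struct ib_cq *cq)
+{
+	struct ib_wc wc[16];
+	int i, n;
+
+	while ((n = ib_poll_cq(cq, 16, wc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (wc[i].status != IB_WC_SUCCESS) {
+				/* handle the failed work request */
+			}
+		}
+		if (n < 16)
+			break;		/* CQ emptied */
+	}
+}
+#endif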
+
+/**
+ * ib_peek_cq - Returns the number of unreaped completions currently
+ * on the specified CQ.
+ * @cq: The CQ to peek.
+ * @wc_cnt: A minimum number of unreaped completions to check for.
+ *
+ * If the number of unreaped completions is greater than or equal to wc_cnt,
+ * this function returns wc_cnt, otherwise, it returns the actual number of
+ * unreaped completions.
+ */
+int ib_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+/**
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @flags:
+ * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
+ * to request an event on the next solicited event or next work
+ * completion of any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
+ * may also be |ed in to request a hint about missed events, as
+ * described below.
+ *
+ * Return Value:
+ * < 0 means an error occurred while requesting notification
+ * == 0 means notification was requested successfully, and if
+ * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
+ * were missed and it is safe to wait for another event. In
+ * this case it is guaranteed that any work completions added
+ * to the CQ since the last CQ poll will trigger a completion
+ * notification event.
+ * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
+ * in. It means that the consumer must poll the CQ again to
+ * make sure it is empty to avoid missing an event because of a
+ * race between requesting notification and an entry being
+ * added to the CQ. This return value means it is possible
+ * (but not guaranteed) that a work completion has been added
+ * to the CQ since the last poll without triggering a
+ * completion notification event.
+ */
+static inline int ib_req_notify_cq(struct ib_cq *cq,
+ enum ib_cq_notify_flags flags)
+{
+ return cq->device->req_notify_cq(cq, flags);
+}
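+
+/*
+ * Illustrative sketch (under #if 0): the poll/re-arm loop implied by
+ * the return-value contract above.  A positive return with
+ * IB_CQ_REPORT_MISSED_EVENTS set sends the consumer back to polling
+ * instead of sleeping on the notification.
+ */
+#if 0
+static void
+example_poll_and_rearm(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int rc;
+
+	for (;;) {
+		while (ib_poll_cq(cq, 1, &wc) > 0) {
+			/* consume the completion */
+		}
+		rc = ib_req_notify_cq(cq,
+		    IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+		if (rc <= 0)
+			break;	/* armed (0) or error (< 0) */
+		/* rc > 0: a completion may have slipped in; poll again */
+	}
+}
+#endif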
+
+/**
+ * ib_req_ncomp_notif - Request completion notification when there are
+ * at least the specified number of unreaped completions on the CQ.
+ * @cq: The CQ to generate an event for.
+ * @wc_cnt: The number of unreaped completions that should be on the
+ * CQ before an event is generated.
+ */
+static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
+{
+ return cq->device->req_ncomp_notif ?
+ cq->device->req_ncomp_notif(cq, wc_cnt) :
+ ENOSYS;
+}
+
+/**
+ * ib_get_dma_mr - Returns a memory region for system memory that is
+ * usable for DMA.
+ * @pd: The protection domain associated with the memory region.
+ * @mr_access_flags: Specifies the memory access rights.
+ *
+ * Note that the ib_dma_*() functions defined below must be used
+ * to create/destroy addresses used with the Lkey or Rkey returned
+ * by ib_get_dma_mr().
+ */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+#ifdef notyet
+/**
+ * ib_dma_mapping_error - check a DMA addr for error
+ * @dev: The device for which the dma_addr was created
+ * @dma_addr: The DMA address to check
+ */
+static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->mapping_error(dev, dma_addr);
+ return dma_mapping_error(dma_addr);
+}
+
+/**
+ * ib_dma_map_single - Map a kernel virtual address to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @cpu_addr: The kernel virtual address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_single(struct ib_device *dev,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->map_single(dev, cpu_addr, size, direction);
+ return dma_map_single(dev->dma_device, cpu_addr, size, direction);
+}
+
+/**
+ * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_single(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->unmap_single(dev, addr, size, direction);
+ else
+ dma_unmap_single(dev->dma_device, addr, size, direction);
+}
+
+/**
+ * ib_dma_map_page - Map a physical page to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @page: The page to be mapped
+ * @offset: The offset within the page
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_page(struct ib_device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->map_page(dev, page, offset, size, direction);
+ return dma_map_page(dev->dma_device, page, offset, size, direction);
+}
+
+/**
+ * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_page(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->unmap_page(dev, addr, size, direction);
+ else
+ dma_unmap_page(dev->dma_device, addr, size, direction);
+}
+
+/**
+ * ib_dma_map_sg - Map a scatter/gather list to DMA addresses
+ * @dev: The device for which the DMA addresses are to be created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline int ib_dma_map_sg(struct ib_device *dev,
+ struct rdma_scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->map_sg(dev, sg, nents, direction);
+ return dma_map_sg(dev->dma_device, sg, nents, direction);
+}
+
+/**
+ * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_sg(struct ib_device *dev,
+ struct rdma_scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->unmap_sg(dev, sg, nents, direction);
+ else
+ dma_unmap_sg(dev->dma_device, sg, nents, direction);
+}
+
+/**
+ * ib_sg_dma_address - Return the DMA address from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ */
+static inline u64 ib_sg_dma_address(struct ib_device *dev,
+ struct rdma_scatterlist *sg)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->dma_address(dev, sg);
+ return sg_dma_address(sg);
+}
+
+/**
+ * ib_sg_dma_len - Return the DMA length from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ */
+static inline unsigned int ib_sg_dma_len(struct ib_device *dev,
+ struct rdma_scatterlist *sg)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->dma_len(dev, sg);
+ return sg_dma_len(sg);
+}
+
+/**
+ * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir);
+ else
+ dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_device(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->sync_single_for_device(dev, addr, size, dir);
+ else
+ dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_alloc_coherent - Allocate memory and map it for DMA
+ * @dev: The device for which the DMA address is requested
+ * @size: The size of the region to allocate in bytes
+ * @dma_handle: A pointer for returning the DMA address of the region
+ * @flag: memory allocator flags
+ */
+static inline void *ib_dma_alloc_coherent(struct ib_device *dev,
+ size_t size,
+ u64 *dma_handle,
+ gfp_t flag)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag);
+ else {
+ dma_addr_t handle;
+ void *ret;
+
+ ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag);
+ *dma_handle = handle;
+ return ret;
+ }
+}
+
+/**
+ * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent()
+ * @dev: The device for which the DMA addresses were allocated
+ * @size: The size of the region
+ * @cpu_addr: the address returned by ib_dma_alloc_coherent()
+ * @dma_handle: the DMA address returned by ib_dma_alloc_coherent()
+ */
+static inline void ib_dma_free_coherent(struct ib_device *dev,
+ size_t size, void *cpu_addr,
+ u64 dma_handle)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
+ else
+ dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
+}
+#endif
+/**
+ * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
+ * by an HCA.
+ * @pd: The protection domain assigned to the registered region.
+ * @phys_buf_array: Specifies a list of physical buffers to use in the
+ * memory region.
+ * @num_phys_buf: Specifies the size of the phys_buf_array.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+
+/**
+ * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
+ * Conceptually, this call performs a deregister memory region operation
+ * followed by a register physical memory region operation. Where possible,
+ * resources are reused instead of deallocated and reallocated.
+ * @mr: The memory region to modify.
+ * @mr_rereg_mask: A bit-mask used to indicate which of the following
+ * properties of the memory region are being modified.
+ * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
+ * the new protection domain to associate with the memory region,
+ * otherwise, this parameter is ignored.
+ * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ * field specifies a list of physical buffers to use in the new
+ * translation, otherwise, this parameter is ignored.
+ * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ * field specifies the size of the phys_buf_array, otherwise, this
+ * parameter is ignored.
+ * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
+ * field specifies the new memory access rights, otherwise, this
+ * parameter is ignored.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+
+/**
+ * ib_query_mr - Retrieves information about a specific memory region.
+ * @mr: The memory region to retrieve information about.
+ * @mr_attr: The attributes of the specified memory region.
+ */
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+/**
+ * ib_dereg_mr - Deregisters a memory region and removes it from the
+ * HCA translation table.
+ * @mr: The memory region to deregister.
+ */
+int ib_dereg_mr(struct ib_mr *mr);
+
+/**
+ * ib_alloc_mw - Allocates a memory window.
+ * @pd: The protection domain associated with the memory window.
+ */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd);
+
+/**
+ * ib_bind_mw - Posts a work request to the send queue of the specified
+ * QP, which binds the memory window to the given address range and
+ * remote access attributes.
+ * @qp: QP to post the bind work request on.
+ * @mw: The memory window to bind.
+ * @mw_bind: Specifies information about the memory window, including
+ * its address range, remote access rights, and associated memory region.
+ */
+static inline int ib_bind_mw(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ /* XXX reference counting in corresponding MR? */
+ return mw->device->bind_mw ?
+ mw->device->bind_mw(qp, mw, mw_bind) :
+ ENOSYS;
+}
+
+/**
+ * ib_dealloc_mw - Deallocates a memory window.
+ * @mw: The memory window to deallocate.
+ */
+int ib_dealloc_mw(struct ib_mw *mw);
+
+/**
+ * ib_alloc_fmr - Allocates an unmapped fast memory region.
+ * @pd: The protection domain associated with the unmapped region.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @fmr_attr: Attributes of the unmapped region.
+ *
+ * A fast memory region must be mapped before it can be used as part of
+ * a work request.
+ */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+/**
+ * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
+ * @fmr: The fast memory region to associate with the pages.
+ * @page_list: An array of physical pages to map to the fast memory region.
+ * @list_len: The number of pages in page_list.
+ * @iova: The I/O virtual address to use with the mapped region.
+ */
+static inline int ib_map_phys_fmr(struct ib_fmr *fmr,
+ u64 *page_list, int list_len,
+ u64 iova)
+{
+ return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova);
+}
+
+/**
+ * ib_unmap_fmr - Removes the mapping from a list of fast memory regions.
+ * @fmr_list: A linked list of fast memory regions to unmap.
+ */
+int ib_unmap_fmr(struct ib_fmr_list_head *fmr_list);
+
+/**
+ * ib_dealloc_fmr - Deallocates a fast memory region.
+ * @fmr: The fast memory region to deallocate.
+ */
+int ib_dealloc_fmr(struct ib_fmr *fmr);
+
+/**
+ * ib_attach_mcast - Attaches the specified QP to a multicast group.
+ * @qp: QP to attach to the multicast group. The QP must be type
+ * IB_QPT_UD.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ *
+ * In order to send and receive multicast packets, subnet
+ * administration must have created the multicast group and configured
+ * the fabric appropriately. The port associated with the specified
+ * QP must also be a member of the multicast group.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_detach_mcast - Detaches the specified QP from a multicast group.
+ * @qp: QP to detach from the multicast group.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+#endif /* IB_VERBS_H */
diff --git a/sys/contrib/rdma/iw_cm.h b/sys/contrib/rdma/iw_cm.h
new file mode 100644
index 000000000000..e594d669375a
--- /dev/null
+++ b/sys/contrib/rdma/iw_cm.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+#ifndef IW_CM_H
+#define IW_CM_H
+
+#include <contrib/rdma/ib_cm.h>
+
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+struct iw_cm_id;
+
+enum iw_cm_event_type {
+ IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */
+ IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */
+ IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */
+ IW_CM_EVENT_DISCONNECT, /* orderly shutdown */
+ IW_CM_EVENT_CLOSE /* close complete */
+};
+
+enum iw_cm_event_status {
+ IW_CM_EVENT_STATUS_OK = 0, /* request successful */
+ IW_CM_EVENT_STATUS_ACCEPTED = 0, /* connect request accepted */
+ IW_CM_EVENT_STATUS_REJECTED, /* connect request rejected */
+ IW_CM_EVENT_STATUS_TIMEOUT, /* the operation timed out */
+ IW_CM_EVENT_STATUS_RESET, /* reset from remote peer */
+	IW_CM_EVENT_STATUS_EINVAL,	 /* asynchronous failure for bad parameter */
+};
+
+struct iw_cm_event {
+ enum iw_cm_event_type event;
+ enum iw_cm_event_status status;
+ struct sockaddr_in local_addr;
+ struct sockaddr_in remote_addr;
+ void *private_data;
+ u8 private_data_len;
+ void* provider_data;
+ struct socket *so;
+};
+
+/**
+ * iw_cm_handler - Function to be called by the IW CM when delivering events
+ * to the client.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id,
+ struct iw_cm_event *event);
+
+/**
+ * iw_event_handler - Function called by the provider when delivering provider
+ * events to the IW CM. Returns either 0 indicating the event was processed
+ * or -errno if the event could not be processed.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_event_handler)(struct iw_cm_id *cm_id,
+ struct iw_cm_event *event);
+
+struct iw_cm_id {
+ iw_cm_handler cm_handler; /* client callback function */
+ void *context; /* client cb context */
+ struct ib_device *device;
+ struct sockaddr_in local_addr;
+ struct sockaddr_in remote_addr;
+ void *provider_data; /* provider private data */
+ iw_event_handler event_handler; /* cb for provider
+ events */
+ /* Used by provider to add and remove refs on IW cm_id */
+ void (*add_ref)(struct iw_cm_id *);
+ void (*rem_ref)(struct iw_cm_id *);
+ struct socket *so;
+};
+
+struct iw_cm_conn_param {
+ const void *private_data;
+ u16 private_data_len;
+ u32 ord;
+ u32 ird;
+ u32 qpn;
+};
+
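+/*
+ * Provider entry points: an iWARP device driver fills in one of these
+ * so the IWCM can dispatch connection management operations to the
+ * hardware-specific implementation.
+ */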
+struct iw_cm_verbs {
+ void (*add_ref)(struct ib_qp *qp);
+
+ void (*rem_ref)(struct ib_qp *qp);
+
+ struct ib_qp * (*get_qp)(struct ib_device *device,
+ int qpn);
+
+ int (*connect)(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *conn_param);
+
+ int (*accept)(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *conn_param);
+
+ int (*reject)(struct iw_cm_id *cm_id,
+ const void *pdata, u8 pdata_len);
+
+ int (*create_listen)(struct iw_cm_id *cm_id,
+ int backlog);
+
+ int (*destroy_listen)(struct iw_cm_id *cm_id);
+};
+
+/**
+ * iw_create_cm_id - Create an IW CM identifier.
+ *
+ * @device: The IB device on which to create the IW CM identifier.
+ * @so: The socket to be used for establishing the RDMA connection.
+ * @cm_handler: User callback invoked to report events associated with the
+ * returned IW CM identifier.
+ * @context: User specified context associated with the id.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ struct socket *so,
+ iw_cm_handler cm_handler, void *context);
+
+/**
+ * iw_destroy_cm_id - Destroy an IW CM identifier.
+ *
+ * @cm_id: The previously created IW CM identifier to destroy.
+ *
+ * The client can assume that no events will be delivered for the CM ID after
+ * this function returns.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id);
+
+/**
+ * iw_cm_unbind_qp - Unbind the specified IW CM identifier and QP
+ *
+ * @cm_id: The IW CM identifier to unbind from the QP.
+ * @qp: The QP
+ *
+ * This is called by the provider when destroying the QP to ensure
+ * that any references held by the IWCM are released. It may also
+ * be called by the IWCM when destroying a CM_ID so that any
+ * references held by the provider are released.
+ */
+void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp);
+
+/**
+ * iw_cm_get_qp - Return the ib_qp associated with a QPN
+ *
+ * @device: The IB device
+ * @qpn: The queue pair number
+ */
+struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn);
+
+/**
+ * iw_cm_listen - Listen for incoming connection requests on the
+ * specified IW CM id.
+ *
+ * @cm_id: The IW CM identifier.
+ * @backlog: The maximum number of outstanding un-accepted inbound listen
+ * requests to queue.
+ *
+ * The source address and port number are specified in the IW CM identifier
+ * structure.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog);
+
+/**
+ * iw_cm_accept - Called to accept an incoming connect request.
+ *
+ * @cm_id: The IW CM identifier associated with the connection request.
+ * @iw_param: Pointer to a structure containing connection establishment
+ * parameters.
+ *
+ * The specified cm_id will have been provided in the event data for a
+ * CONNECT_REQUEST event. Subsequent events related to this connection will be
+ * delivered to the specified IW CM identifier and may occur prior to the
+ * return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_reject - Reject an incoming connection request.
+ *
+ * @cm_id: Connection identifier associated with the request.
+ * @private_data: Pointer to data to deliver to the remote peer as part of the
+ * reject message.
+ * @private_data_len: The number of bytes in the private_data parameter.
+ *
+ * The client can assume that no events will be delivered to the specified IW
+ * CM identifier following the return of this function. The private_data
+ * buffer is available for reuse when this function returns.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data,
+ u8 private_data_len);
+
+/**
+ * iw_cm_connect - Called to request a connection to a remote peer.
+ *
+ * @cm_id: The IW CM identifier for the connection.
+ * @iw_param: Pointer to a structure containing connection establishment
+ * parameters.
+ *
+ * Events may be delivered to the specified IW CM identifier prior to the
+ * return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_disconnect - Close the specified connection.
+ *
+ * @cm_id: The IW CM identifier to close.
+ * @abrupt: If 0, the connection will be closed gracefully; otherwise, the
+ * connection will be reset.
+ *
+ * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is
+ * delivered.
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt);
+
+/**
+ * iw_cm_init_qp_attr - Called to initialize the attributes of the QP
+ * associated with an IW CM identifier.
+ *
+ * @cm_id: The IW CM identifier associated with the QP
+ * @qp_attr: Pointer to the QP attributes structure.
+ * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are
+ * valid.
+ */
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
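+
+/*
+ * Illustrative active-side call sequence (a sketch only, not part of
+ * the API; my_dev, my_so, my_handler, my_ctx, my_qpn, and peer_sin are
+ * hypothetical placeholders):
+ *
+ *	struct iw_cm_conn_param p = { .ord = 1, .ird = 1, .qpn = my_qpn };
+ *	struct iw_cm_id *id;
+ *
+ *	id = iw_create_cm_id(my_dev, my_so, my_handler, my_ctx);
+ *	id->remote_addr = peer_sin;
+ *	error = iw_cm_connect(id, &p);
+ *	(handler receives IW_CM_EVENT_CONNECT_REPLY)
+ *	iw_cm_disconnect(id, 0);
+ *	(handler receives IW_CM_EVENT_CLOSE, then:)
+ *	iw_destroy_cm_id(id);
+ */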
+
+#endif /* IW_CM_H */
diff --git a/sys/contrib/rdma/krping/getopt.c b/sys/contrib/rdma/krping/getopt.c
new file mode 100644
index 000000000000..701910ea8b33
--- /dev/null
+++ b/sys/contrib/rdma/krping/getopt.c
@@ -0,0 +1,77 @@
+/*
+ * lifted from fs/ncpfs/getopt.c
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/ctype.h>
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include "getopt.h"
+
+/**
+ * krping_getopt - option parser
+ * @caller: name of the caller, for error messages
+ * @options: the options string
+ * @opts: an array of &struct krping_option entries controlling parser
+ * operations
+ * @optopt: output; may be NULL; will contain the current option
+ * @optarg: output; will contain the value (if one exists)
+ * @value: output; will be overwritten with the integer value of the
+ * current argument
+ *
+ * Helper to parse options in the format used by mount ("a=b,c=d,e,f").
+ * Returns opts->val if a matching entry in the 'opts' array is found,
+ * 0 when no more tokens are found, or a negative errno if an error is
+ * encountered.
+ */
+int krping_getopt(const char *caller, char **options,
+ const struct krping_option *opts, char **optopt,
+ char **optarg, unsigned long *value)
+{
+ char *token;
+ char *val;
+
+ do {
+ if ((token = strsep(options, ",")) == NULL)
+ return 0;
+ } while (*token == '\0');
+ if (optopt)
+ *optopt = token;
+
+ if ((val = strchr (token, '=')) != NULL) {
+ *val++ = 0;
+ }
+ *optarg = val;
+ for (; opts->name; opts++) {
+ if (!strcmp(opts->name, token)) {
+ if (!val) {
+ if (opts->has_arg & OPT_NOPARAM) {
+ return opts->val;
+ }
+ printf("%s: the %s option requires "
+ "an argument\n", caller, token);
+ return -EINVAL;
+ }
+ if (opts->has_arg & OPT_INT) {
+ char* v;
+
+ *value = strtoul(val, &v, 0);
+ if (!*v) {
+ return opts->val;
+ }
+ printf("%s: invalid numeric value "
+ "in %s=%s\n", caller, token, val);
+ return -EDOM;
+ }
+ if (opts->has_arg & OPT_STRING) {
+ return opts->val;
+ }
+ printf("%s: unexpected argument %s to the "
+ "%s option\n", caller, val, token);
+ return -EINVAL;
+ }
+ }
+ printf("%s: Unrecognized option %s\n", caller, token);
+ return -EOPNOTSUPP;
+}
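+
+/*
+ * Example (hypothetical input): with the option table in krping.c,
+ * parsing "port=9999,verbose" returns 'p' with *value == 9999, then
+ * 'v' with no argument, then 0 once the string is exhausted.
+ */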
diff --git a/sys/contrib/rdma/krping/getopt.h b/sys/contrib/rdma/krping/getopt.h
new file mode 100644
index 000000000000..610ec7625424
--- /dev/null
+++ b/sys/contrib/rdma/krping/getopt.h
@@ -0,0 +1,21 @@
+/*
+ * lifted from fs/ncpfs/getopt.c
+ *
+ * $FreeBSD$
+ */
+#ifndef _KRPING_GETOPT_H
+#define _KRPING_GETOPT_H
+
+#define OPT_NOPARAM 1
+#define OPT_INT 2
+#define OPT_STRING 4
+struct krping_option {
+ const char *name;
+ unsigned int has_arg;
+ int val;
+};
+
+extern int krping_getopt(const char *caller, char **options, const struct krping_option *opts,
+ char **optopt, char **optarg, unsigned long *value);
+
+#endif /* _KRPING_GETOPT_H */
diff --git a/sys/contrib/rdma/krping/krping.c b/sys/contrib/rdma/krping/krping.c
new file mode 100644
index 000000000000..202daf85eda6
--- /dev/null
+++ b/sys/contrib/rdma/krping/krping.c
@@ -0,0 +1,1865 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/ctype.h>
+
+#include <sys/param.h>
+#include <sys/condvar.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/module.h>
+#include <sys/endian.h>
+#include <sys/limits.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/syslog.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <contrib/rdma/rdma_cm.h>
+
+#include "getopt.h"
+#include "krping.h"
+
+#define PFX "krping: "
+
+static int debug = 0;
+#define DEBUG_LOG if (debug) printf
+
+static const struct krping_option krping_opts[] = {
+ {"count", OPT_INT, 'C'},
+ {"size", OPT_INT, 'S'},
+ {"addr", OPT_STRING, 'a'},
+ {"port", OPT_INT, 'p'},
+ {"verbose", OPT_NOPARAM, 'v'},
+ {"validate", OPT_NOPARAM, 'V'},
+ {"server", OPT_NOPARAM, 's'},
+ {"client", OPT_NOPARAM, 'c'},
+ {"dmamr", OPT_NOPARAM, 'D'},
+ {"debug", OPT_NOPARAM, 'd'},
+ {"wlat", OPT_NOPARAM, 'l'},
+ {"rlat", OPT_NOPARAM, 'L'},
+ {"bw", OPT_NOPARAM, 'B'},
+ {"tx-depth", OPT_INT, 't'},
+ {"poll", OPT_NOPARAM, 'P'},
+ {NULL, 0, 0}
+};
+
+struct mtx krping_mutex;
+
+/*
+ * List of running krping threads.
+ */
+struct krping_cb_list krping_cbs;
+
+/*
+ * krping "ping/pong" loop:
+ * client sends source rkey/addr/len
+ * server receives source rkey/add/len
+ * server rdma reads "ping" data from source
+ * server sends "go ahead" on rdma read completion
+ * client sends sink rkey/addr/len
+ * server receives sink rkey/addr/len
+ * server rdma writes "pong" data to sink
+ * server sends "go ahead" on rdma write completion
+ * <repeat loop>
+ */
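+/*
+ * Each pass advances cb->state through RDMA_READ_ADV, RDMA_WRITE_ADV,
+ * and RDMA_WRITE_COMPLETE (see enum test_state in krping.h); the CQ
+ * handler drives the transitions and wakes the waiting thread.
+ */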
+
+/*
+ * Default max buffer size for IO...
+ */
+#define RPING_BUFSIZE 128*1024
+#define RPING_SQ_DEPTH 32
+
+
+/* lifted from netinet/libalias/alias_proxy.c */
+static int inet_aton(const char *cp, struct in_addr *addr);
+static int
+inet_aton(const char *cp, struct in_addr *addr)
+{
+ u_long parts[4];
+ in_addr_t val;
+ const char *c;
+ char *endptr;
+ int gotend, n;
+
+ c = (const char *)cp;
+ n = 0;
+ /*
+ * Run through the string, grabbing numbers until
+ * the end of the string, or some error
+ */
+ gotend = 0;
+ while (!gotend) {
+ unsigned long l;
+
+ l = strtoul(c, &endptr, 0);
+
+ if (l == ULONG_MAX || (l == 0 && endptr == c))
+ return (0);
+
+ val = (in_addr_t)l;
+ /*
+ * If the whole string is invalid, endptr will equal
+ * c.. this way we can make sure someone hasn't
+ * gone '.12' or something which would get past
+ * the next check.
+ */
+ if (endptr == c)
+ return (0);
+ parts[n] = val;
+ c = endptr;
+
+ /* Check the next character past the previous number's end */
+ switch (*c) {
+ case '.' :
+ /* Make sure we only do 3 dots .. */
+ if (n == 3) /* Whoops. Quit. */
+ return (0);
+ n++;
+ c++;
+ break;
+
+ case '\0':
+ gotend = 1;
+ break;
+
+ default:
+ if (isspace((unsigned char)*c)) {
+ gotend = 1;
+ break;
+ } else
+ return (0); /* Invalid character, so fail */
+ }
+
+ }
+
+ /*
+ * Concoct the address according to
+ * the number of parts specified.
+ */
+
+ switch (n) {
+ case 0: /* a -- 32 bits */
+ /*
+ * Nothing is necessary here. Overflow checking was
+ * already done in strtoul().
+ */
+ break;
+ case 1: /* a.b -- 8.24 bits */
+ if (val > 0xffffff || parts[0] > 0xff)
+ return (0);
+ val |= parts[0] << 24;
+ break;
+
+ case 2: /* a.b.c -- 8.8.16 bits */
+ if (val > 0xffff || parts[0] > 0xff || parts[1] > 0xff)
+ return (0);
+ val |= (parts[0] << 24) | (parts[1] << 16);
+ break;
+
+ case 3: /* a.b.c.d -- 8.8.8.8 bits */
+ if (val > 0xff || parts[0] > 0xff || parts[1] > 0xff ||
+ parts[2] > 0xff)
+ return (0);
+ val |= (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8);
+ break;
+ }
+
+ if (addr != NULL)
+ addr->s_addr = htonl(val);
+ return (1);
+}
+
+
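+/*
+ * Block the calling thread until cb->state reaches at least the given
+ * state; an interrupted sleep forces the control block into ERROR.
+ */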
+static void krping_wait(struct krping_cb *cb, int state)
+{
+ int rc;
+ mtx_lock(&cb->lock);
+ while (cb->state < state) {
+ rc = msleep(cb, &cb->lock, 0, "krping", 0);
+ if (rc && rc != ERESTART) {
+ cb->state = ERROR;
+ break;
+ }
+ }
+ mtx_unlock(&cb->lock);
+}
+
+static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
+{
+ int ret;
+ struct krping_cb *cb = cma_id->context;
+
+ DEBUG_LOG(PFX "cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
+ (cma_id == cb->cm_id) ? "parent" : "child");
+
+ mtx_lock(&cb->lock);
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ cb->state = ADDR_RESOLVED;
+ ret = rdma_resolve_route(cma_id, 2000);
+ if (ret) {
+ log(LOG_ERR, "rdma_resolve_route error %d\n",
+ ret);
+ wakeup(cb);
+ }
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ cb->state = ROUTE_RESOLVED;
+ wakeup(cb);
+ break;
+
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ cb->state = CONNECT_REQUEST;
+ cb->child_cm_id = cma_id;
+ DEBUG_LOG(PFX "child cma %p\n", cb->child_cm_id);
+ wakeup(cb);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ DEBUG_LOG(PFX "ESTABLISHED\n");
+ if (!cb->server) {
+ cb->state = CONNECTED;
+ wakeup(cb);
+ }
+ break;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ log(LOG_ERR, "cma event %d, error %d\n", event->event,
+ event->status);
+ cb->state = ERROR;
+ wakeup(cb);
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ DEBUG_LOG(PFX "DISCONNECT EVENT...\n");
+ cb->state = ERROR;
+ wakeup(cb);
+ break;
+
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ DEBUG_LOG(PFX "cma detected device removal!!!!\n");
+ break;
+
+ default:
+ log(LOG_ERR, "oof bad type!\n");
+ wakeup(cb);
+ break;
+ }
+ mtx_unlock(&cb->lock);
+ return 0;
+}
+
+static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
+{
+ if (wc->byte_len != sizeof(cb->recv_buf)) {
+ log(LOG_ERR, "Received bogus data, size %d\n",
+ wc->byte_len);
+ return -1;
+ }
+
+ cb->remote_rkey = ntohl(cb->recv_buf.rkey);
+ cb->remote_addr = ntohll(cb->recv_buf.buf);
+ cb->remote_len = ntohl(cb->recv_buf.size);
+ DEBUG_LOG(PFX "Received rkey %x addr %llx len %d from peer\n",
+ cb->remote_rkey, (unsigned long long)cb->remote_addr,
+ cb->remote_len);
+
+ if (cb->state <= CONNECTED || cb->state == RDMA_WRITE_COMPLETE)
+ cb->state = RDMA_READ_ADV;
+ else
+ cb->state = RDMA_WRITE_ADV;
+
+ return 0;
+}
+
+static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
+{
+ if (wc->byte_len != sizeof(cb->recv_buf)) {
+ log(LOG_ERR, "Received bogus data, size %d\n",
+ wc->byte_len);
+ return -1;
+ }
+
+ if (cb->state == RDMA_READ_ADV)
+ cb->state = RDMA_WRITE_ADV;
+ else
+ cb->state = RDMA_WRITE_COMPLETE;
+
+ return 0;
+}
+
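+/*
+ * CQ callback: drain completions, advance cb->state based on each
+ * opcode, repost the receive WR after an IB_WC_RECV, and wake any
+ * thread sleeping in krping_wait().
+ */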
+static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
+{
+ struct krping_cb *cb = ctx;
+ struct ib_wc wc;
+ struct ib_recv_wr *bad_wr;
+ int ret;
+
+ mtx_lock(&cb->lock);
+ KASSERT(cb->cq == cq, ("bad condition"));
+ if (cb->state == ERROR) {
+ log(LOG_ERR, "cq completion in ERROR state\n");
+ mtx_unlock(&cb->lock);
+ return;
+ }
+ if (!cb->wlat && !cb->rlat && !cb->bw)
+ ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
+ while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
+ if (wc.status) {
+ if (wc.status != IB_WC_WR_FLUSH_ERR)
+ log(LOG_ERR, "cq completion failed status %d\n",
+ wc.status);
+ goto error;
+ }
+
+ switch (wc.opcode) {
+ case IB_WC_SEND:
+ DEBUG_LOG(PFX "send completion\n");
+ cb->stats.send_bytes += cb->send_sgl.length;
+ cb->stats.send_msgs++;
+ break;
+
+ case IB_WC_RDMA_WRITE:
+ DEBUG_LOG(PFX "rdma write completion\n");
+ cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
+ cb->stats.write_msgs++;
+ cb->state = RDMA_WRITE_COMPLETE;
+ wakeup(cb);
+ break;
+
+ case IB_WC_RDMA_READ:
+ DEBUG_LOG(PFX "rdma read completion\n");
+ cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
+ cb->stats.read_msgs++;
+ cb->state = RDMA_READ_COMPLETE;
+ wakeup(cb);
+ break;
+
+ case IB_WC_RECV:
+ DEBUG_LOG(PFX "recv completion\n");
+ cb->stats.recv_bytes += sizeof(cb->recv_buf);
+ cb->stats.recv_msgs++;
+ if (cb->wlat || cb->rlat || cb->bw)
+ ret = server_recv(cb, &wc);
+ else
+ ret = cb->server ? server_recv(cb, &wc) :
+ client_recv(cb, &wc);
+ if (ret) {
+ log(LOG_ERR, "recv wc error: %d\n", ret);
+ goto error;
+ }
+
+ ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post recv error: %d\n",
+ ret);
+ goto error;
+ }
+ wakeup(cb);
+ break;
+
+ default:
+ log(LOG_ERR, "unknown!!!!! completion\n");
+ goto error;
+ }
+ }
+ if (ret) {
+ log(LOG_ERR, "poll error %d\n", ret);
+ goto error;
+ }
+ mtx_unlock(&cb->lock);
+ return;
+error:
+ cb->state = ERROR;
+ wakeup(cb);
+ mtx_unlock(&cb->lock);
+}
+
+static int krping_accept(struct krping_cb *cb)
+{
+ struct rdma_conn_param conn_param;
+ int ret;
+
+ DEBUG_LOG(PFX "accepting client connection request\n");
+
+ memset(&conn_param, 0, sizeof conn_param);
+ conn_param.responder_resources = 1;
+ conn_param.initiator_depth = 1;
+
+ ret = rdma_accept(cb->child_cm_id, &conn_param);
+ if (ret) {
+ log(LOG_ERR, "rdma_accept error: %d\n", ret);
+ return ret;
+ }
+
+ if (!cb->wlat && !cb->rlat && !cb->bw) {
+ krping_wait(cb, CONNECTED);
+ if (cb->state == ERROR) {
+ log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state);
+ return -1;
+ }
+ }
+ return 0;
+}
+
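+/*
+ * Pre-build the receive, send, and RDMA work requests.  Each uses a
+ * single SGE whose lkey comes from the device DMA MR when the dmamr
+ * option is set, or from the matching per-buffer MR otherwise.
+ */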
+static void krping_setup_wr(struct krping_cb *cb)
+{
+ /* XXX X86 only here... not mapping for dma! */
+ cb->recv_sgl.addr = vtophys(&cb->recv_buf);
+ cb->recv_sgl.length = sizeof cb->recv_buf;
+ if (cb->use_dmamr)
+ cb->recv_sgl.lkey = cb->dma_mr->lkey;
+ else
+ cb->recv_sgl.lkey = cb->recv_mr->lkey;
+ cb->rq_wr.sg_list = &cb->recv_sgl;
+ cb->rq_wr.num_sge = 1;
+
+ cb->send_sgl.addr = vtophys(&cb->send_buf);
+ cb->send_sgl.length = sizeof cb->send_buf;
+ if (cb->use_dmamr)
+ cb->send_sgl.lkey = cb->dma_mr->lkey;
+ else
+ cb->send_sgl.lkey = cb->send_mr->lkey;
+
+ cb->sq_wr.opcode = IB_WR_SEND;
+ cb->sq_wr.send_flags = IB_SEND_SIGNALED;
+ cb->sq_wr.sg_list = &cb->send_sgl;
+ cb->sq_wr.num_sge = 1;
+
+ cb->rdma_addr = vtophys(cb->rdma_buf);
+ cb->rdma_sgl.addr = cb->rdma_addr;
+ if (cb->use_dmamr)
+ cb->rdma_sgl.lkey = cb->dma_mr->lkey;
+ else
+ cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
+ cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
+ cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
+ cb->rdma_sq_wr.num_sge = 1;
+
+ if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
+ cb->start_addr = vtophys(cb->start_buf);
+ }
+}
+
+static int krping_setup_buffers(struct krping_cb *cb)
+{
+ int ret;
+ struct ib_phys_buf buf;
+ u64 iovbase;
+
+ DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
+
+ if (cb->use_dmamr) {
+ cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
+ IB_ACCESS_REMOTE_READ|
+ IB_ACCESS_REMOTE_WRITE);
+ if (IS_ERR(cb->dma_mr)) {
+ log(LOG_ERR, "reg_dmamr failed\n");
+ return PTR_ERR(cb->dma_mr);
+ }
+ } else {
+
+ buf.addr = vtophys(&cb->recv_buf);
+ buf.size = sizeof cb->recv_buf;
+ iovbase = vtophys(&cb->recv_buf);
+ cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
+ IB_ACCESS_LOCAL_WRITE,
+ &iovbase);
+
+ if (IS_ERR(cb->recv_mr)) {
+ log(LOG_ERR, "recv_buf reg_mr failed\n");
+ return PTR_ERR(cb->recv_mr);
+ }
+
+ buf.addr = vtophys(&cb->send_buf);
+ buf.size = sizeof cb->send_buf;
+ iovbase = vtophys(&cb->send_buf);
+ cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
+ 0, &iovbase);
+
+ if (IS_ERR(cb->send_mr)) {
+ log(LOG_ERR, "send_buf reg_mr failed\n");
+ ib_dereg_mr(cb->recv_mr);
+ return PTR_ERR(cb->send_mr);
+ }
+ }
+
+ cb->rdma_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK, 0, -1UL,
+ PAGE_SIZE, 0);
+
+ if (!cb->rdma_buf) {
+ log(LOG_ERR, "rdma_buf malloc failed\n");
+ ret = ENOMEM;
+ goto err1;
+ }
+ if (!cb->use_dmamr) {
+
+ buf.addr = vtophys(cb->rdma_buf);
+ buf.size = cb->size;
+ iovbase = vtophys(cb->rdma_buf);
+ cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
+ IB_ACCESS_REMOTE_READ|
+ IB_ACCESS_REMOTE_WRITE,
+ &iovbase);
+
+ if (IS_ERR(cb->rdma_mr)) {
+ log(LOG_ERR, "rdma_buf reg_mr failed\n");
+ ret = PTR_ERR(cb->rdma_mr);
+ goto err2;
+ }
+ }
+
+ if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
+ cb->start_buf = contigmalloc(cb->size, M_DEVBUF, M_WAITOK,
+ 0, -1UL, PAGE_SIZE, 0);
+ if (!cb->start_buf) {
+ log(LOG_ERR, "start_buf malloc failed\n");
+ ret = ENOMEM;
+ goto err2;
+ }
+ if (!cb->use_dmamr) {
+ unsigned flags = IB_ACCESS_REMOTE_READ;
+
+ if (cb->wlat || cb->rlat || cb->bw)
+ flags |= IB_ACCESS_REMOTE_WRITE;
+ buf.addr = vtophys(cb->start_buf);
+ buf.size = cb->size;
+ iovbase = vtophys(cb->start_buf);
+ cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
+ flags,
+ &iovbase);
+
+ if (IS_ERR(cb->start_mr)) {
+ log(LOG_ERR, "start_buf reg_mr failed\n");
+ ret = PTR_ERR(cb->start_mr);
+ goto err3;
+ }
+ }
+ }
+
+ krping_setup_wr(cb);
+ DEBUG_LOG(PFX "allocated & registered buffers...\n");
+ return 0;
+err3:
+ contigfree(cb->start_buf, cb->size, M_DEVBUF);
+
+ if (!cb->use_dmamr)
+ ib_dereg_mr(cb->rdma_mr);
+err2:
+ contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
+err1:
+ if (cb->use_dmamr)
+ ib_dereg_mr(cb->dma_mr);
+ else {
+ ib_dereg_mr(cb->recv_mr);
+ ib_dereg_mr(cb->send_mr);
+ }
+ return ret;
+}
+
+static void krping_free_buffers(struct krping_cb *cb)
+{
+ DEBUG_LOG(PFX "krping_free_buffers called on cb %p\n", cb);
+
+#if 0
+ dma_unmap_single(cb->pd->device->dma_device,
+ pci_unmap_addr(cb, recv_mapping),
+ sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
+ dma_unmap_single(cb->pd->device->dma_device,
+ pci_unmap_addr(cb, send_mapping),
+ sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
+ dma_unmap_single(cb->pd->device->dma_device,
+ pci_unmap_addr(cb, rdma_mapping),
+ cb->size, DMA_BIDIRECTIONAL);
+#endif
+ contigfree(cb->rdma_buf, cb->size, M_DEVBUF);
+ if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
+#if 0
+ dma_unmap_single(cb->pd->device->dma_device,
+ pci_unmap_addr(cb, start_mapping),
+ cb->size, DMA_BIDIRECTIONAL);
+#endif
+ contigfree(cb->start_buf, cb->size, M_DEVBUF);
+ }
+ if (cb->use_dmamr)
+ ib_dereg_mr(cb->dma_mr);
+ else {
+ ib_dereg_mr(cb->send_mr);
+ ib_dereg_mr(cb->recv_mr);
+ ib_dereg_mr(cb->rdma_mr);
+ if (!cb->server)
+ ib_dereg_mr(cb->start_mr);
+ }
+}
+
+static int krping_create_qp(struct krping_cb *cb)
+{
+ struct ib_qp_init_attr init_attr;
+ int ret;
+
+ memset(&init_attr, 0, sizeof(init_attr));
+ init_attr.cap.max_send_wr = cb->txdepth;
+ init_attr.cap.max_recv_wr = 2;
+ init_attr.cap.max_recv_sge = 1;
+ init_attr.cap.max_send_sge = 1;
+ init_attr.qp_type = IB_QPT_RC;
+ init_attr.send_cq = cb->cq;
+ init_attr.recv_cq = cb->cq;
+
+ if (cb->server) {
+ ret = rdma_create_qp(cb->child_cm_id, cb->pd, &init_attr);
+ if (!ret)
+ cb->qp = cb->child_cm_id->qp;
+ } else {
+ ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);
+ if (!ret)
+ cb->qp = cb->cm_id->qp;
+ }
+
+ return ret;
+}
+
+static void krping_free_qp(struct krping_cb *cb)
+{
+ ib_destroy_qp(cb->qp);
+ ib_destroy_cq(cb->cq);
+ ib_dealloc_pd(cb->pd);
+}
+
+static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
+{
+ int ret;
+ cb->pd = ib_alloc_pd(cm_id->device);
+ if (IS_ERR(cb->pd)) {
+ log(LOG_ERR, "ib_alloc_pd failed\n");
+ return PTR_ERR(cb->pd);
+ }
+ DEBUG_LOG(PFX "created pd %p\n", cb->pd);
+
+ cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
+ cb, cb->txdepth * 2, 0);
+ if (IS_ERR(cb->cq)) {
+ log(LOG_ERR, "ib_create_cq failed\n");
+ ret = PTR_ERR(cb->cq);
+ goto err1;
+ }
+ DEBUG_LOG(PFX "created cq %p\n", cb->cq);
+
+ if (!cb->wlat && !cb->rlat && !cb->bw) {
+ ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+			log(LOG_ERR, "ib_req_notify_cq failed\n");
+ goto err2;
+ }
+ }
+
+ ret = krping_create_qp(cb);
+ if (ret) {
+ log(LOG_ERR, "krping_create_qp failed: %d\n", ret);
+ goto err2;
+ }
+ DEBUG_LOG(PFX "created qp %p\n", cb->qp);
+ return 0;
+err2:
+ ib_destroy_cq(cb->cq);
+err1:
+ ib_dealloc_pd(cb->pd);
+ return ret;
+}
+
+static void krping_format_send(struct krping_cb *cb, u64 buf,
+ struct ib_mr *mr)
+{
+ struct krping_rdma_info *info = &cb->send_buf;
+
+ info->buf = htonll(buf);
+ info->rkey = htonl(mr->rkey);
+ info->size = htonl(cb->size);
+
+ DEBUG_LOG(PFX "RDMA addr %llx rkey %x len %d\n",
+ (unsigned long long)buf, mr->rkey, cb->size);
+}
+
+static void krping_test_server(struct krping_cb *cb)
+{
+ struct ib_send_wr *bad_wr;
+ int ret;
+
+ while (1) {
+ /* Wait for client's Start STAG/TO/Len */
+ krping_wait(cb, RDMA_READ_ADV);
+ if (cb->state != RDMA_READ_ADV) {
+ DEBUG_LOG(PFX "wait for RDMA_READ_ADV state %d\n",
+ cb->state);
+ break;
+ }
+
+		DEBUG_LOG(PFX "server received source adv\n");
+
+ /* Issue RDMA Read. */
+ cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
+ cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.sg_list->length = cb->remote_len;
+
+ ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ break;
+ }
+ DEBUG_LOG(PFX "server posted rdma read req \n");
+
+ /* Wait for read completion */
+ krping_wait(cb, RDMA_READ_COMPLETE);
+ if (cb->state != RDMA_READ_COMPLETE) {
+ log(LOG_ERR,
+ "wait for RDMA_READ_COMPLETE state %d\n",
+ cb->state);
+ break;
+ }
+ DEBUG_LOG(PFX "server received read complete\n");
+
+ /* Display data in recv buf */
+ if (cb->verbose)
+ DEBUG_LOG("server ping data: %s\n", cb->rdma_buf);
+
+ /* Tell client to continue */
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ break;
+ }
+ DEBUG_LOG(PFX "server posted go ahead\n");
+
+ /* Wait for client's RDMA STAG/TO/Len */
+ krping_wait(cb, RDMA_WRITE_ADV);
+ if (cb->state != RDMA_WRITE_ADV) {
+ log(LOG_ERR,
+ "wait for RDMA_WRITE_ADV state %d\n",
+ cb->state);
+ break;
+ }
+ DEBUG_LOG(PFX "server received sink adv\n");
+
+ /* RDMA Write echo data */
+ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
+ DEBUG_LOG(PFX "rdma write from lkey %x laddr %llx len %d\n",
+ cb->rdma_sq_wr.sg_list->lkey,
+ (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
+ cb->rdma_sq_wr.sg_list->length);
+
+ ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ break;
+ }
+
+ /* Wait for completion */
+ krping_wait(cb, RDMA_WRITE_COMPLETE);
+ if (cb->state != RDMA_WRITE_COMPLETE) {
+ log(LOG_ERR,
+ "wait for RDMA_WRITE_COMPLETE state %d\n",
+ cb->state);
+ break;
+ }
+ DEBUG_LOG(PFX "server rdma write complete \n");
+
+ cb->state = CONNECTED;
+
+ /* Tell client to begin again */
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ break;
+ }
+ DEBUG_LOG(PFX "server posted go ahead\n");
+ }
+}
+
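+/*
+ * Read latency test: post one RDMA READ of cb->size bytes at a time
+ * and wait for its completion, either polling the CQ directly (the
+ * "poll" option) or sleeping until the event handler runs.
+ */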
+static void rlat_test(struct krping_cb *cb)
+{
+ int scnt;
+ int iters = cb->count;
+ struct timeval start_tv, stop_tv;
+ int ret;
+ struct ib_wc wc;
+ struct ib_send_wr *bad_wr;
+ int ne;
+
+ scnt = 0;
+ cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
+ cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.sg_list->length = cb->size;
+
+ microtime(&start_tv);
+ if (!cb->poll) {
+ cb->state = RDMA_READ_ADV;
+ ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
+ }
+ while (scnt < iters) {
+
+ cb->state = RDMA_READ_ADV;
+ ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR,
+ "Couldn't post send: ret=%d scnt %d\n",
+ ret, scnt);
+ return;
+ }
+
+ do {
+ if (!cb->poll) {
+ krping_wait(cb, RDMA_READ_COMPLETE);
+ if (cb->state == RDMA_READ_COMPLETE) {
+ ne = 1;
+ ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
+ } else {
+ ne = -1;
+ }
+ } else
+ ne = ib_poll_cq(cb->cq, 1, &wc);
+ if (cb->state == ERROR) {
+ log(LOG_ERR,
+ "state == ERROR...bailing scnt %d\n", scnt);
+ return;
+ }
+ } while (ne == 0);
+
+ if (ne < 0) {
+ log(LOG_ERR, "poll CQ failed %d\n", ne);
+ return;
+ }
+ if (cb->poll && wc.status != IB_WC_SUCCESS) {
+			log(LOG_ERR, "Completion with error at %s:\n",
+ cb->server ? "server" : "client");
+ log(LOG_ERR, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ return;
+ }
+ ++scnt;
+ }
+ microtime(&stop_tv);
+
+ if (stop_tv.tv_usec < start_tv.tv_usec) {
+ stop_tv.tv_usec += 1000000;
+ stop_tv.tv_sec -= 1;
+ }
+
+ log(LOG_ERR, "delta sec %lu delta usec %lu iter %d size %d\n",
+ stop_tv.tv_sec - start_tv.tv_sec,
+ stop_tv.tv_usec - start_tv.tv_usec,
+ scnt, cb->size);
+}
+
+static int alloc_cycle_mem(int cycle_iters,
+ cycles_t **post_cycles_start,
+ cycles_t **post_cycles_stop,
+ cycles_t **poll_cycles_start,
+ cycles_t **poll_cycles_stop,
+ cycles_t **last_poll_cycles_start)
+{
+ *post_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
+ if (!*post_cycles_start) {
+ goto fail1;
+ }
+ *post_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
+ if (!*post_cycles_stop) {
+ goto fail2;
+ }
+ *poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
+ if (!*poll_cycles_start) {
+ goto fail3;
+ }
+ *poll_cycles_stop = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
+ if (!*poll_cycles_stop) {
+ goto fail4;
+ }
+ *last_poll_cycles_start = malloc(cycle_iters * sizeof(cycles_t), M_DEVBUF, M_WAITOK);
+ if (!*last_poll_cycles_start) {
+ goto fail5;
+ }
+ return 0;
+fail5:
+ free(*poll_cycles_stop, M_DEVBUF);
+fail4:
+ free(*poll_cycles_start, M_DEVBUF);
+fail3:
+ free(*post_cycles_stop, M_DEVBUF);
+fail2:
+ free(*post_cycles_start, M_DEVBUF);
+fail1:
+ log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
+ return ENOMEM;
+}
+
+static void free_cycle_mem(cycles_t *post_cycles_start,
+ cycles_t *post_cycles_stop,
+ cycles_t *poll_cycles_start,
+ cycles_t *poll_cycles_stop,
+ cycles_t *last_poll_cycles_start)
+{
+ free(last_poll_cycles_start, M_DEVBUF);
+ free(poll_cycles_stop, M_DEVBUF);
+ free(poll_cycles_start, M_DEVBUF);
+ free(post_cycles_stop, M_DEVBUF);
+ free(post_cycles_start, M_DEVBUF);
+}
+
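+/*
+ * Write latency test: spin until the peer's RDMA WRITE flips the
+ * polled byte in start_buf, then RDMA WRITE the next sequence byte
+ * back, recording TSC cycles for the post and poll phases.
+ */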
+static void wlat_test(struct krping_cb *cb)
+{
+	int ccnt, scnt, rcnt;
+	int iters = cb->count;
+	volatile char *poll_buf = (char *)cb->start_buf;
+	char *buf = (char *)cb->rdma_buf;
+	struct timeval start_tv, stop_tv;
+	cycles_t *post_cycles_start, *post_cycles_stop;
+	cycles_t *poll_cycles_start, *poll_cycles_stop;
+	cycles_t *last_poll_cycles_start;
+	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
+	int i;
+	int cycle_iters = 1000;
+	int err;
+
+	ccnt = 0;
+	scnt = 0;
+	rcnt = 0;
+
+ err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
+ &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
+
+ if (err) {
+ log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
+ return;
+ }
+
+ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.sg_list->length = cb->size;
+
+ if (cycle_iters > iters)
+ cycle_iters = iters;
+ microtime(&start_tv);
+ while (scnt < iters || ccnt < iters || rcnt < iters) {
+
+ /* Wait till buffer changes. */
+ if (rcnt < iters && !(scnt < 1 && !cb->server)) {
+ ++rcnt;
+ while (*poll_buf != (char)rcnt) {
+ if (cb->state == ERROR) {
+ log(LOG_ERR, "state = ERROR, bailing\n");
+ return;
+ }
+ }
+ }
+
+ if (scnt < iters) {
+ struct ib_send_wr *bad_wr;
+
+ *buf = (char)scnt+1;
+ if (scnt < cycle_iters)
+ post_cycles_start[scnt] = get_cycles();
+ if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
+ log(LOG_ERR, "Couldn't post send: scnt=%d\n",
+ scnt);
+ return;
+ }
+ if (scnt < cycle_iters)
+ post_cycles_stop[scnt] = get_cycles();
+ scnt++;
+ }
+
+ if (ccnt < iters) {
+ struct ib_wc wc;
+ int ne;
+
+ if (ccnt < cycle_iters)
+ poll_cycles_start[ccnt] = get_cycles();
+ do {
+ if (ccnt < cycle_iters)
+ last_poll_cycles_start[ccnt] = get_cycles();
+ ne = ib_poll_cq(cb->cq, 1, &wc);
+ } while (ne == 0);
+ if (ccnt < cycle_iters)
+ poll_cycles_stop[ccnt] = get_cycles();
+ ++ccnt;
+
+ if (ne < 0) {
+ log(LOG_ERR, "poll CQ failed %d\n", ne);
+ return;
+ }
+ if (wc.status != IB_WC_SUCCESS) {
+				log(LOG_ERR, "Completion with error at %s:\n",
+ cb->server ? "server" : "client");
+ log(LOG_ERR, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ log(LOG_ERR, "scnt=%d, rcnt=%d, ccnt=%d\n",
+ scnt, rcnt, ccnt);
+ return;
+ }
+ }
+ }
+ microtime(&stop_tv);
+
+ if (stop_tv.tv_usec < start_tv.tv_usec) {
+ stop_tv.tv_usec += 1000000;
+ stop_tv.tv_sec -= 1;
+ }
+
+ for (i=0; i < cycle_iters; i++) {
+ sum_post += post_cycles_stop[i] - post_cycles_start[i];
+ sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
+ sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
+ }
+
+ log(LOG_ERR, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
+ stop_tv.tv_sec - start_tv.tv_sec,
+ stop_tv.tv_usec - start_tv.tv_usec,
+ scnt, cb->size, cycle_iters,
+ (unsigned long long)sum_post, (unsigned long long)sum_poll,
+ (unsigned long long)sum_last_poll);
+
+ free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
+ poll_cycles_stop, last_poll_cycles_start);
+}
+
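+/*
+ * Bandwidth test: keep up to txdepth RDMA WRITEs of cb->size bytes
+ * outstanding, reap one completion at a time, and report elapsed time
+ * plus accumulated post/poll cycle counts.
+ */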
+static void bw_test(struct krping_cb *cb)
+{
+	int ccnt, scnt, rcnt;
+	int iters = cb->count;
+	struct timeval start_tv, stop_tv;
+	cycles_t *post_cycles_start, *post_cycles_stop;
+	cycles_t *poll_cycles_start, *poll_cycles_stop;
+	cycles_t *last_poll_cycles_start;
+	cycles_t sum_poll = 0, sum_post = 0, sum_last_poll = 0;
+	int i;
+	int cycle_iters = 1000;
+	int err;
+
+	ccnt = 0;
+	scnt = 0;
+	rcnt = 0;
+
+ err = alloc_cycle_mem(cycle_iters, &post_cycles_start, &post_cycles_stop,
+ &poll_cycles_start, &poll_cycles_stop, &last_poll_cycles_start);
+
+ if (err) {
+		log(LOG_ERR, "%s malloc failed\n", __FUNCTION__);
+ return;
+ }
+
+ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.sg_list->length = cb->size;
+
+ if (cycle_iters > iters)
+ cycle_iters = iters;
+ microtime(&start_tv);
+ while (scnt < iters || ccnt < iters) {
+
+ while (scnt < iters && scnt - ccnt < cb->txdepth) {
+ struct ib_send_wr *bad_wr;
+
+ if (scnt < cycle_iters)
+ post_cycles_start[scnt] = get_cycles();
+ if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
+ log(LOG_ERR, "Couldn't post send: scnt=%d\n",
+ scnt);
+ return;
+ }
+ if (scnt < cycle_iters)
+ post_cycles_stop[scnt] = get_cycles();
+ ++scnt;
+ }
+
+ if (ccnt < iters) {
+ int ne;
+ struct ib_wc wc;
+
+ if (ccnt < cycle_iters)
+ poll_cycles_start[ccnt] = get_cycles();
+ do {
+ if (ccnt < cycle_iters)
+ last_poll_cycles_start[ccnt] = get_cycles();
+ ne = ib_poll_cq(cb->cq, 1, &wc);
+ } while (ne == 0);
+ if (ccnt < cycle_iters)
+ poll_cycles_stop[ccnt] = get_cycles();
+ ccnt += 1;
+
+ if (ne < 0) {
+ log(LOG_ERR, "poll CQ failed %d\n", ne);
+ return;
+ }
+ if (wc.status != IB_WC_SUCCESS) {
+				log(LOG_ERR, "Completion with error at %s:\n",
+ cb->server ? "server" : "client");
+ log(LOG_ERR, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ return;
+ }
+ }
+ }
+ microtime(&stop_tv);
+
+ if (stop_tv.tv_usec < start_tv.tv_usec) {
+ stop_tv.tv_usec += 1000000;
+ stop_tv.tv_sec -= 1;
+ }
+
+ for (i=0; i < cycle_iters; i++) {
+ sum_post += post_cycles_stop[i] - post_cycles_start[i];
+ sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
+ sum_last_poll += poll_cycles_stop[i] - last_poll_cycles_start[i];
+ }
+
+ log(LOG_ERR, "delta sec %lu delta usec %lu iter %d size %d cycle_iters %d sum_post %llu sum_poll %llu sum_last_poll %llu\n",
+ stop_tv.tv_sec - start_tv.tv_sec,
+ stop_tv.tv_usec - start_tv.tv_usec,
+ scnt, cb->size, cycle_iters,
+ (unsigned long long)sum_post, (unsigned long long)sum_poll,
+ (unsigned long long)sum_last_poll);
+
+ free_cycle_mem(post_cycles_start, post_cycles_stop, poll_cycles_start,
+ poll_cycles_stop, last_poll_cycles_start);
+}
+
+static void krping_rlat_test_server(struct krping_cb *cb)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_wc wc;
+ int ret;
+
+ /* Spin waiting for client's Start STAG/TO/Len */
+ while (cb->state < RDMA_READ_ADV) {
+ krping_cq_event_handler(cb->cq, cb);
+ }
+
+ /* Send STAG/TO/Len to client */
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->start_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->start_addr, cb->start_mr);
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ return;
+ }
+
+ /* Spin waiting for send completion */
+	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
+ if (ret < 0) {
+ log(LOG_ERR, "poll error %d\n", ret);
+ return;
+ }
+ if (wc.status) {
+		log(LOG_ERR, "send completion error %d\n", wc.status);
+ return;
+ }
+
+ krping_wait(cb, ERROR);
+}
+
+static void krping_wlat_test_server(struct krping_cb *cb)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_wc wc;
+ int ret;
+
+ /* Spin waiting for client's Start STAG/TO/Len */
+ while (cb->state < RDMA_READ_ADV) {
+ krping_cq_event_handler(cb->cq, cb);
+ }
+
+ /* Send STAG/TO/Len to client */
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->start_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->start_addr, cb->start_mr);
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ return;
+ }
+
+ /* Spin waiting for send completion */
+	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
+ if (ret < 0) {
+ log(LOG_ERR, "poll error %d\n", ret);
+ return;
+ }
+ if (wc.status) {
+		log(LOG_ERR, "send completion error %d\n", wc.status);
+ return;
+ }
+
+ wlat_test(cb);
+
+}
+
+static void krping_bw_test_server(struct krping_cb *cb)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_wc wc;
+ int ret;
+
+ /* Spin waiting for client's Start STAG/TO/Len */
+ while (cb->state < RDMA_READ_ADV) {
+ krping_cq_event_handler(cb->cq, cb);
+ }
+
+ /* Send STAG/TO/Len to client */
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->start_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->start_addr, cb->start_mr);
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ return;
+ }
+
+ /* Spin waiting for send completion */
+	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
+ if (ret < 0) {
+ log(LOG_ERR, "poll error %d\n", ret);
+ return;
+ }
+ if (wc.status) {
+		log(LOG_ERR, "send completion error %d\n", wc.status);
+ return;
+ }
+
+ if (cb->duplex)
+ bw_test(cb);
+ krping_wait(cb, ERROR);
+}
+
+static int krping_bind_server(struct krping_cb *cb)
+{
+ struct sockaddr_in sin;
+ int ret;
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_len = sizeof sin;
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = cb->addr.s_addr;
+ sin.sin_port = cb->port;
+
+ ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
+ if (ret) {
+ log(LOG_ERR, "rdma_bind_addr error %d\n", ret);
+ return ret;
+ }
+ DEBUG_LOG(PFX "rdma_bind_addr successful\n");
+
+ DEBUG_LOG(PFX "rdma_listen\n");
+ ret = rdma_listen(cb->cm_id, 3);
+ if (ret) {
+ log(LOG_ERR, "rdma_listen failed: %d\n", ret);
+ return ret;
+ }
+
+ krping_wait(cb, CONNECT_REQUEST);
+ if (cb->state != CONNECT_REQUEST) {
+ log(LOG_ERR, "wait for CONNECT_REQUEST state %d\n",
+ cb->state);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void krping_run_server(struct krping_cb *cb)
+{
+ struct ib_recv_wr *bad_wr;
+ int ret;
+
+ ret = krping_bind_server(cb);
+ if (ret)
+ return;
+
+ ret = krping_setup_qp(cb, cb->child_cm_id);
+ if (ret) {
+ log(LOG_ERR, "setup_qp failed: %d\n", ret);
+ return;
+ }
+
+ ret = krping_setup_buffers(cb);
+ if (ret) {
+ log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
+ goto err1;
+ }
+
+ ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
+ goto err2;
+ }
+
+ ret = krping_accept(cb);
+ if (ret) {
+ log(LOG_ERR, "connect error %d\n", ret);
+ goto err2;
+ }
+
+ if (cb->wlat)
+ krping_wlat_test_server(cb);
+ else if (cb->rlat)
+ krping_rlat_test_server(cb);
+ else if (cb->bw)
+ krping_bw_test_server(cb);
+ else
+ krping_test_server(cb);
+
+ rdma_disconnect(cb->child_cm_id);
+ rdma_destroy_id(cb->child_cm_id);
+err2:
+ krping_free_buffers(cb);
+err1:
+ krping_free_qp(cb);
+}
+
+static void krping_test_client(struct krping_cb *cb)
+{
+ int ping, start, cc, i, ret;
+ struct ib_send_wr *bad_wr;
+ unsigned char c;
+
+ start = 65;
+ for (ping = 0; !cb->count || ping < cb->count; ping++) {
+ cb->state = RDMA_READ_ADV;
+
+ /* Put some ascii text in the buffer. */
+ cc = sprintf(cb->start_buf, "rdma-ping-%d: ", ping);
+ for (i = cc, c = start; i < cb->size; i++) {
+ cb->start_buf[i] = c;
+ c++;
+ if (c > 122)
+ c = 65;
+ }
+ start++;
+ if (start > 122)
+ start = 65;
+ cb->start_buf[cb->size - 1] = 0;
+
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->start_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->start_addr, cb->start_mr);
+
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ break;
+ }
+
+ /* Wait for server to ACK */
+ krping_wait(cb, RDMA_WRITE_ADV);
+ if (cb->state != RDMA_WRITE_ADV) {
+ log(LOG_ERR,
+ "wait for RDMA_WRITE_ADV state %d\n",
+ cb->state);
+ break;
+ }
+
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->rdma_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->rdma_addr, cb->rdma_mr);
+
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ break;
+ }
+
+ /* Wait for the server to say the RDMA Write is complete. */
+ krping_wait(cb, RDMA_WRITE_COMPLETE);
+ if (cb->state != RDMA_WRITE_COMPLETE) {
+ log(LOG_ERR,
+ "wait for RDMA_WRITE_COMPLETE state %d\n",
+ cb->state);
+ break;
+ }
+
+ if (cb->validate)
+ if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
+ log(LOG_ERR, "data mismatch!\n");
+ break;
+ }
+
+ if (cb->verbose)
+ DEBUG_LOG("ping data: %s\n", cb->rdma_buf);
+ }
+}
+
+static void krping_rlat_test_client(struct krping_cb *cb)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_wc wc;
+ int ret;
+
+ cb->state = RDMA_READ_ADV;
+
+	/* Send STAG/TO/Len to server */
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->start_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->start_addr, cb->rdma_mr);
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ return;
+ }
+
+ /* Spin waiting for send completion */
+	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
+ if (ret < 0) {
+ log(LOG_ERR, "poll error %d\n", ret);
+ return;
+ }
+ if (wc.status) {
+ log(LOG_ERR, "send completion error %d\n", wc.status);
+ return;
+ }
+
+ /* Spin waiting for server's Start STAG/TO/Len */
+ while (cb->state < RDMA_WRITE_ADV) {
+ krping_cq_event_handler(cb->cq, cb);
+ }
+
+#if 0
+{
+ int i;
+ struct timeval start, stop;
+ time_t sec;
+ suseconds_t usec;
+ unsigned long long elapsed;
+ struct ib_wc wc;
+ struct ib_send_wr *bad_wr;
+ int ne;
+
+ cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.sg_list->length = 0;
+ cb->rdma_sq_wr.num_sge = 0;
+
+ microtime(&start);
+ for (i=0; i < 100000; i++) {
+ if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
+ log(LOG_ERR, "Couldn't post send\n");
+ return;
+ }
+ do {
+ ne = ib_poll_cq(cb->cq, 1, &wc);
+ } while (ne == 0);
+ if (ne < 0) {
+ log(LOG_ERR, "poll CQ failed %d\n", ne);
+ return;
+ }
+ if (wc.status != IB_WC_SUCCESS) {
+			log(LOG_ERR, "Completion with error at %s:\n",
+ cb->server ? "server" : "client");
+ log(LOG_ERR, "Failed status %d: wr_id %d\n",
+ wc.status, (int) wc.wr_id);
+ return;
+ }
+ }
+ microtime(&stop);
+
+ if (stop.tv_usec < start.tv_usec) {
+ stop.tv_usec += 1000000;
+ stop.tv_sec -= 1;
+ }
+ sec = stop.tv_sec - start.tv_sec;
+ usec = stop.tv_usec - start.tv_usec;
+ elapsed = sec * 1000000 + usec;
+ log(LOG_ERR, "0B-write-lat iters 100000 usec %llu\n", elapsed);
+}
+#endif
+
+ rlat_test(cb);
+}
+
+static void krping_wlat_test_client(struct krping_cb *cb)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_wc wc;
+ int ret;
+
+ cb->state = RDMA_READ_ADV;
+
+	/* Send STAG/TO/Len to server */
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->start_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->start_addr, cb->start_mr);
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ return;
+ }
+
+ /* Spin waiting for send completion */
+	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
+ if (ret < 0) {
+ log(LOG_ERR, "poll error %d\n", ret);
+ return;
+ }
+ if (wc.status) {
+ log(LOG_ERR, "send completion error %d\n", wc.status);
+ return;
+ }
+
+ /* Spin waiting for server's Start STAG/TO/Len */
+ while (cb->state < RDMA_WRITE_ADV) {
+ krping_cq_event_handler(cb->cq, cb);
+ }
+
+ wlat_test(cb);
+}
+
+static void krping_bw_test_client(struct krping_cb *cb)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_wc wc;
+ int ret;
+
+ cb->state = RDMA_READ_ADV;
+
+	/* Send STAG/TO/Len to server */
+ if (cb->dma_mr)
+ krping_format_send(cb, cb->start_addr, cb->dma_mr);
+ else
+ krping_format_send(cb, cb->start_addr, cb->start_mr);
+ ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "post send error %d\n", ret);
+ return;
+ }
+
+ /* Spin waiting for send completion */
+	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 0);
+ if (ret < 0) {
+ log(LOG_ERR, "poll error %d\n", ret);
+ return;
+ }
+ if (wc.status) {
+ log(LOG_ERR, "send completion error %d\n", wc.status);
+ return;
+ }
+
+ /* Spin waiting for server's Start STAG/TO/Len */
+ while (cb->state < RDMA_WRITE_ADV) {
+ krping_cq_event_handler(cb->cq, cb);
+ }
+
+ bw_test(cb);
+}
+
+static int krping_connect_client(struct krping_cb *cb)
+{
+ struct rdma_conn_param conn_param;
+ int ret;
+
+ memset(&conn_param, 0, sizeof conn_param);
+ conn_param.responder_resources = 1;
+ conn_param.initiator_depth = 1;
+ conn_param.retry_count = 10;
+
+ ret = rdma_connect(cb->cm_id, &conn_param);
+ if (ret) {
+ log(LOG_ERR, "rdma_connect error %d\n", ret);
+ return ret;
+ }
+
+ krping_wait(cb, CONNECTED);
+ if (cb->state == ERROR) {
+ log(LOG_ERR, "wait for CONNECTED state %d\n", cb->state);
+ return -1;
+ }
+
+ DEBUG_LOG(PFX "rdma_connect successful\n");
+ return 0;
+}
+		/* start_mr is registered whenever start_buf is (see setup) */
+		if (!cb->server || cb->wlat || cb->rlat || cb->bw)
+			ib_dereg_mr(cb->start_mr);
+{
+ struct sockaddr_in sin;
+ int ret;
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_len = sizeof sin;
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = cb->addr.s_addr;
+ sin.sin_port = cb->port;
+
+ ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
+ 2000);
+ if (ret) {
+ log(LOG_ERR, "rdma_resolve_addr error %d\n", ret);
+ return ret;
+ }
+
+ krping_wait(cb, ROUTE_RESOLVED);
+ if (cb->state != ROUTE_RESOLVED) {
+ log(LOG_ERR,
+ "addr/route resolution did not resolve: state %d\n",
+ cb->state);
+ return EINTR;
+ }
+
+ DEBUG_LOG(PFX "rdma_resolve_addr - rdma_resolve_route successful\n");
+ return 0;
+}
+
+static void krping_run_client(struct krping_cb *cb)
+{
+ struct ib_recv_wr *bad_wr;
+ int ret;
+
+ ret = krping_bind_client(cb);
+ if (ret)
+ return;
+
+ ret = krping_setup_qp(cb, cb->cm_id);
+ if (ret) {
+ log(LOG_ERR, "setup_qp failed: %d\n", ret);
+ return;
+ }
+
+ ret = krping_setup_buffers(cb);
+ if (ret) {
+ log(LOG_ERR, "krping_setup_buffers failed: %d\n", ret);
+ goto err1;
+ }
+
+ ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
+ if (ret) {
+ log(LOG_ERR, "ib_post_recv failed: %d\n", ret);
+ goto err2;
+ }
+
+ ret = krping_connect_client(cb);
+ if (ret) {
+ log(LOG_ERR, "connect error %d\n", ret);
+ goto err2;
+ }
+
+ if (cb->wlat)
+ krping_wlat_test_client(cb);
+ else if (cb->rlat)
+ krping_rlat_test_client(cb);
+ else if (cb->bw)
+ krping_bw_test_client(cb);
+ else
+ krping_test_client(cb);
+ rdma_disconnect(cb->cm_id);
+err2:
+ krping_free_buffers(cb);
+err1:
+ krping_free_qp(cb);
+}
+
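+/*
+ * Parse a krping command string written to /dev/krping (for example
+ * "client,addr=192.168.69.142,port=9999,validate"), then run the
+ * requested test; the calling thread blocks until the test finishes.
+ */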
+int krping_doit(char *cmd)
+{
+ struct krping_cb *cb;
+ int op;
+ int ret = 0;
+ char *optarg;
+ unsigned long optint;
+ debug = 0;
+
+	cb = malloc(sizeof(*cb), M_DEVBUF, M_WAITOK | M_ZERO);
+
+ mtx_lock(&krping_mutex);
+ TAILQ_INSERT_TAIL(&krping_cbs, cb, list);
+ mtx_unlock(&krping_mutex);
+
+ cb->server = -1;
+ cb->state = IDLE;
+ cb->size = 64;
+ cb->txdepth = RPING_SQ_DEPTH;
+ mtx_init(&cb->lock, "krping mtx", NULL, MTX_DUPOK|MTX_DEF);
+
+ while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
+ &optint)) != 0) {
+ switch (op) {
+ case 'a':
+ cb->addr_str = optarg;
+ DEBUG_LOG(PFX "ipaddr (%s)\n", optarg);
+ if (!inet_aton(optarg, &cb->addr)) {
+ log(LOG_ERR, "bad addr string %s\n", optarg);
+ ret = EINVAL;
+ }
+ break;
+ case 'D':
+ cb->use_dmamr = 1;
+ DEBUG_LOG(PFX "using dma mr\n");
+ break;
+ case 'p':
+ cb->port = htons(optint);
+ DEBUG_LOG(PFX "port %d\n", (int)optint);
+ break;
+ case 'P':
+ cb->poll = 1;
+			DEBUG_LOG(PFX "poll\n");
+ break;
+ case 's':
+ cb->server = 1;
+ DEBUG_LOG(PFX "server\n");
+ break;
+ case 'c':
+ cb->server = 0;
+ DEBUG_LOG(PFX "client\n");
+ break;
+ case 'S':
+ cb->size = optint;
+ if ((cb->size < 1) ||
+ (cb->size > RPING_BUFSIZE)) {
+ log(LOG_ERR, "Invalid size %d "
+ "(valid range is 1 to %d)\n",
+ cb->size, RPING_BUFSIZE);
+ ret = EINVAL;
+ } else
+ DEBUG_LOG(PFX "size %d\n", (int)optint);
+ break;
+ case 'C':
+ cb->count = optint;
+ if (cb->count < 0) {
+ log(LOG_ERR, "Invalid count %d\n",
+ cb->count);
+ ret = EINVAL;
+ } else
+ DEBUG_LOG(PFX "count %d\n", (int) cb->count);
+ break;
+ case 'v':
+ cb->verbose++;
+ DEBUG_LOG(PFX "verbose\n");
+ break;
+ case 'V':
+ cb->validate++;
+ DEBUG_LOG(PFX "validate data\n");
+ break;
+ case 'L':
+ cb->rlat++;
+ break;
+ case 'l':
+ cb->wlat++;
+ break;
+ case 'B':
+ cb->bw++;
+ break;
+ case 't':
+ cb->txdepth = optint;
+ DEBUG_LOG(PFX "txdepth %d\n", cb->txdepth);
+ break;
+ case 'd':
+ debug++;
+ break;
+ default:
+ log(LOG_ERR, "unknown opt %s\n", optarg);
+ ret = EINVAL;
+ break;
+ }
+ }
+ if (ret)
+ goto out;
+
+ if (cb->server == -1) {
+ log(LOG_ERR, "must be either client or server\n");
+ ret = EINVAL;
+ goto out;
+ }
+ if ((cb->bw + cb->rlat + cb->wlat) > 1) {
+ log(LOG_ERR, "Pick only one test: bw, rlat, wlat\n");
+ ret = EINVAL;
+ goto out;
+ }
+
+
+ cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP);
+ if (IS_ERR(cb->cm_id)) {
+ ret = PTR_ERR(cb->cm_id);
+ log(LOG_ERR, "rdma_create_id error %d\n", ret);
+ goto out;
+ }
+ DEBUG_LOG(PFX "created cm_id %p\n", cb->cm_id);
+ if (cb->server)
+ krping_run_server(cb);
+ else
+ krping_run_client(cb);
+ DEBUG_LOG(PFX "destroy cm_id %p\n", cb->cm_id);
+ rdma_destroy_id(cb->cm_id);
+out:
+ mtx_lock(&krping_mutex);
+ TAILQ_REMOVE(&krping_cbs, cb, list);
+ mtx_unlock(&krping_mutex);
+ free(cb, M_DEVBUF);
+ return ret;
+}
+
+void krping_init(void)
+{
+ mtx_init(&krping_mutex, "krping lock", NULL, MTX_DEF);
+ TAILQ_INIT(&krping_cbs);
+}
diff --git a/sys/contrib/rdma/krping/krping.h b/sys/contrib/rdma/krping/krping.h
new file mode 100644
index 000000000000..8578e7e7979b
--- /dev/null
+++ b/sys/contrib/rdma/krping/krping.h
@@ -0,0 +1,128 @@
+/*
+ * $FreeBSD$
+ */
+#include <contrib/rdma/ib_verbs.h>
+#include <netinet/in.h>
+
+/*
+ * Krping header definitions.
+ */
+
+struct krping_stats {
+ unsigned send_bytes;
+ unsigned send_msgs;
+ unsigned recv_bytes;
+ unsigned recv_msgs;
+ unsigned write_bytes;
+ unsigned write_msgs;
+ unsigned read_bytes;
+ unsigned read_msgs;
+};
+
+/*
+ * These states are used to signal events between the completion handler
+ * and the main client or server thread.
+ *
+ * Once CONNECTED, they cycle through RDMA_READ_ADV, RDMA_WRITE_ADV,
+ * and RDMA_WRITE_COMPLETE for each ping.
+ */
+enum test_state {
+ IDLE = 1,
+ CONNECT_REQUEST,
+ ADDR_RESOLVED,
+ ROUTE_RESOLVED,
+ CONNECTED,
+ RDMA_READ_ADV,
+ RDMA_READ_COMPLETE,
+ RDMA_WRITE_ADV,
+ RDMA_WRITE_COMPLETE,
+ ERROR
+};
+
+struct krping_rdma_info {
+ uint64_t buf;
+ uint32_t rkey;
+ uint32_t size;
+};
+
+/*
+ * Control block struct.
+ */
+struct krping_cb {
+	int server;			/* 1 if server, 0 if client, -1 if unset */
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_qp *qp;
+ struct ib_mr *dma_mr;
+ int use_dmamr;
+
+ struct ib_recv_wr rq_wr; /* recv work request record */
+ struct ib_sge recv_sgl; /* recv single SGE */
+ struct krping_rdma_info recv_buf;/* malloc'd buffer */
+ struct ib_mr *recv_mr;
+
+	struct ib_send_wr sq_wr;	/* send work request record */
+ struct ib_sge send_sgl;
+ struct krping_rdma_info send_buf;/* single send buf */
+ struct ib_mr *send_mr;
+
+ struct ib_send_wr rdma_sq_wr; /* rdma work request record */
+ struct ib_sge rdma_sgl; /* rdma single SGE */
+ char *rdma_buf; /* used as rdma sink */
+ u64 rdma_addr;
+ struct ib_mr *rdma_mr;
+
+	uint32_t remote_rkey;		/* remote peer's RKEY */
+	uint64_t remote_addr;		/* remote peer's TO */
+	uint32_t remote_len;		/* remote peer's LEN */
+
+ char *start_buf; /* rdma read src */
+ u64 start_addr;
+ struct ib_mr *start_mr;
+
+ enum test_state state; /* used for cond/signalling */
+ struct mtx lock;
+ struct krping_stats stats;
+
+ uint16_t port; /* dst port in NBO */
+ struct in_addr addr; /* dst addr in NBO */
+ char *addr_str; /* dst addr string */
+ int verbose; /* verbose logging */
+ int count; /* ping count */
+ int size; /* ping data size */
+ int validate; /* validate ping data */
+
+ /* CM stuff */
+ struct rdma_cm_id *cm_id; /* connection on client side,*/
+					/* listener on server side. */
+ struct rdma_cm_id *child_cm_id; /* connection on server side */
+ TAILQ_ENTRY(krping_cb) list;
+
+ int rlat; /* run read latency test */
+ int wlat; /* run write latency test */
+ int bw; /* run write bw test */
+ int duplex; /* run write bw full duplex test */
+ int poll; /* poll vs block in rlat */
+ int txdepth;
+};
+
+static __inline uint64_t
+get_cycles(void)
+{
+ u_int32_t low, high;
+ __asm __volatile("rdtsc" : "=a" (low), "=d" (high));
+ return (low | ((u_int64_t)high << 32));
+}
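+
+/*
+ * Example (sketch): measuring an interval with get_cycles().  Converting
+ * the delta to wall-clock time requires the TSC frequency, which is not
+ * shown here.
+ *
+ *	cycles_t start, stop;
+ *
+ *	start = get_cycles();
+ *	... operation being timed ...
+ *	stop = get_cycles();
+ *	printf("elapsed %ju cycles\n", (uintmax_t)(stop - start));
+ */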
+
+#define htonll(x) htobe64((x))
+#define ntohll(x) be64toh((x))
+
+typedef uint64_t cycles_t;
+
+extern struct mtx krping_mutex;
+TAILQ_HEAD(krping_cb_list, krping_cb);
+extern struct krping_cb_list krping_cbs;
+
+int krping_doit(char *cmd);
+void krping_init(void);
diff --git a/sys/contrib/rdma/krping/krping_dev.c b/sys/contrib/rdma/krping/krping_dev.c
new file mode 100644
index 000000000000..448f19717578
--- /dev/null
+++ b/sys/contrib/rdma/krping/krping_dev.c
@@ -0,0 +1,180 @@
+/*
+ * This code lifted from:
+ * Simple `echo' pseudo-device KLD
+ * Murray Stokely
+ * Converted to 5.X by Søren (Xride) Straarup
+ */
+
+/*
+ * /bin/echo "server,port=9999,addr=192.168.69.142,validate" > /dev/krping
+ * /bin/echo "client,port=9999,addr=192.168.69.142,validate" > /dev/krping
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/module.h>
+#include <sys/systm.h> /* uprintf */
+#include <sys/errno.h>
+#include <sys/param.h> /* defines used in kernel.h */
+#include <sys/kernel.h> /* types used in module initialization */
+#include <sys/conf.h> /* cdevsw struct */
+#include <sys/uio.h> /* uio struct */
+#include <sys/malloc.h>
+
+#include "krping.h"
+
+#define BUFFERSIZE 512
+
+/* Function prototypes */
+static d_open_t krping_open;
+static d_close_t krping_close;
+static d_read_t krping_read;
+static d_write_t krping_write;
+
+/* Character device entry points */
+static struct cdevsw krping_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = krping_open,
+ .d_close = krping_close,
+ .d_read = krping_read,
+ .d_write = krping_write,
+ .d_name = "krping",
+};
+
+typedef struct s_krping {
+ char msg[BUFFERSIZE];
+ int len;
+} krping_t;
+
+/* vars */
+static struct cdev *krping_dev;
+
+static int
+krping_loader(struct module *m, int what, void *arg)
+{
+ int err = 0;
+
+ switch (what) {
+ case MOD_LOAD: /* kldload */
+ krping_init();
+ krping_dev = make_dev(&krping_cdevsw, 0, UID_ROOT, GID_WHEEL,
+ 0600, "krping");
+ printf("Krping device loaded.\n");
+ break;
+ case MOD_UNLOAD:
+ destroy_dev(krping_dev);
+ printf("Krping device unloaded.\n");
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+ return err;
+}
+
+static int
+krping_open(struct cdev *dev, int oflags, int devtype, struct thread *p)
+{
+ int err = 0;
+ return err;
+}
+
+static int
+krping_close(struct cdev *dev, int fflag, int devtype, struct thread *p)
+{
+ return 0;
+}
+
+static int
+krping_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct krping_cb *cb, *cb2;
+	int num = 1;
+ struct krping_cb_list copy_cbs;
+
+ uprintf("krping: %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
+ "num", "device", "snd bytes", "snd msgs", "rcv bytes",
+ "rcv msgs", "wr bytes", "wr msgs", "rd bytes", "rd msgs");
+ TAILQ_INIT(&copy_cbs);
+
+ mtx_lock(&krping_mutex);
+ TAILQ_FOREACH(cb, &krping_cbs, list) {
+ cb2 = malloc(sizeof(*cb), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (!cb2)
+ break;
+ bcopy(cb, cb2, sizeof(*cb));
+ TAILQ_INSERT_TAIL(&copy_cbs, cb2, list);
+ }
+ mtx_unlock(&krping_mutex);
+
+	while (!TAILQ_EMPTY(&copy_cbs)) {
+		cb = TAILQ_FIRST(&copy_cbs);
+ TAILQ_REMOVE(&copy_cbs, cb, list);
+ if (cb->pd) {
+ uprintf("krping: %4d %10s %10u %10u %10u %10u %10u %10u %10u %10u\n",
+ num++, cb->pd->device->name, cb->stats.send_bytes,
+ cb->stats.send_msgs, cb->stats.recv_bytes,
+ cb->stats.recv_msgs, cb->stats.write_bytes,
+ cb->stats.write_msgs,
+ cb->stats.read_bytes,
+ cb->stats.read_msgs);
+ } else {
+ uprintf("krping: %d listen\n", num++);
+ }
+ free(cb, M_DEVBUF);
+ }
+ return 0;
+}
+
+static int
+krping_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ int err = 0;
+ int amt;
+ int remain = BUFFERSIZE;
+ char *cp;
+ krping_t *krpingmsg;
+
+ krpingmsg = malloc(sizeof *krpingmsg, M_DEVBUF, M_WAITOK|M_ZERO);
+ if (!krpingmsg) {
+ uprintf("Could not malloc mem!\n");
+ return ENOMEM;
+ }
+
+ cp = krpingmsg->msg;
+ while (uio->uio_resid) {
+ amt = MIN(uio->uio_resid, remain);
+ if (amt == 0)
+ break;
+
+ /* Copy the string in from user memory to kernel memory */
+ err = uiomove(cp, amt, uio);
+		if (err) {
+			uprintf("Write failed: bad address!\n");
+			free(krpingmsg, M_DEVBUF);
+			return err;
+		}
+ cp += amt;
+ remain -= amt;
+ }
+
+	if (uio->uio_resid != 0) {
+		uprintf("Message too big. max size is %d!\n", BUFFERSIZE);
+		free(krpingmsg, M_DEVBUF);
+		return EMSGSIZE;
+	}
+
+ /* null terminate and remove the \n */
+ cp--;
+ *cp = 0;
+ krpingmsg->len = (unsigned long)(cp - krpingmsg->msg);
+ uprintf("krping: write string = |%s|\n", krpingmsg->msg);
+ err = krping_doit(krpingmsg->msg);
+ free(krpingmsg, M_DEVBUF);
+	return (err);
+}
+
+MODULE_DEPEND(krping, rdma_core, 1, 1, 1);
+MODULE_DEPEND(krping, rdma_cma, 1, 1, 1);
+DEV_MODULE(krping, krping_loader, NULL);
diff --git a/sys/contrib/rdma/rdma_addr.c b/sys/contrib/rdma/rdma_addr.c
new file mode 100644
index 000000000000..e052b807a7d9
--- /dev/null
+++ b/sys/contrib/rdma/rdma_addr.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ * copy of which is available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/condvar.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/module.h>
+
+#include <sys/lock.h>
+#include <sys/condvar.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_var.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+
+#include <net80211/ieee80211_freebsd.h>
+
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+
+#include <contrib/rdma/ib_addr.h>
+
+struct addr_req {
+ TAILQ_ENTRY(addr_req) entry;
+ struct sockaddr src_addr;
+ struct sockaddr dst_addr;
+ struct rdma_dev_addr *addr;
+ struct rdma_addr_client *client;
+ void *context;
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context);
+ unsigned long timeout;
+ int status;
+};
+
+static void process_req(void *ctx, int pending);
+
+static struct mtx lock;
+
+static TAILQ_HEAD(addr_req_list, addr_req) req_list;
+static struct task addr_task;
+static struct taskqueue *addr_taskq;
+static struct callout addr_ch;
+static eventhandler_tag route_event_tag;
+
+static void addr_timeout(void *arg)
+{
+ taskqueue_enqueue(addr_taskq, &addr_task);
+}
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+ mtx_init(&client->lock, "rdma_addr client lock", NULL, MTX_DUPOK|MTX_DEF);
+ cv_init(&client->comp, "rdma_addr cv");
+ client->refcount = 1;
+}
+
+static inline void put_client(struct rdma_addr_client *client)
+{
+ mtx_lock(&client->lock);
+ if (--client->refcount == 0) {
+ cv_broadcast(&client->comp);
+ }
+ mtx_unlock(&client->lock);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+ put_client(client);
+ mtx_lock(&client->lock);
+ if (client->refcount) {
+ cv_wait(&client->comp, &client->lock);
+ }
+ mtx_unlock(&client->lock);
+}
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
+ const unsigned char *dst_dev_addr)
+{
+ dev_addr->dev_type = RDMA_NODE_RNIC;
+ memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), MAX_ADDR_LEN);
+ memcpy(dev_addr->broadcast, dev->if_broadcastaddr, MAX_ADDR_LEN);
+ if (dst_dev_addr)
+ memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+ return 0;
+}
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+ struct ifaddr *ifa;
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ uint16_t port = sin->sin_port;
+
+ sin->sin_port = 0;
+ ifa = ifa_ifwithaddr(addr);
+ sin->sin_port = port;
+ if (!ifa)
+ return (EADDRNOTAVAIL);
+ return rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL);
+}
+
+static void queue_req(struct addr_req *req)
+{
+ struct addr_req *tmp_req = NULL;
+
+ mtx_lock(&lock);
+ TAILQ_FOREACH_REVERSE(tmp_req, &req_list, addr_req_list, entry)
+ if (time_after_eq(req->timeout, tmp_req->timeout))
+ break;
+
+ if (tmp_req)
+ TAILQ_INSERT_AFTER(&req_list, tmp_req, req, entry);
+ else
+ TAILQ_INSERT_TAIL(&req_list, req, entry);
+
+ if (TAILQ_FIRST(&req_list) == req)
+ callout_reset(&addr_ch, req->timeout - ticks, addr_timeout, NULL);
+ mtx_unlock(&lock);
+}
+
+#ifdef needed
+static void addr_send_arp(struct sockaddr_in *dst_in)
+{
+ struct route iproute;
+ struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst;
+ char dmac[ETHER_ADDR_LEN];
+
+ bzero(&iproute, sizeof iproute);
+ *dst = *dst_in;
+
+ rtalloc(&iproute);
+	if (iproute.ro_rt == NULL)
+		return;
+
+ arpresolve(iproute.ro_rt->rt_ifp, iproute.ro_rt, NULL,
+ rt_key(iproute.ro_rt), dmac);
+
+ RTFREE(iproute.ro_rt);
+}
+#endif
+
+static int addr_resolve_remote(struct sockaddr_in *src_in,
+ struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ int ret = 0;
+ struct route iproute;
+ struct sockaddr_in *dst = (struct sockaddr_in *)&iproute.ro_dst;
+ char dmac[ETHER_ADDR_LEN];
+
+ bzero(&iproute, sizeof iproute);
+ *dst = *dst_in;
+
+ rtalloc(&iproute);
+ if (iproute.ro_rt == NULL) {
+ ret = EHOSTUNREACH;
+ goto out;
+ }
+
+ /* If the device does ARP internally, return 'done' */
+ if (iproute.ro_rt->rt_ifp->if_flags & IFF_NOARP) {
+ rdma_copy_addr(addr, iproute.ro_rt->rt_ifp, NULL);
+ goto put;
+ }
+ ret = arpresolve(iproute.ro_rt->rt_ifp, iproute.ro_rt, NULL,
+ rt_key(iproute.ro_rt), dmac);
+ if (ret) {
+ goto put;
+ }
+
+ if (!src_in->sin_addr.s_addr) {
+ src_in->sin_len = sizeof *src_in;
+ src_in->sin_family = dst_in->sin_family;
+ src_in->sin_addr.s_addr = ((struct sockaddr_in *)iproute.ro_rt->rt_ifa->ifa_addr)->sin_addr.s_addr;
+ }
+
+ ret = rdma_copy_addr(addr, iproute.ro_rt->rt_ifp, dmac);
+put:
+ RTFREE(iproute.ro_rt);
+out:
+ return ret;
+}
+
+static void process_req(void *ctx, int pending)
+{
+ struct addr_req *req, *tmp_req;
+ struct sockaddr_in *src_in, *dst_in;
+ TAILQ_HEAD(, addr_req) done_list;
+
+ TAILQ_INIT(&done_list);
+
+ mtx_lock(&lock);
+ TAILQ_FOREACH_SAFE(req, &req_list, entry, tmp_req) {
+ if (req->status == EWOULDBLOCK) {
+ src_in = (struct sockaddr_in *) &req->src_addr;
+ dst_in = (struct sockaddr_in *) &req->dst_addr;
+ req->status = addr_resolve_remote(src_in, dst_in,
+ req->addr);
+ if (req->status && time_after_eq(ticks, req->timeout))
+ req->status = ETIMEDOUT;
+ else if (req->status == EWOULDBLOCK)
+ continue;
+ }
+ TAILQ_REMOVE(&req_list, req, entry);
+ TAILQ_INSERT_TAIL(&done_list, req, entry);
+ }
+
+ if (!TAILQ_EMPTY(&req_list)) {
+ req = TAILQ_FIRST(&req_list);
+ callout_reset(&addr_ch, req->timeout - ticks, addr_timeout,
+ NULL);
+ }
+ mtx_unlock(&lock);
+
+ TAILQ_FOREACH_SAFE(req, &done_list, entry, tmp_req) {
+ TAILQ_REMOVE(&done_list, req, entry);
+ req->callback(req->status, &req->src_addr, req->addr,
+ req->context);
+ put_client(req->client);
+ free(req, M_DEVBUF);
+ }
+}
+
+int rdma_resolve_ip(struct rdma_addr_client *client,
+ struct sockaddr *src_addr, struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context)
+{
+ struct sockaddr_in *src_in, *dst_in;
+ struct addr_req *req;
+ int ret = 0;
+
+ req = malloc(sizeof *req, M_DEVBUF, M_NOWAIT);
+ if (!req)
+ return (ENOMEM);
+ memset(req, 0, sizeof *req);
+
+ if (src_addr)
+ memcpy(&req->src_addr, src_addr, ip_addr_size(src_addr));
+ memcpy(&req->dst_addr, dst_addr, ip_addr_size(dst_addr));
+ req->addr = addr;
+ req->callback = callback;
+ req->context = context;
+ req->client = client;
+ mtx_lock(&client->lock);
+ client->refcount++;
+ mtx_unlock(&client->lock);
+
+ src_in = (struct sockaddr_in *) &req->src_addr;
+ dst_in = (struct sockaddr_in *) &req->dst_addr;
+
+ req->status = addr_resolve_remote(src_in, dst_in, addr);
+
+ switch (req->status) {
+ case 0:
+ req->timeout = ticks;
+ queue_req(req);
+ break;
+ case EWOULDBLOCK:
+ req->timeout = msecs_to_ticks(timeout_ms) + ticks;
+ queue_req(req);
+#ifdef needed
+ addr_send_arp(dst_in);
+#endif
+ break;
+ default:
+ ret = req->status;
+ mtx_lock(&client->lock);
+ client->refcount--;
+ mtx_unlock(&client->lock);
+ free(req, M_DEVBUF);
+ break;
+ }
+ return ret;
+}
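+
+/*
+ * Example (sketch): resolving a destination IP address asynchronously.
+ * The callback and context names are illustrative; the callback runs from
+ * the rdma_addr taskqueue once resolution completes, fails, or times out.
+ *
+ *	static void my_resolved(int status, struct sockaddr *src_addr,
+ *	    struct rdma_dev_addr *addr, void *context)
+ *	{
+ *		... status is 0 on success, else an errno value ...
+ *	}
+ *
+ *	ret = rdma_resolve_ip(&my_client, NULL, (struct sockaddr *)&dst,
+ *	    &dev_addr, 2000, my_resolved, my_ctx);
+ */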
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+ struct addr_req *req, *tmp_req;
+
+ mtx_lock(&lock);
+ TAILQ_FOREACH_SAFE(req, &req_list, entry, tmp_req) {
+ if (req->addr == addr) {
+ req->status = ECANCELED;
+ req->timeout = ticks;
+ TAILQ_REMOVE(&req_list, req, entry);
+ TAILQ_INSERT_HEAD(&req_list, req, entry);
+ callout_reset(&addr_ch, req->timeout - ticks, addr_timeout, NULL);
+ break;
+ }
+ }
+ mtx_unlock(&lock);
+}
+
+static void
+route_event_arp_update(void *unused, struct rtentry *rt0, uint8_t *enaddr,
+ struct sockaddr *sa)
+{
+ callout_stop(&addr_ch);
+ taskqueue_enqueue(addr_taskq, &addr_task);
+}
+
+static int addr_init(void)
+{
+ TAILQ_INIT(&req_list);
+ mtx_init(&lock, "rdma_addr req_list lock", NULL, MTX_DEF);
+
+ addr_taskq = taskqueue_create("rdma_addr_taskq", M_NOWAIT,
+ taskqueue_thread_enqueue, &addr_taskq);
+ if (addr_taskq == NULL) {
+ printf("failed to allocate rdma_addr taskqueue\n");
+ return (ENOMEM);
+ }
+ taskqueue_start_threads(&addr_taskq, 1, PI_NET, "rdma_addr taskq");
+ TASK_INIT(&addr_task, 0, process_req, NULL);
+
+ callout_init(&addr_ch, TRUE);
+
+ route_event_tag = EVENTHANDLER_REGISTER(route_arp_update_event,
+ route_event_arp_update, NULL, EVENTHANDLER_PRI_ANY);
+
+ return 0;
+}
+
+static void addr_cleanup(void)
+{
+	EVENTHANDLER_DEREGISTER(route_arp_update_event, route_event_tag);
+ callout_stop(&addr_ch);
+ taskqueue_drain(addr_taskq, &addr_task);
+ taskqueue_free(addr_taskq);
+}
+
+static int
+addr_load(module_t mod, int cmd, void *arg)
+{
+ int err = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ printf("Loading rdma_addr.\n");
+
+ addr_init();
+ break;
+ case MOD_QUIESCE:
+ break;
+ case MOD_UNLOAD:
+ printf("Unloading rdma_addr.\n");
+ addr_cleanup();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+
+ return (err);
+}
+
+static moduledata_t mod_data = {
+ "rdma_addr",
+ addr_load,
+ 0
+};
+
+MODULE_VERSION(rdma_addr, 1);
+DECLARE_MODULE(rdma_addr, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/contrib/rdma/rdma_cache.c b/sys/contrib/rdma/rdma_cache.c
new file mode 100644
index 000000000000..dced8ebd0110
--- /dev/null
+++ b/sys/contrib/rdma/rdma_cache.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: cache.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/taskqueue.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/syslog.h>
+
+
+#ifdef needed
+#include <sys/condvar.h>
+#include <sys/socket.h>
+#include <sys/condvar.h>
+#endif
+
+#include <contrib/rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[0];
+};
+
+struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+};
+
+struct ib_update_work {
+ struct task task;
+ struct ib_device *device;
+ u8 port_num;
+};
+
+static inline int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+static inline int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid)
+{
+ struct ib_gid_cache *cache;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ mtx_lock(&device->cache.lock);
+
+ cache = device->cache.gid_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *gid = cache->table[index];
+
+ mtx_unlock(&device->cache.lock);
+
+ return ret;
+}
+
+int ib_find_cached_gid(struct ib_device *device,
+ union ib_gid *gid,
+ u8 *port_num,
+ u16 *index)
+{
+ struct ib_gid_cache *cache;
+ int p, i;
+ int ret = -ENOENT;
+
+ *port_num = -1;
+ if (index)
+ *index = -1;
+
+ mtx_lock(&device->cache.lock);
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ cache = device->cache.gid_cache[p];
+ for (i = 0; i < cache->table_len; ++i) {
+ if (!memcmp(gid, &cache->table[i], 6)) { /* XXX */
+ *port_num = p + start_port(device);
+ if (index)
+ *index = i;
+ ret = 0;
+ goto found;
+ }
+ }
+ }
+found:
+ mtx_unlock(&device->cache.lock);
+
+ return ret;
+}
+
+int ib_get_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ int index,
+ u16 *pkey)
+{
+ struct ib_pkey_cache *cache;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ mtx_lock(&device->cache.lock);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *pkey = cache->table[index];
+
+ mtx_unlock(&device->cache.lock);
+
+ return ret;
+}
+
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ int i;
+ int ret = -ENOENT;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ mtx_lock(&device->cache.lock);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ *index = i;
+ ret = 0;
+ break;
+ }
+
+ mtx_unlock(&device->cache.lock);
+
+ return ret;
+}
+
+int ib_get_cached_lmc(struct ib_device *device,
+ u8 port_num,
+ u8 *lmc)
+{
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ mtx_lock(&device->cache.lock);
+ *lmc = device->cache.lmc_cache[port_num - start_port(device)];
+ mtx_unlock(&device->cache.lock);
+
+ return ret;
+}
+
+static void ib_cache_update(struct ib_device *device,
+ u8 port)
+{
+ struct ib_port_attr *tprops = NULL;
+ struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
+ struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
+ int i;
+ int ret;
+
+ tprops = malloc(sizeof *tprops, M_DEVBUF, M_NOWAIT);
+ if (!tprops)
+ return;
+
+ ret = ib_query_port(device, port, tprops);
+ if (ret) {
+ log(LOG_WARNING, "ib_query_port failed (%d) for %s\n",
+ ret, device->name);
+ goto err;
+ }
+
+ pkey_cache = malloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+ sizeof *pkey_cache->table, M_DEVBUF, M_NOWAIT);
+ if (!pkey_cache)
+ goto err;
+
+ pkey_cache->table_len = tprops->pkey_tbl_len;
+
+ gid_cache = malloc(sizeof *gid_cache + tprops->gid_tbl_len *
+ sizeof *gid_cache->table, M_DEVBUF, M_NOWAIT);
+ if (!gid_cache)
+ goto err;
+
+ gid_cache->table_len = tprops->gid_tbl_len;
+
+ for (i = 0; i < pkey_cache->table_len; ++i) {
+ ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+ if (ret) {
+ log(LOG_WARNING, "ib_query_pkey failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i, gid_cache->table + i);
+ if (ret) {
+ log(LOG_WARNING, "ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ mtx_lock(&device->cache.lock);
+
+ old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
+ old_gid_cache = device->cache.gid_cache [port - start_port(device)];
+
+ device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
+ device->cache.gid_cache [port - start_port(device)] = gid_cache;
+
+ device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
+
+ mtx_unlock(&device->cache.lock);
+
+ free(old_pkey_cache, M_DEVBUF);
+ free(old_gid_cache, M_DEVBUF);
+ free(tprops, M_DEVBUF);
+ return;
+
+err:
+ free(pkey_cache, M_DEVBUF);
+ free(gid_cache, M_DEVBUF);
+ free(tprops, M_DEVBUF);
+}
+
+static void ib_cache_task(void *context, int pending)
+{
+ struct ib_update_work *work = context;
+
+ ib_cache_update(work->device, work->port_num);
+ free(work, M_DEVBUF);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_update_work *work;
+
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER) {
+ work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT);
+ if (work) {
+ TASK_INIT(&work->task, 0, ib_cache_task, work);
+ work->device = event->device;
+ work->port_num = event->element.port_num;
+ taskqueue_enqueue(taskqueue_thread, &work->task);
+ }
+ }
+}
+
+static void ib_cache_setup_one(struct ib_device *device)
+{
+ int p;
+
+ mtx_init(&device->cache.lock, "ib device cache", NULL,
+ MTX_DUPOK|MTX_DEF);
+
+ device->cache.pkey_cache =
+ malloc(sizeof *device->cache.pkey_cache *
+ (end_port(device) - start_port(device) + 1), M_DEVBUF,
+ M_NOWAIT);
+ device->cache.gid_cache =
+ malloc(sizeof *device->cache.gid_cache *
+ (end_port(device) - start_port(device) + 1), M_DEVBUF,
+ M_NOWAIT);
+
+ device->cache.lmc_cache = malloc(sizeof *device->cache.lmc_cache *
+ (end_port(device) -
+ start_port(device) + 1),
+ M_DEVBUF, M_NOWAIT);
+
+ if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+ !device->cache.lmc_cache) {
+ log(LOG_WARNING, "Couldn't allocate cache "
+ "for %s\n", device->name);
+ goto err;
+ }
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ device->cache.pkey_cache[p] = NULL;
+ device->cache.gid_cache [p] = NULL;
+ ib_cache_update(device, p + start_port(device));
+ }
+
+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+ device, ib_cache_event);
+ if (ib_register_event_handler(&device->cache.event_handler))
+ goto err_cache;
+
+ return;
+
+err_cache:
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ free(device->cache.pkey_cache[p], M_DEVBUF);
+ free(device->cache.gid_cache[p], M_DEVBUF);
+ }
+
+err:
+ free(device->cache.pkey_cache, M_DEVBUF);
+ free(device->cache.gid_cache, M_DEVBUF);
+ free(device->cache.lmc_cache, M_DEVBUF);
+}
+
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+ int p;
+
+ ib_unregister_event_handler(&device->cache.event_handler);
+#ifdef XXX
+ flush_scheduled_work();
+#endif
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ free(device->cache.pkey_cache[p], M_DEVBUF);
+ free(device->cache.gid_cache[p], M_DEVBUF);
+ }
+
+ free(device->cache.pkey_cache, M_DEVBUF);
+ free(device->cache.gid_cache, M_DEVBUF);
+ free(device->cache.lmc_cache, M_DEVBUF);
+}
+
+static struct ib_client cache_client = {
+ .name = "cache",
+ .add = ib_cache_setup_one,
+ .remove = ib_cache_cleanup_one
+};
+
+int ib_cache_setup(void)
+{
+ return ib_register_client(&cache_client);
+}
+
+void ib_cache_cleanup(void)
+{
+ ib_unregister_client(&cache_client);
+}
diff --git a/sys/contrib/rdma/rdma_cm.h b/sys/contrib/rdma/rdma_cm.h
new file mode 100644
index 000000000000..1b30d04435d5
--- /dev/null
+++ b/sys/contrib/rdma/rdma_cm.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ * copy of which is available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * $FreeBSD$
+ */
+
+#if !defined(RDMA_CM_H)
+#define RDMA_CM_H
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <contrib/rdma/ib_addr.h>
+#include <contrib/rdma/ib_sa.h>
+
+/*
+ * Upon receiving a device removal event, users must destroy the associated
+ * RDMA identifier and release all resources allocated with the device.
+ */
+enum rdma_cm_event_type {
+ RDMA_CM_EVENT_ADDR_RESOLVED,
+ RDMA_CM_EVENT_ADDR_ERROR,
+ RDMA_CM_EVENT_ROUTE_RESOLVED,
+ RDMA_CM_EVENT_ROUTE_ERROR,
+ RDMA_CM_EVENT_CONNECT_REQUEST,
+ RDMA_CM_EVENT_CONNECT_RESPONSE,
+ RDMA_CM_EVENT_CONNECT_ERROR,
+ RDMA_CM_EVENT_UNREACHABLE,
+ RDMA_CM_EVENT_REJECTED,
+ RDMA_CM_EVENT_ESTABLISHED,
+ RDMA_CM_EVENT_DISCONNECTED,
+ RDMA_CM_EVENT_DEVICE_REMOVAL,
+ RDMA_CM_EVENT_MULTICAST_JOIN,
+ RDMA_CM_EVENT_MULTICAST_ERROR
+};
+
+enum rdma_port_space {
+ RDMA_PS_SDP = 0x0001,
+	RDMA_PS_IPOIB = 0x0002,
+ RDMA_PS_TCP = 0x0106,
+ RDMA_PS_UDP = 0x0111,
+ RDMA_PS_SCTP = 0x0183
+};
+
+struct rdma_addr {
+ struct sockaddr src_addr;
+ u8 src_pad[sizeof(struct sockaddr_in6) -
+ sizeof(struct sockaddr)];
+ struct sockaddr dst_addr;
+ u8 dst_pad[sizeof(struct sockaddr_in6) -
+ sizeof(struct sockaddr)];
+ struct rdma_dev_addr dev_addr;
+};
+
+struct rdma_route {
+ struct rdma_addr addr;
+ struct ib_sa_path_rec *path_rec;
+ int num_paths;
+};
+
+struct rdma_conn_param {
+ const void *private_data;
+ u8 private_data_len;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 flow_control;
+ u8 retry_count; /* ignored when accepting */
+ u8 rnr_retry_count;
+ /* Fields below ignored if a QP is created on the rdma_cm_id. */
+ u8 srq;
+ u32 qp_num;
+};
+
+struct rdma_ud_param {
+ const void *private_data;
+ u8 private_data_len;
+ struct ib_ah_attr ah_attr;
+ u32 qp_num;
+ u32 qkey;
+};
+
+struct rdma_cm_event {
+ enum rdma_cm_event_type event;
+ int status;
+ union {
+ struct rdma_conn_param conn;
+ struct rdma_ud_param ud;
+ } param;
+};
+
+struct rdma_cm_id;
+
+/**
+ * rdma_cm_event_handler - Callback used to report user events.
+ *
+ * Notes: Users may not call rdma_destroy_id from this callback to destroy
+ * the passed in id, or a corresponding listen id. Returning a
+ * non-zero value from the callback will destroy the passed in id.
+ */
+typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id,
+ struct rdma_cm_event *event);
+
+struct rdma_cm_id {
+ struct ib_device *device;
+ void *context;
+ struct ib_qp *qp;
+ rdma_cm_event_handler event_handler;
+ struct rdma_route route;
+ enum rdma_port_space ps;
+ u8 port_num;
+};
+
+/**
+ * rdma_create_id - Create an RDMA identifier.
+ *
+ * @event_handler: User callback invoked to report events associated with the
+ * returned rdma_id.
+ * @context: User specified context associated with the id.
+ * @ps: RDMA port space.
+ */
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+ void *context, enum rdma_port_space ps);
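+
+/*
+ * Example (sketch): creating an id in the TCP port space.  The handler
+ * and context names are illustrative.
+ *
+ *	static int my_handler(struct rdma_cm_id *id,
+ *	    struct rdma_cm_event *event)
+ *	{
+ *		... dispatch on event->event ...
+ *		return 0;
+ *	}
+ *
+ *	id = rdma_create_id(my_handler, my_ctx, RDMA_PS_TCP);
+ *	if (IS_ERR(id))
+ *		return (PTR_ERR(id));
+ */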
+
+/**
+ * rdma_destroy_id - Destroys an RDMA identifier.
+ *
+ * @id: RDMA identifier.
+ *
+ * Note: calling this function has the effect of canceling in-flight
+ * asynchronous operations associated with the id.
+ */
+void rdma_destroy_id(struct rdma_cm_id *id);
+
+/**
+ * rdma_bind_addr - Bind an RDMA identifier to a source address and
+ * associated RDMA device, if needed.
+ *
+ * @id: RDMA identifier.
+ * @addr: Local address information. Wildcard values are permitted.
+ *
+ * This associates a source address with the RDMA identifier before calling
+ * rdma_listen. If a specific local address is given, the RDMA identifier will
+ * be bound to a local RDMA device.
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_resolve_addr - Resolve destination and optional source addresses
+ * from IP addresses to an RDMA address. If successful, the specified
+ * rdma_cm_id will be bound to a local device.
+ *
+ * @id: RDMA identifier.
+ * @src_addr: Source address information. This parameter may be NULL.
+ * @dst_addr: Destination address information.
+ * @timeout_ms: Time to wait for resolution to complete.
+ */
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr, int timeout_ms);
+
+/**
+ * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier
+ * into route information needed to establish a connection.
+ *
+ * This is called on the client side of a connection.
+ * Users must have first called rdma_resolve_addr to resolve a dst_addr
+ * into an RDMA address before calling this routine.
+ */
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms);
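+
+/*
+ * Example (sketch): the active-side resolution sequence.  Both calls are
+ * asynchronous; completion is delivered to the id's event handler as
+ * RDMA_CM_EVENT_ADDR_RESOLVED and RDMA_CM_EVENT_ROUTE_RESOLVED (or the
+ * corresponding error events).  The timeout values are illustrative.
+ *
+ *	ret = rdma_resolve_addr(id, NULL, (struct sockaddr *)&dst, 2000);
+ *	...
+ *	(then, on RDMA_CM_EVENT_ADDR_RESOLVED)
+ *	ret = rdma_resolve_route(id, 2000);
+ */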
+
+/**
+ * rdma_create_qp - Allocate a QP and associate it with the specified RDMA
+ * identifier.
+ *
+ * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA
+ * through their states.
+ */
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA
+ * identifier.
+ *
+ * Users must destroy any QP associated with an RDMA identifier before
+ * destroying the RDMA ID.
+ */
+void rdma_destroy_qp(struct rdma_cm_id *id);
+
+/**
+ * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning
+ * to a specified QP state.
+ * @id: Communication identifier associated with the QP attributes to
+ * initialize.
+ * @qp_attr: On input, specifies the desired QP state. On output, the
+ * mandatory and desired optional attributes will be set in order to
+ * modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ * QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state. This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes. Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ *
+ * Users that wish to have their QP automatically transitioned through its
+ * states can associate a QP with the rdma_cm_id by calling rdma_create_qp().
+ */
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
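+
+/*
+ * Example (sketch): using the returned attributes to transition a QP to
+ * RTR by hand, mirroring what the CMA does internally for QPs it manages.
+ *
+ *	struct ib_qp_attr qp_attr;
+ *	int qp_attr_mask;
+ *
+ *	qp_attr.qp_state = IB_QPS_RTR;
+ *	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+ *	if (!ret)
+ *		ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+ */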
+
+/**
+ * rdma_connect - Initiate an active connection request.
+ * @id: Connection identifier to connect.
+ * @conn_param: Connection information used for connected QPs.
+ *
+ * Users must have resolved a route for the rdma_cm_id to connect with
+ * by having called rdma_resolve_route before calling this routine.
+ *
+ * This call will either connect to a remote QP or obtain remote QP
+ * information for unconnected rdma_cm_id's. The actual operation is
+ * based on the rdma_cm_id's port space.
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
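+
+/*
+ * Example (sketch): initiating the connection once the route is resolved.
+ * The parameter values are illustrative.
+ *
+ *	struct rdma_conn_param conn_param;
+ *
+ *	bzero(&conn_param, sizeof conn_param);
+ *	conn_param.responder_resources = 1;
+ *	conn_param.initiator_depth = 1;
+ *	conn_param.retry_count = 7;
+ *	ret = rdma_connect(id, &conn_param);
+ */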
+
+/**
+ * rdma_listen - This function is called by the passive side to
+ * listen for incoming connection requests.
+ *
+ * Users must have bound the rdma_cm_id to a local address by calling
+ * rdma_bind_addr before calling this routine.
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog);
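+
+/*
+ * Example (sketch): passive-side setup.  The port and backlog values are
+ * illustrative.
+ *
+ *	struct sockaddr_in sin;
+ *
+ *	bzero(&sin, sizeof sin);
+ *	sin.sin_len = sizeof sin;
+ *	sin.sin_family = AF_INET;
+ *	sin.sin_port = htons(9999);
+ *	ret = rdma_bind_addr(id, (struct sockaddr *)&sin);
+ *	if (!ret)
+ *		ret = rdma_listen(id, 3);
+ */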
+
+/**
+ * rdma_accept - Called to accept a connection request or response.
+ * @id: Connection identifier associated with the request.
+ * @conn_param: Information needed to establish the connection. This must be
+ * provided if accepting a connection request. If accepting a connection
+ * response, this parameter must be NULL.
+ *
+ * Typically, this routine is only called by the listener to accept a connection
+ * request. It must also be called on the active side of a connection if the
+ * user is performing their own QP transitions.
+ *
+ * In the case of error, a reject message is sent to the remote side and the
+ * state of the qp associated with the id is modified to error, such that any
+ * previously posted receive buffers would be flushed.
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
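+
+/*
+ * Example (sketch): accepting from the event handler when an
+ * RDMA_CM_EVENT_CONNECT_REQUEST arrives.  The handler is invoked with a
+ * new id for the incoming connection, not the listening id.  The
+ * parameter values are illustrative.
+ *
+ *	struct rdma_conn_param conn_param;
+ *
+ *	bzero(&conn_param, sizeof conn_param);
+ *	conn_param.responder_resources = 1;
+ *	conn_param.initiator_depth = 1;
+ *	ret = rdma_accept(id, &conn_param);
+ */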
+
+/**
+ * rdma_notify - Notifies the RDMA CM of an asynchronous event that has
+ * occurred on the connection.
+ * @id: Connection identifier to transition to established.
+ * @event: Asynchronous event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events. Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ * QP before an RTU has been received.
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event);
+
+/**
+ * rdma_reject - Called to reject a connection request or response.
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len);
+
+/**
+ * rdma_disconnect - This function disconnects the associated QP and
+ * transitions it into the error state.
+ */
+int rdma_disconnect(struct rdma_cm_id *id);
+
+/**
+ * rdma_join_multicast - Join the multicast group specified by the given
+ * address.
+ * @id: Communication identifier associated with the request.
+ * @addr: Multicast address identifying the group to join.
+ * @context: User-defined context associated with the join request, returned
+ * to the user through the private_data pointer in multicast events.
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ void *context);
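+
+/*
+ * Example (sketch): joining a multicast group on a UD port space id;
+ * completion is reported to the event handler as
+ * RDMA_CM_EVENT_MULTICAST_JOIN.
+ *
+ *	ret = rdma_join_multicast(id, (struct sockaddr *)&mcast_addr, my_ctx);
+ */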
+
+/**
+ * rdma_leave_multicast - Leave the multicast group specified by the given
+ * address.
+ */
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
+
+#endif /* RDMA_CM_H */
diff --git a/sys/contrib/rdma/rdma_cm_ib.h b/sys/contrib/rdma/rdma_cm_ib.h
new file mode 100644
index 000000000000..b69f66613bd9
--- /dev/null
+++ b/sys/contrib/rdma/rdma_cm_ib.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ * copy of which is available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * $FreeBSD$
+ */
+
+#if !defined(RDMA_CM_IB_H)
+#define RDMA_CM_IB_H
+
+#include <rdma/rdma_cm.h>
+
+/**
+ * rdma_set_ib_paths - Manually sets the path records used to establish a
+ * connection.
+ * @id: Connection identifier associated with the request.
+ * @path_rec: Reference to the path record
+ *
+ * This call permits a user to specify routing information for rdma_cm_id's
+ * bound to Infiniband devices. It is called on the client side of a
+ * connection and replaces the call to rdma_resolve_route.
+ */
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+ struct ib_sa_path_rec *path_rec, int num_paths);
+
+/* Global qkey for UDP QPs and multicast groups. */
+#define RDMA_UDP_QKEY 0x01234567
+
+#endif /* RDMA_CM_IB_H */
diff --git a/sys/contrib/rdma/rdma_cma.c b/sys/contrib/rdma/rdma_cma.c
new file mode 100644
index 000000000000..8dddf6ce157d
--- /dev/null
+++ b/sys/contrib/rdma/rdma_cma.c
@@ -0,0 +1,2998 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ * copy of which is available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/condvar.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/socket.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/priv.h>
+#include <sys/syslog.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+#include <contrib/rdma/rdma_cm.h>
+#include <contrib/rdma/ib_cache.h>
+#include <contrib/rdma/ib_cm.h>
+#include <contrib/rdma/ib_sa.h>
+#include <contrib/rdma/iw_cm.h>
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+static struct ib_client cma_client = {
+ .name = "cma",
+ .add = cma_add_one,
+ .remove = cma_remove_one
+};
+
+#ifdef IB_SUPPORTED
+static struct ib_sa_client sa_client;
+#endif
+static struct rdma_addr_client addr_client;
+static TAILQ_HEAD(, cma_device) dev_list;
+static LIST_HEAD(, rdma_id_private) listen_any_list;
+static struct mtx lock;
+static struct taskqueue *cma_wq;
+static DEFINE_KVL(sdp_ps);
+static DEFINE_KVL(tcp_ps);
+static DEFINE_KVL(udp_ps);
+static DEFINE_KVL(ipoib_ps);
+static int next_port;
+
+struct cma_device {
+ struct ib_device *device;
+ struct mtx lock;
+ struct cv comp;
+ int refcount;
+
+ LIST_HEAD(, rdma_id_private) id_list;
+ TAILQ_ENTRY(cma_device) list;
+};
+
+enum cma_state {
+ CMA_IDLE,
+ CMA_ADDR_QUERY,
+ CMA_ADDR_RESOLVED,
+ CMA_ROUTE_QUERY,
+ CMA_ROUTE_RESOLVED,
+ CMA_CONNECT,
+ CMA_DISCONNECT,
+ CMA_ADDR_BOUND,
+ CMA_LISTEN,
+ CMA_DEVICE_REMOVAL,
+ CMA_DESTROYING
+};
+
+struct rdma_bind_list {
+ struct kvl *ps;
+ TAILQ_HEAD(, rdma_id_private) owners;
+ unsigned short port;
+};
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+struct rdma_id_private {
+ struct rdma_cm_id id;
+
+ struct rdma_bind_list *bind_list;
+ struct socket *so;
+ TAILQ_ENTRY(rdma_id_private) node;
+ LIST_ENTRY(rdma_id_private) list; /* listen_any_list or cma_dev.list */
+ LIST_HEAD(, rdma_id_private) listen_list; /* per-device listens */
+ LIST_ENTRY(rdma_id_private) listen_entry;
+ struct cma_device *cma_dev;
+#ifdef IB_SUPPORTED
+ LIST_HEAD(, cma_multicast) mc_list;
+#endif
+ enum cma_state state;
+ struct mtx lock;
+ struct cv comp;
+ int refcount;
+ struct cv wait_remove;
+ int dev_remove;
+
+ int backlog;
+ int timeout_ms;
+ struct ib_sa_query *query;
+ int query_id;
+ union {
+ struct ib_cm_id *ib;
+ struct iw_cm_id *iw;
+ } cm_id;
+
+ u32 seq_num;
+ u32 qkey;
+ u32 qp_num;
+ u8 srq;
+};
+
+#ifdef IB_SUPPORTED
+struct cma_multicast {
+ struct rdma_id_private *id_priv;
+ union {
+ struct ib_sa_multicast *ib;
+ } multicast;
+ struct list_head list;
+ void *context;
+ struct sockaddr addr;
+ u8 pad[sizeof(struct sockaddr_in6) -
+ sizeof(struct sockaddr)];
+};
+#endif
+
+struct cma_work {
+ struct task task;
+ struct rdma_id_private *id;
+ enum cma_state old_state;
+ enum cma_state new_state;
+ struct rdma_cm_event event;
+};
+
+union cma_ip_addr {
+ struct in6_addr ip6;
+ struct {
+ __u32 pad[3];
+ __u32 addr;
+ } ip4;
+};
+
+struct cma_hdr {
+ u8 cma_version;
+ u8 ip_version; /* IP version: 7:4 */
+ __u16 port;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+};
+
+struct sdp_hh {
+ u8 bsdh[16];
+ u8 sdp_version; /* Major version: 7:4 */
+ u8 ip_version; /* IP version: 7:4 */
+ u8 sdp_specific1[10];
+ __u16 port;
+ __u16 sdp_specific2;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+};
+
+struct sdp_hah {
+ u8 bsdh[16];
+ u8 sdp_version;
+};
+
+#define CMA_VERSION 0x00
+#define SDP_MAJ_VERSION 0x2
+
+static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
+{
+ int ret;
+
+ mtx_lock(&id_priv->lock);
+ ret = (id_priv->state == comp);
+ mtx_unlock(&id_priv->lock);
+ return ret;
+}
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+ enum cma_state comp, enum cma_state exch)
+{
+ int ret;
+
+ mtx_lock(&id_priv->lock);
+ if ((ret = (id_priv->state == comp)))
+ id_priv->state = exch;
+ mtx_unlock(&id_priv->lock);
+ return ret;
+}
+
+static enum cma_state cma_exch(struct rdma_id_private *id_priv,
+ enum cma_state exch)
+{
+ enum cma_state old;
+
+ mtx_lock(&id_priv->lock);
+ old = id_priv->state;
+ id_priv->state = exch;
+ mtx_unlock(&id_priv->lock);
+ return old;
+}
+
+static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+{
+ return hdr->ip_version >> 4;
+}
+
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+ hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
+
+static inline u8 sdp_get_majv(u8 sdp_version)
+{
+ return sdp_version >> 4;
+}
+
+static inline u8 sdp_get_ip_ver(struct sdp_hh *hh)
+{
+ return hh->ip_version >> 4;
+}
+
+static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
+{
+ hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
+}
+
+static inline int cma_is_ud_ps(enum rdma_port_space ps)
+{
+ return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ mtx_lock(&cma_dev->lock);
+ cma_dev->refcount++;
+ mtx_unlock(&cma_dev->lock);
+ id_priv->cma_dev = cma_dev;
+ id_priv->id.device = cma_dev->device;
+ LIST_INSERT_HEAD(&cma_dev->id_list, id_priv, list);
+}
+
+static inline void cma_deref_dev(struct cma_device *cma_dev)
+{
+ mtx_lock(&cma_dev->lock);
+ if (--cma_dev->refcount == 0)
+ cv_broadcast(&cma_dev->comp);
+ mtx_unlock(&cma_dev->lock);
+}
+
+static void cma_detach_from_dev(struct rdma_id_private *id_priv)
+{
+ LIST_REMOVE(id_priv, list);
+ cma_deref_dev(id_priv->cma_dev);
+ id_priv->cma_dev = NULL;
+}
+
+#ifdef IB_SUPPORTED
+static int cma_set_qkey(struct ib_device *device, u8 port_num,
+ enum rdma_port_space ps,
+ struct rdma_dev_addr *dev_addr, u32 *qkey)
+{
+ struct ib_sa_mcmember_rec rec;
+ int ret = 0;
+
+ switch (ps) {
+ case RDMA_PS_UDP:
+ *qkey = RDMA_UDP_QKEY;
+ break;
+ case RDMA_PS_IPOIB:
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(device, port_num, &rec.mgid, &rec);
+ *qkey = be32_to_cpu(rec.qkey);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+#endif
+
+static int cma_acquire_dev(struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct cma_device *cma_dev;
+ union ib_gid gid;
+ int ret = ENODEV;
+
+ switch (rdma_node_get_transport(dev_addr->dev_type)) {
+#ifdef IB_SUPPORTED
+ case RDMA_TRANSPORT_IB:
+ ib_addr_get_sgid(dev_addr, &gid);
+ break;
+#endif
+ case RDMA_TRANSPORT_IWARP:
+ iw_addr_get_sgid(dev_addr, &gid);
+ break;
+ default:
+ return (ENODEV);
+ }
+
+ TAILQ_FOREACH(cma_dev, &dev_list, list) {
+ ret = ib_find_cached_gid(cma_dev->device, &gid,
+ &id_priv->id.port_num, NULL);
+ if (!ret) {
+#ifdef IB_SUPPORTED
+ ret = cma_set_qkey(cma_dev->device,
+ id_priv->id.port_num,
+ id_priv->id.ps, dev_addr,
+ &id_priv->qkey);
+ if (!ret)
+#endif
+ cma_attach_to_dev(id_priv, cma_dev);
+ break;
+ }
+ }
+ return ret;
+}
+
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+ mtx_lock(&id_priv->lock);
+ if (--id_priv->refcount == 0) {
+ cv_broadcast(&id_priv->comp);
+ }
+ mtx_unlock(&id_priv->lock);
+}
+
+static int cma_disable_remove(struct rdma_id_private *id_priv,
+ enum cma_state state)
+{
+ int ret;
+
+ mtx_lock(&id_priv->lock);
+ if (id_priv->state == state) {
+ id_priv->dev_remove++;
+ ret = 0;
+ } else
+ ret = EINVAL;
+ mtx_unlock(&id_priv->lock);
+ return ret;
+}
+
+static void cma_enable_remove(struct rdma_id_private *id_priv)
+{
+ mtx_lock(&id_priv->lock);
+ if (--id_priv->dev_remove == 0)
+ cv_broadcast(&id_priv->wait_remove);
+ mtx_unlock(&id_priv->lock);
+}
+
+static int cma_has_cm_dev(struct rdma_id_private *id_priv)
+{
+ return (id_priv->id.device && id_priv->cm_id.ib);
+}
+
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+ void *context, enum rdma_port_space ps)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = malloc(sizeof *id_priv, M_DEVBUF, M_NOWAIT);
+ if (!id_priv)
+ return ERR_PTR(-ENOMEM);
+ bzero(id_priv, sizeof *id_priv);
+
+ id_priv->state = CMA_IDLE;
+ id_priv->id.context = context;
+ id_priv->id.event_handler = event_handler;
+ id_priv->id.ps = ps;
+ mtx_init(&id_priv->lock, "rdma_cm_id_priv", NULL, MTX_DUPOK|MTX_DEF);
+ cv_init(&id_priv->comp, "rdma_cm_id_priv");
+ id_priv->refcount = 1;
+ cv_init(&id_priv->wait_remove, "id priv wait remove");
+ LIST_INIT(&id_priv->listen_list);
+ arc4rand(&id_priv->seq_num, sizeof id_priv->seq_num, 0);
+
+ return &id_priv->id;
+}
+
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+ return ret;
+}
+
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct rdma_id_private *id_priv;
+ struct ib_qp *qp;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id->device != pd->device)
+ return (EINVAL);
+
+ qp = ib_create_qp(pd, qp_init_attr);
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
+ if (cma_is_ud_ps(id_priv->id.ps))
+ ret = cma_init_ud_qp(id_priv, qp);
+ else
+ ret = cma_init_conn_qp(id_priv, qp);
+ if (ret)
+ goto err;
+
+ id->qp = qp;
+ id_priv->qp_num = qp->qp_num;
+ id_priv->srq = (qp->srq != NULL);
+ return 0;
+err:
+ ib_destroy_qp(qp);
+ return ret;
+}
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+ ib_destroy_qp(id->qp);
+}
+
+static int cma_modify_qp_rtr(struct rdma_cm_id *id)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ if (!id->qp)
+ return 0;
+
+ /* Need to update QP attributes from default values. */
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+}
+
+#ifdef IB_SUPPORTED
+static int cma_modify_qp_rts(struct rdma_cm_id *id)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ if (!id->qp)
+ return 0;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+}
+#endif
+
+static int cma_modify_qp_err(struct rdma_cm_id *id)
+{
+ struct ib_qp_attr qp_attr;
+
+ if (!id->qp)
+ return 0;
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
+}
+
+#ifdef IB_SUPPORTED
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int ret;
+
+ ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+ ib_addr_get_pkey(dev_addr),
+ &qp_attr->pkey_index);
+ if (ret)
+ return ret;
+
+ qp_attr->port_num = id_priv->id.port_num;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ if (cma_is_ud_ps(id_priv->id.ps)) {
+ qp_attr->qkey = id_priv->qkey;
+ *qp_attr_mask |= IB_QP_QKEY;
+ } else {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+ }
+ return 0;
+}
+#endif
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct rdma_id_private *id_priv;
+ int ret = 0;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
+ ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
+ else
+ ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+ qp_attr_mask);
+ if (qp_attr->qp_state == IB_QPS_RTR)
+ qp_attr->rq_psn = id_priv->seq_num;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ if (!id_priv->cm_id.iw) {
+ qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ } else
+ ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
+ qp_attr_mask);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ ret = ENOSYS;
+ break;
+ }
+#endif
+
+ return ret;
+}
+
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+ struct in6_addr *ip6;
+
+ if (addr->sa_family == AF_INET)
+ return in_nullhost(((struct sockaddr_in *) addr)->sin_addr);
+ else {
+ ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
+ return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
+ ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0;
+ }
+}
+
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+	return ((struct sockaddr_in *) addr)->sin_addr.s_addr ==
+	    htonl(INADDR_LOOPBACK);
+}
+
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+ return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static inline __be16 cma_port(struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_INET)
+ return ((struct sockaddr_in *) addr)->sin_port;
+ else
+ return ((struct sockaddr_in6 *) addr)->sin6_port;
+}
+
+static inline int cma_any_port(struct sockaddr *addr)
+{
+ return !cma_port(addr);
+}
+
+#ifdef IB_SUPPORTED
+static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
+ u8 *ip_ver, __u16 *port,
+ union cma_ip_addr **src, union cma_ip_addr **dst)
+{
+ switch (ps) {
+ case RDMA_PS_SDP:
+ if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) !=
+ SDP_MAJ_VERSION)
+ return (EINVAL);
+
+ *ip_ver = sdp_get_ip_ver(hdr);
+ *port = ((struct sdp_hh *) hdr)->port;
+ *src = &((struct sdp_hh *) hdr)->src_addr;
+ *dst = &((struct sdp_hh *) hdr)->dst_addr;
+ break;
+ default:
+ if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION)
+ return (EINVAL);
+
+ *ip_ver = cma_get_ip_ver(hdr);
+ *port = ((struct cma_hdr *) hdr)->port;
+ *src = &((struct cma_hdr *) hdr)->src_addr;
+ *dst = &((struct cma_hdr *) hdr)->dst_addr;
+ break;
+ }
+
+ if (*ip_ver != 4 && *ip_ver != 6)
+ return (EINVAL);
+ return 0;
+}
+
+static void cma_save_net_info(struct rdma_addr *addr,
+ struct rdma_addr *listen_addr,
+ u8 ip_ver, __u16 port,
+ union cma_ip_addr *src, union cma_ip_addr *dst)
+{
+ struct sockaddr_in *listen4, *ip4;
+ struct sockaddr_in6 *listen6, *ip6;
+
+ switch (ip_ver) {
+ case 4:
+ listen4 = (struct sockaddr_in *) &listen_addr->src_addr;
+ ip4 = (struct sockaddr_in *) &addr->src_addr;
+ ip4->sin_family = listen4->sin_family;
+ ip4->sin_addr.s_addr = dst->ip4.addr;
+ ip4->sin_port = listen4->sin_port;
+
+ ip4 = (struct sockaddr_in *) &addr->dst_addr;
+ ip4->sin_family = listen4->sin_family;
+ ip4->sin_addr.s_addr = src->ip4.addr;
+ ip4->sin_port = port;
+ break;
+ case 6:
+ listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr;
+ ip6 = (struct sockaddr_in6 *) &addr->src_addr;
+ ip6->sin6_family = listen6->sin6_family;
+ ip6->sin6_addr = dst->ip6;
+ ip6->sin6_port = listen6->sin6_port;
+
+ ip6 = (struct sockaddr_in6 *) &addr->dst_addr;
+ ip6->sin6_family = listen6->sin6_family;
+ ip6->sin6_addr = src->ip6;
+ ip6->sin6_port = port;
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
+static inline int cma_user_data_offset(enum rdma_port_space ps)
+{
+ switch (ps) {
+ case RDMA_PS_SDP:
+ return 0;
+ default:
+ return sizeof(struct cma_hdr);
+ }
+}
+
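+/*
+ * For non-SDP port spaces the first sizeof(struct cma_hdr) bytes of the
+ * IB CM private data carry the CMA header and the consumer's payload
+ * follows it, so a receiver recovers its data as (sketch):
+ *
+ *	int off = cma_user_data_offset(listen_id->id.ps);
+ *	void *data = ib_event->private_data + off;
+ *	int len = IB_CM_REQ_PRIVATE_DATA_SIZE - off;
+ *
+ * which is exactly the arithmetic cma_req_handler() performs below.
+ */
+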
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id_priv->query)
+ ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+ break;
+ default:
+ break;
+ }
+#endif
+}
+
+static inline int cma_internal_listen(struct rdma_id_private *id_priv)
+{
+ return (id_priv->state == CMA_LISTEN) && id_priv->cma_dev &&
+ cma_any_addr(&id_priv->id.route.addr.src_addr);
+}
+
+static void cma_destroy_listen(struct rdma_id_private *id_priv)
+{
+ cma_exch(id_priv, CMA_DESTROYING);
+
+ if (id_priv->cma_dev) {
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ break;
+ }
+#endif
+ cma_detach_from_dev(id_priv);
+ }
+ LIST_REMOVE(id_priv, listen_entry);
+
+ cma_deref_id(id_priv);
+ mtx_lock(&id_priv->lock);
+ while (id_priv->refcount)
+ cv_wait(&id_priv->comp, &id_priv->lock);
+ mtx_unlock(&id_priv->lock);
+
+ free(id_priv, M_DEVBUF);
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *dev_id_priv;
+
+ mtx_lock(&lock);
+ LIST_REMOVE(id_priv, list);
+
+ while (!LIST_EMPTY(&id_priv->listen_list)) {
+ dev_id_priv = LIST_FIRST(&id_priv->listen_list);
+ cma_destroy_listen(dev_id_priv);
+ }
+ mtx_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+ enum cma_state state)
+{
+ switch (state) {
+ case CMA_ADDR_QUERY:
+ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+ break;
+ case CMA_ROUTE_QUERY:
+ cma_cancel_route(id_priv);
+ break;
+ case CMA_LISTEN:
+ if (cma_any_addr(&id_priv->id.route.addr.src_addr) &&
+ !id_priv->cma_dev)
+ cma_cancel_listens(id_priv);
+ break;
+ default:
+ break;
+ }
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+
+ if (!bind_list)
+ return;
+
+ mtx_lock(&lock);
+ TAILQ_REMOVE(&bind_list->owners, id_priv, node);
+ if (TAILQ_EMPTY(&bind_list->owners)) {
+ kvl_delete(bind_list->ps, bind_list->port);
+ free(bind_list, M_DEVBUF);
+ }
+ mtx_unlock(&lock);
+ if (id_priv->so)
+ soclose(id_priv->so);
+}
+
+#ifdef IB_SUPPORTED
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+ struct cma_multicast *mc;
+
+ while (!LIST_EMPTY(&id_priv->mc_list)) {
+ mc = LIST_FIRST(&id_priv->mc_list);
+ LIST_REMOVE(mc, list);
+ ib_sa_free_multicast(mc->multicast.ib);
+ free(mc, M_DEVBUF);
+ }
+}
+#endif
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ enum cma_state state;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ state = cma_exch(id_priv, CMA_DESTROYING);
+ cma_cancel_operation(id_priv, state);
+
+ mtx_lock(&lock);
+ if (id_priv->cma_dev) {
+ mtx_unlock(&lock);
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ break;
+ }
+ cma_leave_mc_groups(id_priv);
+#endif
+ mtx_lock(&lock);
+ cma_detach_from_dev(id_priv);
+ }
+ mtx_unlock(&lock);
+ cma_release_port(id_priv);
+ cma_deref_id(id_priv);
+ mtx_lock(&id_priv->lock);
+ PANIC_IF(id_priv->refcount < 0);
+ while (id_priv->refcount)
+ cv_wait(&id_priv->comp, &id_priv->lock);
+ mtx_unlock(&id_priv->lock);
+ free(id_priv->id.route.path_rec, M_DEVBUF);
+ free(id_priv, M_DEVBUF);
+}
+
+#ifdef IB_SUPPORTED
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+ int ret;
+
+ ret = cma_modify_qp_rtr(&id_priv->id);
+ if (ret)
+ goto reject;
+
+ ret = cma_modify_qp_rts(&id_priv->id);
+ if (ret)
+ goto reject;
+
+ ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(&id_priv->id);
+ ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ return ret;
+}
+
+static int cma_verify_rep(struct rdma_id_private *id_priv, void *data)
+{
+ if (id_priv->id.ps == RDMA_PS_SDP &&
+ sdp_get_majv(((struct sdp_hah *) data)->sdp_version) !=
+ SDP_MAJ_VERSION)
+ return (EINVAL);
+
+ return 0;
+}
+
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+ struct ib_cm_rep_event_param *rep_data,
+ void *private_data)
+{
+ event->param.conn.private_data = private_data;
+ event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ event->param.conn.responder_resources = rep_data->responder_resources;
+ event->param.conn.initiator_depth = rep_data->initiator_depth;
+ event->param.conn.flow_control = rep_data->flow_control;
+ event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
+ event->param.conn.srq = rep_data->srq;
+ event->param.conn.qp_num = rep_data->remote_qpn;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event;
+ int ret = 0;
+
+ if (cma_disable_remove(id_priv, CMA_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (ib_event->event) {
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REP_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = ETIMEDOUT;
+ break;
+ case IB_CM_REP_RECEIVED:
+ event.status = cma_verify_rep(id_priv, ib_event->private_data);
+ if (event.status)
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) {
+ event.status = cma_rep_recv(id_priv);
+ event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+ RDMA_CM_EVENT_ESTABLISHED;
+ } else
+ event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+ cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+ ib_event->private_data);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ case IB_CM_USER_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case IB_CM_DREQ_ERROR:
+ event.status = ETIMEDOUT; /* fall through */
+ case IB_CM_DREQ_RECEIVED:
+ case IB_CM_DREP_RECEIVED:
+ if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
+ goto out;
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ case IB_CM_MRA_RECEIVED:
+ /* ignore event */
+ goto out;
+ case IB_CM_REJ_RECEIVED:
+ cma_modify_qp_err(&id_priv->id);
+ event.status = ib_event->param.rej_rcvd.reason;
+ event.event = RDMA_CM_EVENT_REJECTED;
+ event.param.conn.private_data = ib_event->private_data;
+ event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ break;
+ default:
+ log(LOG_ERR, "RDMA CMA: unexpected IB CM event: %d",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, CMA_DESTROYING);
+ cma_enable_remove(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ cma_enable_remove(id_priv);
+ return ret;
+}
+
+static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ struct rdma_route *rt;
+ union cma_ip_addr *src, *dst;
+ __u16 port;
+ u8 ip_ver;
+
+ if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+ &ip_ver, &port, &src, &dst))
+ goto err;
+
+ id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ listen_id->ps);
+ if (IS_ERR(id))
+ goto err;
+
+ cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+ ip_ver, port, src, dst);
+
+ rt = &id->route;
+ rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+ rt->path_rec = malloc(sizeof *rt->path_rec * rt->num_paths,
+ M_DEVBUF, M_NOWAIT);
+ if (!rt->path_rec)
+ goto destroy_id;
+
+ rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
+ if (rt->num_paths == 2)
+ rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+ ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+ ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+ ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+ rt->addr.dev_addr.dev_type = RDMA_NODE_IB_CA;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->state = CMA_CONNECT;
+ return id_priv;
+
+destroy_id:
+ rdma_destroy_id(id);
+err:
+ return NULL;
+}
+
+static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ union cma_ip_addr *src, *dst;
+ __u16 port;
+ u8 ip_ver;
+ int ret;
+
+ id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ listen_id->ps);
+ if (IS_ERR(id))
+ return NULL;
+
+ if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+ &ip_ver, &port, &src, &dst))
+ goto err;
+
+ cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+ ip_ver, port, src, dst);
+
+ ret = rdma_translate_ip(&id->route.addr.src_addr,
+ &id->route.addr.dev_addr);
+ if (ret)
+ goto err;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->state = CMA_CONNECT;
+ return id_priv;
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+ struct ib_cm_req_event_param *req_data,
+ void *private_data, int offset)
+{
+ event->param.conn.private_data = private_data + offset;
+ event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+ event->param.conn.responder_resources = req_data->responder_resources;
+ event->param.conn.initiator_depth = req_data->initiator_depth;
+ event->param.conn.flow_control = req_data->flow_control;
+ event->param.conn.retry_count = req_data->retry_count;
+ event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
+ event->param.conn.srq = req_data->srq;
+ event->param.conn.qp_num = req_data->remote_qpn;
+}
+
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event;
+ int offset, ret;
+
+ listen_id = cm_id->context;
+ if (cma_disable_remove(listen_id, CMA_LISTEN))
+ return (ECONNABORTED);
+
+ memset(&event, 0, sizeof event);
+ offset = cma_user_data_offset(listen_id->id.ps);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ if (cma_is_ud_ps(listen_id->id.ps)) {
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+ event.param.ud.private_data = ib_event->private_data + offset;
+ event.param.ud.private_data_len =
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+ } else {
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+ ib_event->private_data, offset);
+ }
+ if (!conn_id) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ mtx_lock(&conn_id->lock);
+ conn_id->dev_remove++;
+ mtx_unlock(&conn_id->lock);
+ mtx_lock(&lock);
+ ret = cma_acquire_dev(conn_id);
+ mtx_unlock(&lock);
+ if (ret)
+ goto release_conn_id;
+
+ conn_id->cm_id.ib = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_ib_handler;
+
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (!ret)
+ goto out;
+
+ /* Destroy the CM ID by returning a non-zero value. */
+ conn_id->cm_id.ib = NULL;
+
+release_conn_id:
+ cma_exch(conn_id, CMA_DESTROYING);
+ cma_enable_remove(conn_id);
+ rdma_destroy_id(&conn_id->id);
+
+out:
+ cma_enable_remove(listen_id);
+ return ret;
+}
+
+static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr)
+{
+ return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr)));
+}
+
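+/*
+ * Worked example, assuming the port-space values from rdma_cm.h
+ * (RDMA_PS_TCP == 0x0106): a TCP-space id on port 0x1234 yields
+ *
+ *	cpu_to_be64(((u64)0x0106 << 16) + 0x1234)
+ *	    == cpu_to_be64(0x0000000001061234ULL)
+ *
+ * i.e. the IB service ID carries the port space in bits 16-31 and the
+ * IP port number in bits 0-15.
+ */
+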
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+ struct ib_cm_compare_data *compare)
+{
+ struct cma_hdr *cma_data, *cma_mask;
+ struct sdp_hh *sdp_data, *sdp_mask;
+ __u32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ memset(compare, 0, sizeof *compare);
+ cma_data = (void *) compare->data;
+ cma_mask = (void *) compare->mask;
+ sdp_data = (void *) compare->data;
+ sdp_mask = (void *) compare->mask;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+ if (ps == RDMA_PS_SDP) {
+ sdp_set_ip_ver(sdp_data, 4);
+ sdp_set_ip_ver(sdp_mask, 0xF);
+ sdp_data->dst_addr.ip4.addr = ip4_addr;
+ sdp_mask->dst_addr.ip4.addr = ~0;
+ } else {
+ cma_set_ip_ver(cma_data, 4);
+ cma_set_ip_ver(cma_mask, 0xF);
+ cma_data->dst_addr.ip4.addr = ip4_addr;
+ cma_mask->dst_addr.ip4.addr = ~0;
+ }
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
+ if (ps == RDMA_PS_SDP) {
+ sdp_set_ip_ver(sdp_data, 6);
+ sdp_set_ip_ver(sdp_mask, 0xF);
+ sdp_data->dst_addr.ip6 = ip6_addr;
+ memset(&sdp_mask->dst_addr.ip6, 0xFF,
+ sizeof sdp_mask->dst_addr.ip6);
+ } else {
+ cma_set_ip_ver(cma_data, 6);
+ cma_set_ip_ver(cma_mask, 0xF);
+ cma_data->dst_addr.ip6 = ip6_addr;
+ memset(&cma_mask->dst_addr.ip6, 0xFF,
+ sizeof cma_mask->dst_addr.ip6);
+ }
+ break;
+ default:
+ break;
+ }
+}
+#endif /* IB_SUPPORTED */
+
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+ struct rdma_id_private *id_priv = iw_id->context;
+ struct rdma_cm_event event;
+ struct sockaddr_in *sin;
+ int ret = 0;
+
+ if (cma_disable_remove(id_priv, CMA_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CLOSE:
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ *sin = iw_event->local_addr;
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
+ *sin = iw_event->remote_addr;
+ switch (iw_event->status) {
+ case 0:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case ECONNRESET:
+ case ECONNREFUSED:
+ event.event = RDMA_CM_EVENT_REJECTED;
+ break;
+ case ETIMEDOUT:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ break;
+ default:
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ break;
+ }
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ default:
+ panic("unknown event type %d", iw_event->event);
+
+ }
+
+ event.status = iw_event->status;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.iw = NULL;
+ cma_exch(id_priv, CMA_DESTROYING);
+ cma_enable_remove(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+
+ cma_enable_remove(id_priv);
+ return ret;
+}
+
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct rdma_cm_id *new_cm_id;
+ struct rdma_id_private *listen_id, *conn_id;
+ struct sockaddr_in *sin;
+ struct ifnet *dev;
+ struct rdma_cm_event event;
+ int ret;
+ struct ifaddr *ifa;
+ uint16_t port;
+
+ listen_id = cm_id->context;
+ if (cma_disable_remove(listen_id, CMA_LISTEN))
+ return (ECONNABORTED);
+
+ /* Create a new RDMA id for the new IW CM ID */
+ new_cm_id = rdma_create_id(listen_id->id.event_handler,
+ listen_id->id.context,
+ RDMA_PS_TCP);
+ if (!new_cm_id) {
+ ret = ENOMEM;
+ goto out;
+ }
+ conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+ mtx_lock(&conn_id->lock);
+ ++conn_id->dev_remove;
+ mtx_unlock(&conn_id->lock);
+ conn_id->state = CMA_CONNECT;
+
+ port = iw_event->local_addr.sin_port;
+ iw_event->local_addr.sin_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&iw_event->local_addr);
+ iw_event->local_addr.sin_port = port;
+ if (!ifa) {
+ ret = EADDRNOTAVAIL;
+ cma_enable_remove(conn_id);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+ dev = ifa->ifa_ifp;
+ ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
+ if (ret) {
+ cma_enable_remove(conn_id);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ mtx_lock(&lock);
+ ret = cma_acquire_dev(conn_id);
+ mtx_unlock(&lock);
+ if (ret) {
+ cma_enable_remove(conn_id);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ conn_id->cm_id.iw = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_iw_handler;
+
+ sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr;
+ *sin = iw_event->local_addr;
+ sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
+ *sin = iw_event->remote_addr;
+ conn_id->so = cm_id->so;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret) {
+ /* User wants to destroy the CM ID */
+ conn_id->cm_id.iw = NULL;
+ cma_exch(conn_id, CMA_DESTROYING);
+ cma_enable_remove(conn_id);
+ rdma_destroy_id(&conn_id->id);
+ }
+
+out:
+ cma_enable_remove(listen_id);
+ return ret;
+}
+
+#ifdef IB_SUPPORTED
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+ struct ib_cm_compare_data compare_data;
+ struct sockaddr *addr;
+ __be64 svc_id;
+ int ret;
+
+ id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler,
+ id_priv);
+ if (IS_ERR(id_priv->cm_id.ib))
+ return PTR_ERR(id_priv->cm_id.ib);
+
+ addr = &id_priv->id.route.addr.src_addr;
+ svc_id = cma_get_service_id(id_priv->id.ps, addr);
+ if (cma_any_addr(addr))
+ ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
+ else {
+ cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
+ ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
+ }
+
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ return ret;
+}
+#endif
+
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+ int ret;
+ struct sockaddr_in *sin;
+
+ id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device, id_priv->so,
+ iw_conn_req_handler, id_priv);
+ if (IS_ERR(id_priv->cm_id.iw))
+ return PTR_ERR(id_priv->cm_id.iw);
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ id_priv->cm_id.iw->local_addr = *sin;
+
+ ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+
+ if (ret) {
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ id_priv->cm_id.iw = NULL;
+ }
+
+ return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct rdma_id_private *id_priv = id->context;
+
+ id->context = id_priv->id.context;
+ id->event_handler = id_priv->id.event_handler;
+ return id_priv->id.event_handler(id, event);
+}
+
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ struct rdma_id_private *dev_id_priv;
+ struct rdma_cm_id *id;
+ int ret;
+
+ id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
+ if (IS_ERR(id))
+ return;
+
+ dev_id_priv = container_of(id, struct rdma_id_private, id);
+
+ dev_id_priv->state = CMA_ADDR_BOUND;
+ memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
+ ip_addr_size(&id_priv->id.route.addr.src_addr));
+ dev_id_priv->so = id_priv->so; /* XXX */
+
+ cma_attach_to_dev(dev_id_priv, cma_dev);
+ LIST_INSERT_HEAD(&id_priv->listen_list, dev_id_priv, listen_entry);
+
+ ret = rdma_listen(id, id_priv->backlog);
+ if (ret)
+ goto err;
+
+ return;
+err:
+ cma_destroy_listen(dev_id_priv);
+}
+
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev;
+
+ mtx_lock(&lock);
+ LIST_INSERT_HEAD(&listen_any_list, id_priv, list);
+ TAILQ_FOREACH(cma_dev, &dev_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mtx_unlock(&lock);
+}
+
+static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af)
+{
+ struct sockaddr_in addr_in;
+
+ memset(&addr_in, 0, sizeof addr_in);
+ addr_in.sin_family = af;
+ addr_in.sin_len = sizeof addr_in;
+ return rdma_bind_addr(id, (struct sockaddr *) &addr_in);
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == CMA_IDLE) {
+ ret = cma_bind_any(id, AF_INET);
+ if (ret)
+ return ret;
+ }
+
+ if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
+ return (EINVAL);
+
+ id_priv->backlog = backlog;
+ if (id->device) {
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_ib_listen(id_priv);
+ if (ret)
+ goto err;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ ret = cma_iw_listen(id_priv, backlog);
+ if (ret)
+ goto err;
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ ret = ENOSYS;
+ goto err;
+ }
+#endif
+ } else
+ cma_listen_on_all(id_priv);
+
+ return 0;
+err:
+ id_priv->backlog = 0;
+ cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
+ return ret;
+}
+
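+/*
+ * Passive-side usage sketch, in the style of the krping consumer
+ * imported alongside this file ("my_handler" and "ctx" are placeholders,
+ * error handling elided):
+ *
+ *	struct rdma_cm_id *id;
+ *	struct sockaddr_in sin;
+ *
+ *	id = rdma_create_id(my_handler, ctx, RDMA_PS_TCP);
+ *	bzero(&sin, sizeof sin);
+ *	sin.sin_len = sizeof sin;
+ *	sin.sin_family = AF_INET;
+ *	sin.sin_port = htons(9999);
+ *	rdma_bind_addr(id, (struct sockaddr *)&sin);
+ *	rdma_listen(id, 3);
+ *
+ * Each incoming request then reaches my_handler() as an
+ * RDMA_CM_EVENT_CONNECT_REQUEST on a freshly created child id.
+ */
+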
+#ifdef IB_SUPPORTED
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+ void *context)
+{
+ struct cma_work *work = context;
+ struct rdma_route *route;
+
+ route = &work->id->id.route;
+
+ if (!status) {
+ route->num_paths = 1;
+ *route->path_rec = *path_rec;
+ } else {
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+ work->event.status = status;
+ }
+
+ taskqueue_enqueue(cma_wq, &work->task);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+ struct cma_work *work)
+{
+ struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr;
+ struct ib_sa_path_rec path_rec;
+
+ memset(&path_rec, 0, sizeof path_rec);
+ ib_addr_get_sgid(addr, &path_rec.sgid);
+ ib_addr_get_dgid(addr, &path_rec.dgid);
+ path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr));
+ path_rec.numb_path = 1;
+ path_rec.reversible = 1;
+
+ id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &path_rec,
+ IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_REVERSIBLE,
+ timeout_ms, M_NOWAIT,
+ cma_query_handler, work, &id_priv->query);
+
+ return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+#endif
+
+static void cma_work_handler(void *context, int pending)
+{
+ struct cma_work *work = context;
+ struct rdma_id_private *id_priv = work->id;
+ int destroy = 0;
+
+ mtx_lock(&id_priv->lock);
+ ++id_priv->dev_remove;
+ mtx_unlock(&id_priv->lock);
+ if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+ goto out;
+
+ if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+ cma_exch(id_priv, CMA_DESTROYING);
+ destroy = 1;
+ }
+out:
+ cma_enable_remove(id_priv);
+ cma_deref_id(id_priv);
+ if (destroy)
+ rdma_destroy_id(&id_priv->id);
+ free(work, M_DEVBUF);
+}
+
+#ifdef IB_SUPPORTED
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct cma_work *work;
+ int ret;
+
+ work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT);
+ if (!work)
+ return (ENOMEM);
+ bzero(work, sizeof *work);
+
+ work->id = id_priv;
+ TASK_INIT(&work->task, 0, cma_work_handler, work);
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+ route->path_rec = malloc(sizeof *route->path_rec, M_DEVBUF, M_NOWAIT);
+ if (!route->path_rec) {
+ ret = ENOMEM;
+ goto err1;
+ }
+
+ ret = cma_query_ib_route(id_priv, timeout_ms, work);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ free(route->path_rec, M_DEVBUF);
+ route->path_rec = NULL;
+err1:
+ free(work, M_DEVBUF);
+ return ret;
+}
+
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+ struct ib_sa_path_rec *path_rec, int num_paths)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
+ return (EINVAL);
+
+ id->route.path_rec = malloc(sizeof *path_rec * num_paths, M_DEVBUF, M_NOWAIT);
+ if (!id->route.path_rec) {
+ ret = ENOMEM;
+ goto err;
+ }
+
+ memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
+ return ret;
+}
+#endif
+
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct cma_work *work;
+
+ work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT);
+ if (!work)
+ return (ENOMEM);
+ bzero(work, sizeof *work);
+
+ work->id = id_priv;
+ TASK_INIT(&work->task, 0, cma_work_handler, work);
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ taskqueue_enqueue(cma_wq, &work->task);
+ return 0;
+}
+
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
+ return (EINVAL);
+
+ mtx_lock(&id_priv->lock);
+ id_priv->refcount++;
+ mtx_unlock(&id_priv->lock);
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_resolve_ib_route(id_priv, timeout_ms);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ ret = cma_resolve_iw_route(id_priv, timeout_ms);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ ret = ENOSYS;
+ break;
+ }
+#endif
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
+ cma_deref_id(id_priv);
+ return ret;
+}
+
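+/*
+ * For iWARP the route is implied by the IP path, so
+ * cma_resolve_iw_route() merely defers a ROUTE_RESOLVED event through
+ * cma_wq.  Callers should still treat the call as asynchronous (sketch):
+ *
+ *	rdma_resolve_route(id, 2000);
+ *	// in the event handler:
+ *	//	RDMA_CM_EVENT_ROUTE_RESOLVED -> proceed to rdma_connect()
+ *	//	RDMA_CM_EVENT_ROUTE_ERROR    -> bail out
+ */
+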
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev;
+ struct ib_port_attr port_attr;
+ union ib_gid gid;
+ u16 pkey;
+ int ret;
+ u8 p;
+
+ mtx_lock(&lock);
+ if (TAILQ_EMPTY(&dev_list)) {
+ ret = ENODEV;
+ goto out;
+ }
+ TAILQ_FOREACH(cma_dev, &dev_list, list)
+ for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p)
+ if (!ib_query_port(cma_dev->device, p, &port_attr) &&
+ port_attr.state == IB_PORT_ACTIVE)
+ goto port_found;
+
+ p = 1;
+ cma_dev = TAILQ_FIRST(&dev_list);
+
+port_found:
+ ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+ if (ret)
+ goto out;
+
+ ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+ if (ret)
+ goto out;
+
+ ib_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+ id_priv->id.port_num = p;
+ cma_attach_to_dev(id_priv, cma_dev);
+out:
+ mtx_unlock(&lock);
+ return ret;
+}
+
+static void addr_handler(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *dev_addr, void *context)
+{
+ struct rdma_id_private *id_priv = context;
+ struct rdma_cm_event event;
+
+ memset(&event, 0, sizeof event);
+ mtx_lock(&id_priv->lock);
+ ++id_priv->dev_remove;
+ mtx_unlock(&id_priv->lock);
+
+ /*
+ * Grab mutex to block rdma_destroy_id() from removing the device while
+ * we're trying to acquire it.
+ */
+ mtx_lock(&lock);
+ if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) {
+ mtx_unlock(&lock);
+ goto out;
+ }
+
+ if (!status && !id_priv->cma_dev)
+ status = cma_acquire_dev(id_priv);
+ mtx_unlock(&lock);
+
+ if (status) {
+ if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND))
+ goto out;
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = status;
+ } else {
+ memcpy(&id_priv->id.route.addr.src_addr, src_addr,
+ ip_addr_size(src_addr));
+ event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ }
+
+ if (id_priv->id.event_handler(&id_priv->id, &event)) {
+ cma_exch(id_priv, CMA_DESTROYING);
+ cma_enable_remove(id_priv);
+ cma_deref_id(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return;
+ }
+out:
+ cma_enable_remove(id_priv);
+ cma_deref_id(id_priv);
+}
+
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ struct sockaddr_in *src_in, *dst_in;
+ union ib_gid gid;
+ int ret;
+
+ work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT);
+ if (!work)
+ return (ENOMEM);
+ bzero(work, sizeof *work);
+
+ if (!id_priv->cma_dev) {
+ ret = cma_bind_loopback(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
+
+ if (cma_zero_addr(&id_priv->id.route.addr.src_addr)) {
+ src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr;
+ dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr;
+ src_in->sin_family = dst_in->sin_family;
+ src_in->sin_addr.s_addr = dst_in->sin_addr.s_addr;
+ }
+
+ work->id = id_priv;
+ TASK_INIT(&work->task, 0, cma_work_handler, work);
+ work->old_state = CMA_ADDR_QUERY;
+ work->new_state = CMA_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ taskqueue_enqueue(cma_wq, &work->task);
+ return 0;
+err:
+ free(work, M_DEVBUF);
+ return ret;
+}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr)
+{
+ if (src_addr && src_addr->sa_family)
+ return rdma_bind_addr(id, src_addr);
+ else
+ return cma_bind_any(id, dst_addr->sa_family);
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == CMA_IDLE) {
+ ret = cma_bind_addr(id, src_addr, dst_addr);
+ if (ret)
+ return ret;
+ }
+
+ if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
+ return (EINVAL);
+
+ mtx_lock(&id_priv->lock);
+ id_priv->refcount++;
+ mtx_unlock(&id_priv->lock);
+ memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
+ if (cma_any_addr(dst_addr))
+ ret = cma_resolve_loopback(id_priv);
+ else
+ ret = rdma_resolve_ip(&addr_client, &id->route.addr.src_addr,
+ dst_addr, &id->route.addr.dev_addr,
+ timeout_ms, addr_handler, id_priv);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND);
+ cma_deref_id(id_priv);
+ return ret;
+}
+
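+/*
+ * Active-side usage sketch (the destination below is a sample value,
+ * error handling elided):
+ *
+ *	struct sockaddr_in dst;
+ *
+ *	bzero(&dst, sizeof dst);
+ *	dst.sin_len = sizeof dst;
+ *	dst.sin_family = AF_INET;
+ *	dst.sin_port = htons(9999);
+ *	dst.sin_addr.s_addr = htonl(0x0a000001);	/* 10.0.0.1, sample */
+ *	rdma_resolve_addr(id, NULL, (struct sockaddr *)&dst, 2000);
+ *
+ * The handler then chains RDMA_CM_EVENT_ADDR_RESOLVED into
+ * rdma_resolve_route() and RDMA_CM_EVENT_ROUTE_RESOLVED into
+ * rdma_connect().
+ */
+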
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv)
+{
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ sin->sin_port = htons(bind_list->port);
+ id_priv->bind_list = bind_list;
+ TAILQ_INSERT_HEAD(&bind_list->owners, id_priv, node);
+}
+
+static int cma_alloc_port(struct kvl *ps, struct rdma_id_private *id_priv,
+ unsigned short snum)
+{
+ struct rdma_bind_list *bind_list;
+ int port, ret;
+
+ bind_list = malloc(sizeof *bind_list, M_DEVBUF, M_NOWAIT);
+ if (!bind_list)
+ return (ENOMEM);
+ bzero(bind_list, sizeof *bind_list);
+
+ do {
+ ret = kvl_alloc_above(ps, bind_list, snum, &port);
+ } while (ret == EAGAIN);
+
+ if (ret)
+ goto err1;
+
+ if (port != snum) {
+ ret = EADDRNOTAVAIL;
+ goto err2;
+ }
+
+ bind_list->ps = ps;
+ bind_list->port = (unsigned short) port;
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+err2:
+ kvl_delete(ps, port);
+err1:
+ free(bind_list, M_DEVBUF);
+ return ret;
+}
+
+static int cma_alloc_any_port(struct kvl *ps, struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list;
+ int port, ret;
+
+ bind_list = malloc(sizeof *bind_list, M_DEVBUF, M_NOWAIT);
+ if (!bind_list)
+ return (ENOMEM);
+ bzero(bind_list, sizeof *bind_list);
+
+retry:
+ do {
+ ret = kvl_alloc_above(ps, bind_list, next_port, &port);
+ } while (ret == EAGAIN);
+
+ if (ret)
+ goto err1;
+
+ if (port > ipport_lastauto) {
+ if (next_port != ipport_firstauto) {
+ kvl_delete(ps, port);
+ next_port = ipport_firstauto;
+ goto retry;
+ }
+ ret = EADDRNOTAVAIL;
+ goto err2;
+ }
+
+ if (port == ipport_lastauto)
+ next_port = ipport_firstauto;
+ else
+ next_port = port + 1;
+
+ bind_list->ps = ps;
+ bind_list->port = (unsigned short) port;
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+err2:
+ kvl_delete(ps, port);
+err1:
+ free(bind_list, M_DEVBUF);
+ return ret;
+}
+
+static int cma_use_port(struct kvl *ps, struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *cur_id;
+ struct sockaddr_in *sin, *cur_sin;
+ struct rdma_bind_list *bind_list;
+ unsigned short snum;
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ snum = ntohs(sin->sin_port);
+ if (snum <= ipport_reservedhigh && snum >= ipport_reservedlow &&
+ priv_check(curthread, PRIV_NETINET_RESERVEDPORT))
+ return (EACCES);
+
+ bind_list = kvl_lookup(ps, snum);
+ if (!bind_list)
+ return cma_alloc_port(ps, id_priv, snum);
+
+ /*
+ * We don't support a wildcard bind when another id is already
+ * bound to a specific address on the same port.
+ */
+ if (cma_any_addr(&id_priv->id.route.addr.src_addr))
+ return (EADDRNOTAVAIL);
+
+ TAILQ_FOREACH(cur_id, &bind_list->owners, node) {
+ if (cma_any_addr(&cur_id->id.route.addr.src_addr))
+ return (EADDRNOTAVAIL);
+
+ cur_sin = (struct sockaddr_in *)&cur_id->id.route.addr.src_addr;
+ if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr)
+ return (EADDRINUSE);
+ }
+
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+}
+
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+ int ret;
+ struct socket *so;
+
+ ret = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
+ curthread->td_ucred, curthread);
+ if (ret) {
+ printf("%s socreate err %d\n", __FUNCTION__, ret);
+ return ret;
+ }
+
+ ret = sobind(so, (struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ curthread);
+ if (ret) {
+ soclose(so);
+ return ret;
+ }
+ id_priv->so = so;
+ return 0;
+}
+
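+/*
+ * The socket created above never carries data; holding a bound TCP
+ * socket keeps the native stack from handing the same port to a host
+ * application while the RDMA listener owns it.  It is closed again in
+ * cma_release_port().
+ */
+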
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+ struct kvl *ps;
+ int ret;
+
+ switch (id_priv->id.ps) {
+ case RDMA_PS_SDP:
+ ps = &sdp_ps;
+ break;
+ case RDMA_PS_TCP:
+ ps = &tcp_ps;
+ ret = cma_get_tcp_port(id_priv); /* Synch with native stack */
+ if (ret)
+ return ret;
+ break;
+ case RDMA_PS_UDP:
+ ps = &udp_ps;
+ break;
+ case RDMA_PS_IPOIB:
+ ps = &ipoib_ps;
+ break;
+ default:
+ return (EPROTONOSUPPORT);
+ }
+
+ mtx_lock(&lock);
+ if (cma_any_port(&id_priv->id.route.addr.src_addr))
+ ret = cma_alloc_any_port(ps, id_priv);
+ else
+ ret = cma_use_port(ps, id_priv);
+ mtx_unlock(&lock);
+
+ return ret;
+}
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ if (addr->sa_family != AF_INET)
+ return (EAFNOSUPPORT);
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
+ return (EINVAL);
+
+ if (!cma_any_addr(addr)) {
+ ret = rdma_translate_ip(addr, &id->route.addr.dev_addr);
+ if (ret)
+ goto err1;
+
+ mtx_lock(&lock);
+ ret = cma_acquire_dev(id_priv);
+ mtx_unlock(&lock);
+ if (ret)
+ goto err1;
+ }
+
+ memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
+ ret = cma_get_port(id_priv);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ if (!cma_any_addr(addr)) {
+ mtx_lock(&lock);
+ cma_detach_from_dev(id_priv);
+ mtx_unlock(&lock);
+ }
+err1:
+ cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
+ return ret;
+}
+
+#ifdef IB_SUPPORTED
+static int cma_format_hdr(void *hdr, enum rdma_port_space ps,
+ struct rdma_route *route)
+{
+ struct sockaddr_in *src4, *dst4;
+ struct cma_hdr *cma_hdr;
+ struct sdp_hh *sdp_hdr;
+
+ src4 = (struct sockaddr_in *) &route->addr.src_addr;
+ dst4 = (struct sockaddr_in *) &route->addr.dst_addr;
+
+ switch (ps) {
+ case RDMA_PS_SDP:
+ sdp_hdr = hdr;
+ if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
+ return (EINVAL);
+ sdp_set_ip_ver(sdp_hdr, 4);
+ sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ sdp_hdr->port = src4->sin_port;
+ break;
+ default:
+ cma_hdr = hdr;
+ cma_hdr->cma_version = CMA_VERSION;
+ cma_set_ip_ver(cma_hdr, 4);
+ cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ cma_hdr->port = src4->sin_port;
+ break;
+ }
+ return 0;
+}
+
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event;
+ struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
+ int ret = 0;
+
+ if (cma_disable_remove(id_priv, CMA_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (ib_event->event) {
+ case IB_CM_SIDR_REQ_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = ETIMEDOUT;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ event.param.ud.private_data = ib_event->private_data;
+ event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ if (rep->status != IB_SIDR_SUCCESS) {
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = ib_event->param.sidr_rep_rcvd.status;
+ break;
+ }
+ if (id_priv->qkey != rep->qkey) {
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = EINVAL;
+ break;
+ }
+ ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
+ id_priv->id.route.path_rec,
+ &event.param.ud.ah_attr);
+ event.param.ud.qp_num = rep->qpn;
+ event.param.ud.qkey = rep->qkey;
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.status = 0;
+ break;
+ default:
+ log(LOG_ERR, "RDMA CMA: unexpected IB CM event: %d",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, CMA_DESTROYING);
+ cma_enable_remove(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ cma_enable_remove(id_priv);
+ return ret;
+}
+
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_sidr_req_param req;
+ struct rdma_route *route;
+ int ret;
+
+ req.private_data_len = sizeof(struct cma_hdr) +
+ conn_param->private_data_len;
+ req.private_data = malloc(req.private_data_len, M_DEVBUF, M_NOWAIT);
+ if (!req.private_data)
+ return (ENOMEM);
+ bzero((void *)req.private_data, req.private_data_len);
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy((caddr_t) req.private_data + sizeof(struct cma_hdr),
+ conn_param->private_data, conn_param->private_data_len);
+
+ route = &id_priv->id.route;
+ ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route);
+ if (ret)
+ goto out;
+
+ id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device,
+ cma_sidr_rep_handler, id_priv);
+ if (IS_ERR(id_priv->cm_id.ib)) {
+ ret = PTR_ERR(id_priv->cm_id.ib);
+ goto out;
+ }
+
+ req.path = route->path_rec;
+ req.service_id = cma_get_service_id(id_priv->id.ps,
+ &route->addr.dst_addr);
+ req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+ ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+out:
+ free((void *)req.private_data, M_DEVBUF);
+ return ret;
+}
+
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_req_param req;
+ struct rdma_route *route;
+ void *private_data;
+ int offset, ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv->id.ps);
+ req.private_data_len = offset + conn_param->private_data_len;
+ private_data = malloc(req.private_data_len, M_DEVBUF, M_NOWAIT);
+ if (!private_data)
+ return (ENOMEM);
+ bzero(private_data, req.private_data_len);
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler,
+ id_priv);
+ if (IS_ERR(id_priv->cm_id.ib)) {
+ ret = PTR_ERR(id_priv->cm_id.ib);
+ goto out;
+ }
+
+ route = &id_priv->id.route;
+ ret = cma_format_hdr(private_data, id_priv->id.ps, route);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+
+ req.primary_path = &route->path_rec[0];
+ if (route->num_paths == 2)
+ req.alternate_path = &route->path_rec[1];
+
+ req.service_id = cma_get_service_id(id_priv->id.ps,
+ &route->addr.dst_addr);
+ req.qp_num = id_priv->qp_num;
+ req.qp_type = IB_QPT_RC;
+ req.starting_psn = id_priv->seq_num;
+ req.responder_resources = conn_param->responder_resources;
+ req.initiator_depth = conn_param->initiator_depth;
+ req.flow_control = conn_param->flow_control;
+ req.retry_count = conn_param->retry_count;
+ req.rnr_retry_count = conn_param->rnr_retry_count;
+ req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+ req.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+ if (ret && !IS_ERR(id_priv->cm_id.ib)) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ free(private_data, M_DEVBUF);
+ return ret;
+}
+#endif
+
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_id *cm_id;
+ struct sockaddr_in *sin;
+ int ret;
+ struct iw_cm_conn_param iw_param;
+
+ cm_id = iw_create_cm_id(id_priv->id.device, id_priv->so,
+ cma_iw_handler, id_priv);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ goto out;
+ }
+
+ id_priv->cm_id.iw = cm_id;
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ cm_id->local_addr = *sin;
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
+ cm_id->remote_addr = *sin;
+
+ ret = cma_modify_qp_rtr(&id_priv->id);
+ if (ret)
+ goto out;
+
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ if (id_priv->id.qp)
+ iw_param.qpn = id_priv->qp_num;
+ else
+ iw_param.qpn = conn_param->qp_num;
+ ret = iw_cm_connect(cm_id, &iw_param);
+out:
+ if (ret && !IS_ERR(cm_id)) {
+ iw_destroy_cm_id(cm_id);
+ id_priv->cm_id.iw = NULL;
+ }
+ return ret;
+}
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
+ return (EINVAL);
+
+ if (!id->qp) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (cma_is_ud_ps(id->ps))
+ ret = cma_resolve_ib_udp(id_priv, conn_param);
+ else
+ ret = cma_connect_ib(id_priv, conn_param);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ ret = cma_connect_iw(id_priv, conn_param);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ ret = ENOSYS;
+ break;
+ }
+#endif
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
+ return ret;
+}
+
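+/*
+ * Connection-parameter sketch for the iWARP path above (values are
+ * illustrative; the krping consumer uses a similar setup):
+ *
+ *	struct rdma_conn_param conn_param;
+ *
+ *	memset(&conn_param, 0, sizeof conn_param);
+ *	conn_param.responder_resources = 1;
+ *	conn_param.initiator_depth = 1;
+ *	conn_param.retry_count = 10;
+ *	rdma_connect(id, &conn_param);
+ */
+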
+#ifdef IB_SUPPORTED
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_rep_param rep;
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ if (id_priv->id.qp) {
+ ret = cma_modify_qp_rtr(&id_priv->id);
+ if (ret)
+ goto out;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, &qp_attr,
+ &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.max_rd_atomic = conn_param->initiator_depth;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ goto out;
+ }
+
+ memset(&rep, 0, sizeof rep);
+ rep.qp_num = id_priv->qp_num;
+ rep.starting_psn = id_priv->seq_num;
+ rep.private_data = conn_param->private_data;
+ rep.private_data_len = conn_param->private_data_len;
+ rep.responder_resources = conn_param->responder_resources;
+ rep.initiator_depth = conn_param->initiator_depth;
+ rep.target_ack_delay = CMA_CM_RESPONSE_TIMEOUT;
+ rep.failover_accepted = 0;
+ rep.flow_control = conn_param->flow_control;
+ rep.rnr_retry_count = conn_param->rnr_retry_count;
+ rep.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+out:
+ return ret;
+}
+#endif
+
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_conn_param iw_param;
+ int ret;
+
+ ret = cma_modify_qp_rtr(&id_priv->id);
+ if (ret)
+ return ret;
+
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ if (id_priv->id.qp)
+ iw_param.qpn = id_priv->qp_num;
+ else
+ iw_param.qpn = conn_param->qp_num;
+
+ return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+#ifdef IB_SUPPORTED
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+ enum ib_cm_sidr_status status,
+ const void *private_data, int private_data_len)
+{
+ struct ib_cm_sidr_rep_param rep;
+
+ memset(&rep, 0, sizeof rep);
+ rep.status = status;
+ if (status == IB_SIDR_SUCCESS) {
+ rep.qp_num = id_priv->qp_num;
+ rep.qkey = id_priv->qkey;
+ }
+ rep.private_data = private_data;
+ rep.private_data_len = private_data_len;
+
+ return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+#endif
+
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp(id_priv, CMA_CONNECT))
+ return (EINVAL);
+
+ if (!id->qp && conn_param) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (cma_is_ud_ps(id->ps)) {
+ if (conn_param)
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ conn_param->private_data,
+ conn_param->private_data_len);
+ else
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ NULL, 0);
+ } else if (conn_param)
+ ret = cma_accept_ib(id_priv, conn_param);
+ else
+ ret = cma_rep_recv(id_priv);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ ret = cma_accept_iw(id_priv, conn_param);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ ret = ENOSYS;
+ break;
+ }
+#endif
+
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(id);
+ rdma_reject(id, NULL, 0);
+ return ret;
+}
+
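+/*
+ * Accept-side sketch: from the CONNECT_REQUEST callback the consumer
+ * typically creates its verbs resources on the child id and then
+ * accepts (placeholder names, error handling elided):
+ *
+ *	case RDMA_CM_EVENT_CONNECT_REQUEST:
+ *		(set up PD/CQ/QP on the child id first)
+ *		memset(&conn_param, 0, sizeof conn_param);
+ *		conn_param.responder_resources = 1;
+ *		conn_param.initiator_depth = 1;
+ *		rdma_accept(child_id, &conn_param);
+ *		break;
+ */
+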
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_has_cm_dev(id_priv))
+ return (EINVAL);
+#ifdef IB_SUPPORTED
+ switch (id->device->node_type) {
+ case RDMA_NODE_IB_CA:
+ ret = ib_cm_notify(id_priv->cm_id.ib, event);
+ break;
+ default:
+#endif
+ ret = 0;
+#ifdef IB_SUPPORTED
+ break;
+ }
+#endif
+ return ret;
+}
+
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_has_cm_dev(id_priv))
+ return (EINVAL);
+
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (cma_is_ud_ps(id->ps))
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
+ private_data, private_data_len);
+ else
+ ret = ib_send_cm_rej(id_priv->cm_id.ib,
+ IB_CM_REJ_CONSUMER_DEFINED, NULL,
+ 0, private_data, private_data_len);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ ret = iw_cm_reject(id_priv->cm_id.iw,
+ private_data, private_data_len);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ ret = ENOSYS;
+ break;
+ }
+#endif
+ return ret;
+}
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_has_cm_dev(id_priv))
+ return (EINVAL);
+
+#ifdef IB_SUPPORTED
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_modify_qp_err(id);
+ if (ret)
+ goto out;
+ /* Initiate or respond to a disconnect. */
+ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+ ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+#endif
+ ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+#ifdef IB_SUPPORTED
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+out:
+#endif
+ return ret;
+}
+
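+/*
+ * Teardown sketch: a consumer disconnects first and destroys the id
+ * only after the handler has seen the resulting event:
+ *
+ *	rdma_disconnect(id);
+ *	// once RDMA_CM_EVENT_DISCONNECTED has been delivered:
+ *	rdma_destroy_id(id);
+ */
+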
+#ifdef IB_SUPPORTED
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc = multicast->context;
+ struct rdma_cm_event event;
+ int ret;
+
+ id_priv = mc->id_priv;
+ if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) &&
+ cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
+ return 0;
+
+ if (!status && id_priv->id.qp)
+ status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
+ multicast->rec.mlid);
+
+ memset(&event, 0, sizeof event);
+ event.status = status;
+ event.param.ud.private_data = mc->context;
+ if (!status) {
+ event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+ ib_init_ah_from_mcmember(id_priv->id.device,
+ id_priv->id.port_num, &multicast->rec,
+ &event.param.ud.ah_attr);
+ event.param.ud.qp_num = 0xFFFFFF;
+ event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+ } else
+ event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ cma_exch(id_priv, CMA_DESTROYING);
+ cma_enable_remove(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return 0;
+ }
+
+ cma_enable_remove(id_priv);
+ return 0;
+}
+
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+ struct sockaddr *addr, union ib_gid *mgid)
+{
+ unsigned char mc_map[MAX_ADDR_LEN];
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if ((addr->sa_family == AF_INET6) &&
+ ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFF10A01B) ==
+ 0xFF10A01B)) {
+ /* IPv6 address is an SA assigned MGID. */
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else {
+ ip_ib_mc_map(sin->sin_addr.s_addr, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8;
+ mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr);
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ }
+}
+
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct ib_sa_mcmember_rec rec;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ ib_sa_comp_mask comp_mask;
+ int ret;
+
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+ &rec.mgid, &rec);
+ if (ret)
+ return ret;
+
+ cma_set_mgid(id_priv, &mc->addr, &rec.mgid);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+ ib_addr_get_sgid(dev_addr, &rec.port_gid);
+ rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ rec.join_state = 1;
+
+ comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+ IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+ mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &rec,
+ comp_mask, M_NOWAIT,
+ cma_ib_mc_handler, mc);
+ if (IS_ERR(mc->multicast.ib))
+ return PTR_ERR(mc->multicast.ib);
+
+ return 0;
+}
+
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ void *context)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
+ !cma_comp(id_priv, CMA_ADDR_RESOLVED))
+ return (EINVAL);
+
+ mc = malloc(sizeof *mc, M_DEVBUF, M_NOWAIT);
+ if (!mc)
+ return (ENOMEM);
+
+ memcpy(&mc->addr, addr, ip_addr_size(addr));
+ mc->context = context;
+ mc->id_priv = id_priv;
+
+ mtx_lock(&id_priv->lock);
+ LIST_INSERT_HEAD(&id_priv->mc_list, mc, list);
+ mtx_unlock(&id_priv->lock);
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_join_ib_multicast(id_priv, mc);
+ break;
+ default:
+ ret = ENOSYS;
+ break;
+ }
+
+ if (ret) {
+ mtx_lock(&id_priv->lock);
+ LIST_REMOVE(mc, list);
+ mtx_unlock(&id_priv->lock);
+ free(mc, M_DEVBUF);
+ }
+ return ret;
+}
+
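+/*
+ * Multicast usage sketch (UD port spaces over IB only; "mcaddr" and
+ * "ctx" are placeholders):
+ *
+ *	rdma_join_multicast(id, (struct sockaddr *)&mcaddr, ctx);
+ *	// handler: RDMA_CM_EVENT_MULTICAST_JOIN delivers the ah_attr,
+ *	//	qp_num (0xFFFFFF) and qkey needed for UD sends
+ *	rdma_leave_multicast(id, (struct sockaddr *)&mcaddr);
+ */
+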
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mtx_lock(&id_priv->lock);
+ LIST_FOREACH(mc, &id_priv->mc_list, list) {
+ if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) {
+ LIST_REMOVE(mc, list);
+ mtx_unlock(&id_priv->lock);
+
+ if (id->qp)
+ ib_detach_mcast(id->qp,
+ &mc->multicast.ib->rec.mgid,
+ mc->multicast.ib->rec.mlid);
+ ib_sa_free_multicast(mc->multicast.ib, M_DEVBUF);
+ free(mc, M_DEVBUF);
+ return;
+ }
+ }
+ mtx_unlock(&id_priv->lock);
+}
+#endif
+
+static void cma_add_one(struct ib_device *device)
+{
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+
+ cma_dev = malloc(sizeof *cma_dev, M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (!cma_dev)
+ return;
+
+ cma_dev->device = device;
+
+ cv_init(&cma_dev->comp, "cma_device");
+ mtx_init(&cma_dev->lock, "cma_device", NULL, MTX_DUPOK|MTX_DEF);
+ cma_dev->refcount = 1;
+ LIST_INIT(&cma_dev->id_list);
+ ib_set_client_data(device, &cma_client, cma_dev);
+
+ mtx_lock(&lock);
+ TAILQ_INSERT_TAIL(&dev_list, cma_dev, list);
+ LIST_FOREACH(id_priv, &listen_any_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mtx_unlock(&lock);
+}
+
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+ struct rdma_cm_event event;
+ enum cma_state state;
+
+ /* Record that we want to remove the device */
+ state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
+ if (state == CMA_DESTROYING)
+ return 0;
+
+ cma_cancel_operation(id_priv, state);
+ mtx_lock(&id_priv->lock);
+ PANIC_IF(id_priv->dev_remove < 0);
+ while (id_priv->dev_remove)
+ cv_wait(&id_priv->wait_remove, &id_priv->lock);
+ mtx_unlock(&id_priv->lock);
+
+ /* Check for destruction from another callback. */
+ if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+ return id_priv->id.event_handler(&id_priv->id, &event);
+}
+
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ mtx_lock(&lock);
+ while (!LIST_EMPTY(&cma_dev->id_list)) {
+ id_priv = LIST_FIRST(&cma_dev->id_list);
+
+ if (cma_internal_listen(id_priv)) {
+ cma_destroy_listen(id_priv);
+ continue;
+ }
+
+ LIST_REMOVE(id_priv, list);
+ mtx_lock(&id_priv->lock);
+ id_priv->refcount++;
+ mtx_unlock(&id_priv->lock);
+ mtx_unlock(&lock);
+
+ ret = cma_remove_id_dev(id_priv);
+ cma_deref_id(id_priv);
+ if (ret)
+ rdma_destroy_id(&id_priv->id);
+
+ mtx_lock(&lock);
+ }
+ mtx_unlock(&lock);
+
+ cma_deref_dev(cma_dev);
+ mtx_lock(&cma_dev->lock);
+ PANIC_IF(cma_dev->refcount < 0);
+ while (cma_dev->refcount)
+ cv_wait(&cma_dev->comp, &cma_dev->lock);
+ mtx_unlock(&cma_dev->lock);
+}
+
+static void cma_remove_one(struct ib_device *device)
+{
+ struct cma_device *cma_dev;
+
+ cma_dev = ib_get_client_data(device, &cma_client);
+ if (!cma_dev)
+ return;
+
+ mtx_lock(&lock);
+ TAILQ_REMOVE(&dev_list, cma_dev, list);
+ mtx_unlock(&lock);
+
+ cma_process_remove(cma_dev);
+ free(cma_dev, M_DEVBUF);
+}
+
+static int cma_init(void)
+{
+ int ret;
+
+ LIST_INIT(&listen_any_list);
+ TAILQ_INIT(&dev_list);
+ mtx_init(&lock, "cma_device list", NULL, MTX_DEF);
+
+ arc4rand(&next_port, sizeof next_port, 0);
+ next_port = ((unsigned int) next_port %
+ (ipport_lastauto - ipport_firstauto)) +
+ ipport_firstauto;
+ cma_wq = taskqueue_create("rdma_cm", M_NOWAIT, taskqueue_thread_enqueue,
+ &cma_wq);
+
+ if (!cma_wq)
+ return (ENOMEM);
+
+ taskqueue_start_threads(&cma_wq, 1, PI_NET, "cma_wq thread");
+#ifdef IB_SUPPORTED
+ ib_sa_register_client(&sa_client);
+#endif
+ rdma_addr_register_client(&addr_client);
+
+ ret = ib_register_client(&cma_client);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ rdma_addr_unregister_client(&addr_client);
+#ifdef IB_SUPPORTED
+ ib_sa_unregister_client(&sa_client);
+#endif
+ taskqueue_free(cma_wq);
+ return ret;
+}
+
+static void cma_cleanup(void)
+{
+ ib_unregister_client(&cma_client);
+ rdma_addr_unregister_client(&addr_client);
+#ifdef IB_SUPPORTED
+ ib_sa_unregister_client(&sa_client);
+#endif
+ taskqueue_free(cma_wq);
+ kvl_free(&sdp_ps);
+ kvl_free(&tcp_ps);
+ kvl_free(&udp_ps);
+ kvl_free(&ipoib_ps);
+}
+
+static int
+cma_load(module_t mod, int cmd, void *arg)
+{
+ int err = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ printf("Loading rdma_cma.\n");
+ cma_init();
+ break;
+ case MOD_QUIESCE:
+ break;
+ case MOD_UNLOAD:
+ printf("Unloading rdma_cma.\n");
+ cma_cleanup();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+
+ return (err);
+}
+
+static moduledata_t mod_data = {
+ "rdma_cma",
+ cma_load,
+ 0
+};
+
+MODULE_VERSION(rdma_cma, 1);
+MODULE_DEPEND(rdma_cma, rdma_core, 1, 1, 1);
+MODULE_DEPEND(rdma_cma, rdma_addr, 1, 1, 1);
+MODULE_DEPEND(rdma_cma, rdma_iwcm, 1, 1, 1);
+DECLARE_MODULE(rdma_cma, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/contrib/rdma/rdma_device.c b/sys/contrib/rdma/rdma_device.c
new file mode 100644
index 000000000000..53cf31fb28d9
--- /dev/null
+++ b/sys/contrib/rdma/rdma_device.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: device.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/condvar.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/socket.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/priv.h>
+#include <sys/syslog.h>
+
+#include <contrib/rdma/core_priv.h>
+
+struct ib_client_data {
+ TAILQ_ENTRY(ib_client_data) list;
+ struct ib_client *client;
+ void * data;
+};
+
+static TAILQ_HEAD(, ib_device) device_list;
+static TAILQ_HEAD(client_list_s, ib_client) client_list;
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list. In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static struct mtx device_mutex;
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+#define MANDATORY_TABLE_DEPTH 19
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ IB_MANDATORY_FUNC(query_device),
+ IB_MANDATORY_FUNC(query_port),
+ IB_MANDATORY_FUNC(query_pkey),
+ IB_MANDATORY_FUNC(query_gid),
+ IB_MANDATORY_FUNC(alloc_pd),
+ IB_MANDATORY_FUNC(dealloc_pd),
+ IB_MANDATORY_FUNC(create_ah),
+ IB_MANDATORY_FUNC(destroy_ah),
+ IB_MANDATORY_FUNC(create_qp),
+ IB_MANDATORY_FUNC(modify_qp),
+ IB_MANDATORY_FUNC(destroy_qp),
+ IB_MANDATORY_FUNC(post_send),
+ IB_MANDATORY_FUNC(post_recv),
+ IB_MANDATORY_FUNC(create_cq),
+ IB_MANDATORY_FUNC(destroy_cq),
+ IB_MANDATORY_FUNC(poll_cq),
+ IB_MANDATORY_FUNC(req_notify_cq),
+ IB_MANDATORY_FUNC(get_dma_mr),
+ IB_MANDATORY_FUNC(dereg_mr)
+ };
+ int i;
+
+ for (i = 0; i < MANDATORY_TABLE_DEPTH; ++i) {
+ if (!*(void **) ((void *) ((unsigned long)device + mandatory_table[i].offset))) {
+ log(LOG_WARNING, "Device %s is missing mandatory function %s\n",
+ device->name, mandatory_table[i].name);
+ return (EINVAL);
+ }
+ }
+
+ return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+ struct ib_device *device;
+
+ TAILQ_FOREACH(device, &device_list, core_list)
+ if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+ return device;
+
+ return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+ long *inuse;
+ char buf[IB_DEVICE_NAME_MAX];
+ struct ib_device *device;
+ int i;
+
+ inuse = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
+ if (!inuse)
+ return (ENOMEM);
+
+ TAILQ_FOREACH(device, &device_list, core_list) {
+ if (!sscanf(device->name, name, &i))
+ continue;
+ if (i < 0 || i >= PAGE_SIZE * 8)
+ continue;
+ snprintf(buf, sizeof buf, name, i);
+ if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+ setbit(inuse, i);
+ }
+
+ i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+ free(inuse, M_DEVBUF);
+ snprintf(buf, sizeof buf, name, i);
+
+ if (__ib_device_get_by_name(buf))
+ return (ENFILE);
+
+ strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+ return 0;
+}
+
+static int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device. @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+ void *dev;
+
+ if (size < sizeof (struct ib_device))
+		panic("size=%zu < sizeof(struct ib_device)=%zu",
+		    size, sizeof(struct ib_device));
+
+ dev = malloc(size, M_DEVBUF, M_NOWAIT);
+ if (dev)
+ bzero(dev, size);
+ return dev;
+}
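+
+/*
+ * Editor's sketch (illustrative, not part of this change): a low-level
+ * driver would typically embed struct ib_device at the start of its
+ * private softc and allocate both with a single call; "example_softc"
+ * is a hypothetical name, and the embedding-first-member layout is an
+ * assumption that makes the cast below valid.
+ *
+ *	struct example_softc {
+ *		struct ib_device ibdev;		(assumed first member)
+ *		int		 example_state;
+ *	};
+ *
+ *	struct example_softc *sc;
+ *
+ *	sc = (struct example_softc *)ib_alloc_device(sizeof(*sc));
+ *	if (!sc)
+ *		return (ENOMEM);
+ */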
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+ if (device->reg_state == IB_DEV_UNINITIALIZED) {
+ free(device, M_DEVBUF);
+ return;
+ }
+
+ if (device->reg_state != IB_DEV_UNREGISTERED)
+		panic("device->reg_state=%d != IB_DEV_UNREGISTERED",
+		    device->reg_state);
+#ifdef notyet
+ ib_device_unregister_sysfs(device);
+#endif
+}
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+
+ context = malloc(sizeof *context, M_DEVBUF, M_NOWAIT);
+ if (!context) {
+ log(LOG_WARNING, "Couldn't allocate client context for %s/%s\n",
+ device->name, client->name);
+ return (ENOMEM);
+ }
+
+ context->client = client;
+ context->data = NULL;
+
+ mtx_lock(&device->client_data_lock);
+ TAILQ_INSERT_TAIL(&device->client_data_list, context, list);
+ mtx_unlock(&device->client_data_lock);
+
+ return 0;
+}
+
+static int read_port_table_lengths(struct ib_device *device)
+{
+ struct ib_port_attr *tprops = NULL;
+ int num_ports, ret = ENOMEM;
+ u8 port_index;
+
+ tprops = malloc(sizeof *tprops, M_DEVBUF, M_NOWAIT);
+ if (!tprops)
+ goto out;
+
+ num_ports = end_port(device) - start_port(device) + 1;
+
+ device->pkey_tbl_len = malloc(sizeof *device->pkey_tbl_len * num_ports,
+ M_DEVBUF, M_NOWAIT);
+ device->gid_tbl_len = malloc(sizeof *device->gid_tbl_len * num_ports,
+ M_DEVBUF, M_NOWAIT);
+ if (!device->pkey_tbl_len || !device->gid_tbl_len)
+ goto err;
+
+ for (port_index = 0; port_index < num_ports; ++port_index) {
+ ret = ib_query_port(device, port_index + start_port(device),
+ tprops);
+ if (ret)
+ goto err;
+ device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+ device->gid_tbl_len[port_index] = tprops->gid_tbl_len;
+ }
+
+ ret = 0;
+ goto out;
+
+err:
+ free(device->gid_tbl_len, M_DEVBUF);
+ free(device->pkey_tbl_len, M_DEVBUF);
+out:
+ free(tprops, M_DEVBUF);
+ return ret;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core. All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device)
+{
+ int ret;
+
+ mtx_lock(&device_mutex);
+
+ if (strchr(device->name, '%')) {
+ ret = alloc_name(device->name);
+ if (ret)
+ goto out;
+ }
+
+ if (ib_device_check_mandatory(device)) {
+ ret = EINVAL;
+ goto out;
+ }
+
+ TAILQ_INIT(&device->event_handler_list);
+ TAILQ_INIT(&device->client_data_list);
+ mtx_init(&device->event_handler_lock, "ib event handler", NULL,
+ MTX_DUPOK|MTX_DEF);
+ mtx_init(&device->client_data_lock, "ib client data", NULL,
+ MTX_DUPOK|MTX_DEF);
+
+ ret = read_port_table_lengths(device);
+ if (ret) {
+ log(LOG_WARNING, "Couldn't create table lengths cache for device %s\n",
+ device->name);
+ goto out;
+ }
+
+#ifdef notyet
+ ret = ib_device_register_sysfs(device);
+ if (ret) {
+ log(LOG_WARNING, "Couldn't register device %s with driver model\n",
+ device->name);
+ free(device->gid_tbl_len, M_DEVBUF);
+ free(device->pkey_tbl_len, M_DEVBUF);
+ goto out;
+ }
+#endif
+
+ TAILQ_INSERT_TAIL(&device_list, device, core_list);
+
+ device->reg_state = IB_DEV_REGISTERED;
+
+ {
+ struct ib_client *client;
+
+ TAILQ_FOREACH(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+ }
+
+ out:
+ mtx_unlock(&device_mutex);
+ return ret;
+}
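+
+/*
+ * Editor's sketch (illustrative, not part of this change): a minimal
+ * registration flow for the hypothetical driver above.  A '%d' in the
+ * name is expanded to a unique unit number by alloc_name().
+ *
+ *	strlcpy(sc->ibdev.name, "example%d", IB_DEVICE_NAME_MAX);
+ *	sc->ibdev.node_type = RDMA_NODE_RNIC;
+ *	sc->ibdev.phys_port_cnt = 1;
+ *	(assign query_device, query_port and the other methods checked
+ *	 by ib_device_check_mandatory() before registering)
+ *
+ *	ret = ib_register_device(&sc->ibdev);
+ *	if (ret)
+ *		ib_dealloc_device(&sc->ibdev);
+ */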
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device. All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+ struct ib_client *client;
+ struct ib_client_data *context, *tmp;
+
+ mtx_lock(&device_mutex);
+
+ TAILQ_FOREACH_REVERSE(client, &client_list, client_list_s, list)
+ if (client->remove)
+ client->remove(device);
+
+ TAILQ_REMOVE(&device_list, device, core_list);
+
+ free(device->gid_tbl_len, M_DEVBUF);
+ free(device->pkey_tbl_len, M_DEVBUF);
+
+ mtx_unlock(&device_mutex);
+
+ mtx_lock(&device->client_data_lock);
+ TAILQ_FOREACH_SAFE(context, &device->client_data_list, list, tmp)
+ free(context, M_DEVBUF);
+ mtx_unlock(&device->client_data_lock);
+
+ device->reg_state = IB_DEV_UNREGISTERED;
+}
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+ struct ib_device *device;
+
+ mtx_lock(&device_mutex);
+
+ TAILQ_INSERT_TAIL(&client_list, client, list);
+ TAILQ_FOREACH(device, &device_list, core_list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+
+ mtx_unlock(&device_mutex);
+
+ return 0;
+}
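+
+/*
+ * Editor's sketch (illustrative, not part of this change): a client
+ * registers add/remove callbacks and keys its per-device state with
+ * ib_set_client_data()/ib_get_client_data().  The "example" names are
+ * hypothetical; cma_client in rdma_cma.c is an in-tree user of this
+ * pattern.
+ *
+ *	static void example_add(struct ib_device *device);
+ *	static void example_remove(struct ib_device *device);
+ *
+ *	static struct ib_client example_client = {
+ *		.name	= "example",
+ *		.add	= example_add,
+ *		.remove	= example_remove
+ *	};
+ *
+ *	error = ib_register_client(&example_client);
+ */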
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+ struct ib_client_data *context, *tmp;
+ struct ib_device *device;
+
+ mtx_lock(&device_mutex);
+
+ TAILQ_FOREACH(device, &device_list, core_list) {
+ if (client->remove)
+ client->remove(device);
+
+ mtx_lock(&device->client_data_lock);
+	TAILQ_FOREACH_SAFE(context, &device->client_data_list, list, tmp)
+ if (context->client == client) {
+ TAILQ_REMOVE(&device->client_data_list, context,
+ list);
+ free(context, M_DEVBUF);
+ }
+ mtx_unlock(&device->client_data_lock);
+ }
+ TAILQ_REMOVE(&client_list, client, list);
+
+ mtx_unlock(&device_mutex);
+}
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ void *ret = NULL;
+
+ mtx_lock(&device->client_data_lock);
+ TAILQ_FOREACH(context, &device->client_data_list, list)
+ if (context->client == client) {
+ ret = context->data;
+ break;
+ }
+ mtx_unlock(&device->client_data_lock);
+
+ return ret;
+}
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data)
+{
+ struct ib_client_data *context;
+
+ mtx_lock(&device->client_data_lock);
+ TAILQ_FOREACH(context, &device->client_data_list, list)
+ if (context->client == client) {
+ context->data = data;
+ goto out;
+ }
+
+ log(LOG_WARNING, "No client context found for %s/%s\n",
+ device->name, client->name);
+
+out:
+ mtx_unlock(&device->client_data_lock);
+}
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification). This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler (struct ib_event_handler *event_handler)
+{
+ mtx_lock(&event_handler->device->event_handler_lock);
+ TAILQ_INSERT_TAIL(&event_handler->device->event_handler_list,
+ event_handler, list);
+ mtx_unlock(&event_handler->device->event_handler_lock);
+
+ return 0;
+}
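+
+/*
+ * Editor's sketch (illustrative, not part of this change): registering
+ * a handler, assuming the INIT_IB_EVENT_HANDLER() initializer carried
+ * over from ib_verbs.h.  example_async_handler is hypothetical and, per
+ * the note above, must be safe to run in interrupt context.
+ *
+ *	static void example_async_handler(struct ib_event_handler *hdl,
+ *	    struct ib_event *event);
+ *
+ *	struct ib_event_handler handler;
+ *
+ *	INIT_IB_EVENT_HANDLER(&handler, device, example_async_handler);
+ *	ib_register_event_handler(&handler);
+ */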
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+ mtx_lock(&event_handler->device->event_handler_lock);
+ TAILQ_REMOVE(&event_handler->device->event_handler_list, event_handler,
+ list);
+ mtx_unlock(&event_handler->device->event_handler_lock);
+
+ return 0;
+}
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+ struct ib_event_handler *handler;
+
+ mtx_lock(&event->device->event_handler_lock);
+
+ TAILQ_FOREACH(handler, &event->device->event_handler_list, list)
+ handler->handler(handler, event);
+
+ mtx_unlock(&event->device->event_handler_lock);
+}
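+
+/*
+ * Editor's sketch (illustrative, not part of this change): how a driver
+ * might report a port coming up; field names follow struct ib_event in
+ * ib_verbs.h.
+ *
+ *	struct ib_event event;
+ *
+ *	event.device = &sc->ibdev;
+ *	event.element.port_num = port;
+ *	event.event = IB_EVENT_PORT_ACTIVE;
+ *	ib_dispatch_event(&event);
+ */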
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr)
+{
+ return device->query_device(device, device_attr);
+}
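+
+/*
+ * Editor's sketch (illustrative, not part of this change): the caller
+ * owns the attribute storage.
+ *
+ *	struct ib_device_attr attr;
+ *
+ *	if (ib_query_device(device, &attr) == 0)
+ *		log(LOG_DEBUG, "max_qp=%d max_cq=%d\n",
+ *		    attr.max_qp, attr.max_cq);
+ */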
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr)
+{
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return (EINVAL);
+
+ return device->query_port(device, port_num, port_attr);
+}
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid)
+{
+ return device->query_gid(device, port_num, index, gid);
+}
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ return device->query_pkey(device, port_num, index, pkey);
+}
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ return device->modify_device(device, device_modify_mask,
+ device_modify);
+}
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ * to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify)
+{
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return (EINVAL);
+
+ return device->modify_port(device, port_num, port_modify_mask,
+ port_modify);
+}
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index)
+{
+ union ib_gid tmp_gid;
+ int ret, port, i;
+
+ for (port = start_port(device); port <= end_port(device); ++port) {
+ for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+ ret = ib_query_gid(device, port, i, &tmp_gid);
+ if (ret)
+ return ret;
+ if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+ *port_num = port;
+ if (index)
+ *index = i;
+ return 0;
+ }
+ }
+ }
+
+ return (ENOENT);
+}
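+
+/*
+ * Editor's sketch (illustrative, not part of this change): mapping a
+ * GID back to the local port that owns it.
+ *
+ *	u8 port;
+ *
+ *	if (ib_find_gid(device, &gid, &port, NULL) == 0)
+ *		(the GID belongs to "port"; pass an index pointer to
+ *		 also learn the table slot)
+ */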
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index)
+{
+ int ret, i;
+ u16 tmp_pkey;
+
+ for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+ ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+ if (ret)
+ return ret;
+
+ if (pkey == tmp_pkey) {
+ *index = i;
+ return 0;
+ }
+ }
+
+ return (ENOENT);
+}
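+
+/*
+ * Editor's sketch (illustrative, not part of this change): locating the
+ * default full-membership P_Key (0xffff) on a port.
+ *
+ *	u16 pkey_index;
+ *
+ *	if (ib_find_pkey(device, port_num, 0xffff, &pkey_index) != 0)
+ *		(the port has no default P_Key)
+ */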
+
+static int rdma_core_init(void)
+{
+ int ret;
+#ifdef notyet
+ ret = ib_sysfs_setup();
+ if (ret)
+ log(LOG_WARNING, "Couldn't create InfiniBand device class\n");
+#endif
+
+ mtx_init(&device_mutex, "rdma_device mutex", NULL, MTX_DEF);
+ TAILQ_INIT(&client_list);
+ TAILQ_INIT(&device_list);
+ ret = ib_cache_setup();
+ if (ret) {
+ log(LOG_WARNING, "Couldn't set up InfiniBand P_Key/GID cache\n");
+#ifdef notyet
+ ib_sysfs_cleanup();
+#endif
+ }
+
+ return ret;
+}
+
+static void rdma_core_cleanup(void)
+{
+ ib_cache_cleanup();
+#ifdef notyet
+ ib_sysfs_cleanup();
+ /* Make sure that any pending umem accounting work is done. */
+ flush_scheduled_work();
+#endif
+}
+
+static int
+rdma_core_load(module_t mod, int cmd, void *arg)
+{
+ int err = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ printf("Loading rdma_core.\n");
+ rdma_core_init();
+ break;
+ case MOD_QUIESCE:
+ break;
+ case MOD_UNLOAD:
+ printf("Unloading rdma_core.\n");
+ rdma_core_cleanup();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+
+ return (err);
+}
+
+static moduledata_t mod_data = {
+ "rdma_core",
+ rdma_core_load,
+ 0
+};
+
+MODULE_VERSION(rdma_core, 1);
+DECLARE_MODULE(rdma_core, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/contrib/rdma/rdma_iwcm.c b/sys/contrib/rdma/rdma_iwcm.c
new file mode 100644
index 000000000000..916abcd2dfd9
--- /dev/null
+++ b/sys/contrib/rdma/rdma_iwcm.c
@@ -0,0 +1,1086 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/priv.h>
+#include <sys/syslog.h>
+#include <sys/malloc.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+#include <contrib/rdma/iw_cm.h>
+
+enum iw_cm_state {
+ IW_CM_STATE_IDLE, /* unbound, inactive */
+ IW_CM_STATE_LISTEN, /* listen waiting for connect */
+ IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */
+ IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */
+ IW_CM_STATE_ESTABLISHED, /* established */
+ IW_CM_STATE_CLOSING, /* disconnect */
+ IW_CM_STATE_DESTROYING /* object being deleted */
+};
+
+struct iwcm_id_private {
+ struct iw_cm_id id;
+ enum iw_cm_state state;
+ unsigned long flags;
+ struct ib_qp *qp;
+ void * destroy_comp;
+ void * connect_wait;
+ TAILQ_HEAD(, iwcm_work) work_list;
+ struct mtx lock;
+ volatile int refcount;
+ TAILQ_HEAD(, iwcm_work) work_free_list;
+};
+
+#define IWCM_F_CALLBACK_DESTROY 1
+#define IWCM_F_CONNECT_WAIT 2
+
+static struct taskqueue *iwcm_wq;
+struct iwcm_work {
+ struct task task;
+ struct iwcm_id_private *cm_id;
+ TAILQ_ENTRY(iwcm_work) list;
+ struct iw_cm_event event;
+ TAILQ_ENTRY(iwcm_work) free_list;
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements. The design pre-allocates them based on the cm_id type:
+ * LISTENING IDS: Get enough elements preallocated to handle the
+ * listen backlog.
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If
+ * the backlog is exceeded, then no more connection request events will
+ *    be processed.  cm_event_handler() returns ENOMEM in this case.  It's
+ *    up to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ * If work elements cannot be allocated for the new connect request cm_id,
+ * then IWCM will call the provider reject method. This is ok since
+ * cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+ struct iwcm_work *work;
+
+ if (TAILQ_EMPTY(&cm_id_priv->work_free_list))
+ return NULL;
+ work = TAILQ_FIRST(&cm_id_priv->work_free_list);
+ TAILQ_REMOVE(&cm_id_priv->work_free_list, work, free_list);
+ return work;
+}
+
+static void put_work(struct iwcm_work *work)
+{
+ TAILQ_INSERT_HEAD(&work->cm_id->work_free_list, work, free_list);
+}
+
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+ struct iwcm_work *e, *tmp;
+
+ TAILQ_FOREACH_SAFE(e, &cm_id_priv->work_free_list, free_list, tmp)
+ free(e, M_DEVBUF);
+}
+
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+ struct iwcm_work *work;
+
+ PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_free_list));
+ while (count--) {
+ work = malloc(sizeof(struct iwcm_work), M_DEVBUF, M_NOWAIT);
+ if (!work) {
+ dealloc_work_entries(cm_id_priv);
+ return (ENOMEM);
+ }
+ work->cm_id = cm_id_priv;
+ put_work(work);
+ }
+ return 0;
+}
+
+/*
+ * Save private data from incoming connection requests to
+ * iw_cm_event, so the low level driver doesn't have to. Adjust
+ * the event ptr to point to the local copy.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+ void *p;
+
+ p = malloc(event->private_data_len, M_DEVBUF, M_NOWAIT);
+ if (!p)
+ return (ENOMEM);
+ bcopy(event->private_data, p, event->private_data_len);
+ event->private_data = p;
+ return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+ dealloc_work_entries(cm_id_priv);
+ free(cm_id_priv, M_DEVBUF);
+}
+
+/*
+ * Release a reference on cm_id.  When the last reference is dropped,
+ * wake any thread blocked in iw_destroy_cm_id and return 1 so the
+ * caller knows the cm_id is now eligible for destruction.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+ mtx_lock(&cm_id_priv->lock);
+ PANIC_IF(atomic_load_acq_int(&cm_id_priv->refcount)==0);
+ if (atomic_fetchadd_int(&cm_id_priv->refcount, -1) == 1) {
+ PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_list));
+ wakeup(&cm_id_priv->destroy_comp);
+ mtx_unlock(&cm_id_priv->lock);
+ return 1;
+ }
+ mtx_unlock(&cm_id_priv->lock);
+
+ return 0;
+}
+
+static void add_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ mtx_lock(&cm_id_priv->lock);
+ atomic_add_int(&cm_id_priv->refcount, 1);
+ mtx_unlock(&cm_id_priv->lock);
+}
+
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ if (iwcm_deref_id(cm_id_priv) &&
+ isset(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY)) {
+ PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ }
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ struct socket *so,
+ iw_cm_handler cm_handler,
+ void *context)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ KASSERT(so, ("iw_create_cm_id called with NULL socket!"));
+ cm_id_priv = malloc(sizeof(*cm_id_priv), M_DEVBUF, M_NOWAIT);
+ if (!cm_id_priv)
+ return ERR_PTR(ENOMEM);
+ bzero(cm_id_priv, sizeof *cm_id_priv);
+
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.event_handler = cm_event_handler;
+ cm_id_priv->id.add_ref = add_ref;
+ cm_id_priv->id.rem_ref = rem_ref;
+ cm_id_priv->id.so = so;
+ mtx_init(&cm_id_priv->lock, "cm_id_priv", NULL, MTX_DUPOK|MTX_DEF);
+ atomic_store_rel_int(&cm_id_priv->refcount, 1);
+ TAILQ_INIT(&cm_id_priv->work_list);
+ TAILQ_INIT(&cm_id_priv->work_free_list);
+
+ return &cm_id_priv->id;
+}
+
+
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ if (!qp)
+ return (EINVAL);
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * This is really the RDMAC CLOSING state. It is most similar to the
+ * IB SQD QP state.
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ PANIC_IF(qp == NULL);
+ qp_attr.qp_state = IB_QPS_SQD;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ * based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ * disconnecting concurrently with us and we've already seen the
+ * DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret = 0;
+ struct ib_qp *qp = NULL;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /* Wait if we're currently in a connect or accept downcall */
+ mtx_lock(&cm_id_priv->lock);
+ if (isset(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT))
+ msleep(&cm_id_priv->connect_wait, &cm_id_priv->lock, 0, "iwcm connect1", 0);
+
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+		/* QP could be NULL for a user-mode client */
+ if (cm_id_priv->qp)
+ qp = cm_id_priv->qp;
+ else
+ ret = EINVAL;
+ break;
+ case IW_CM_STATE_LISTEN:
+ ret = EINVAL;
+ break;
+ case IW_CM_STATE_CLOSING:
+ /* remote peer closed first */
+ case IW_CM_STATE_IDLE:
+ /* accept or connect returned !0 */
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called disconnect before/without calling accept after
+ * connect_request event delivered.
+ */
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ /* Can only get here if wait above fails */
+ default:
+		panic("iw_cm_disconnect: unexpected state %d",
+		    cm_id_priv->state);
+ }
+ mtx_unlock(&cm_id_priv->lock);
+
+ if (qp) {
+ if (abrupt)
+ ret = iwcm_modify_qp_err(qp);
+ else
+ ret = iwcm_modify_qp_sqd(qp);
+
+ /*
+ * If both sides are disconnecting the QP could
+ * already be in ERR or SQD states
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /*
+ * Wait if we're currently in a connect or accept downcall. A
+ * listening endpoint should never block here.
+ */
+ mtx_lock(&cm_id_priv->lock);
+ if (isset(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT))
+ msleep(&cm_id_priv->connect_wait, &cm_id_priv->lock, 0, "iwcm connect2", 0);
+
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_LISTEN:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ mtx_unlock(&cm_id_priv->lock);
+ /* destroy the listening endpoint */
+ ret = cm_id->device->iwcm->destroy_listen(cm_id);
+ mtx_lock(&cm_id_priv->lock);
+ break;
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ mtx_unlock(&cm_id_priv->lock);
+ /* Abrupt close of the connection */
+ (void)iwcm_modify_qp_err(cm_id_priv->qp);
+ mtx_lock(&cm_id_priv->lock);
+ break;
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called destroy before/without calling accept after
+ * receiving connection request event notification or
+ * returned non zero from the event callback function.
+ * In either case, must tell the provider to reject.
+ */
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_DESTROYING:
+ default:
+		panic("destroy_cm_id: unexpected state %d",
+		    cm_id_priv->state);
+ break;
+ }
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ mtx_unlock(&cm_id_priv->lock);
+
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * This function is only called by the application thread and cannot
+ * be called by the event thread. The function will wait for all
+ * references to be released on the cm_id and then free the cm_id
+ * object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ PANIC_IF(isset(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY));
+
+ destroy_cm_id(cm_id);
+
+ mtx_lock(&cm_id_priv->lock);
+ if (atomic_load_acq_int(&cm_id_priv->refcount))
+ msleep(&cm_id_priv->destroy_comp, &cm_id_priv->lock, 0, "iwcm destroy", 0);
+ mtx_unlock(&cm_id_priv->lock);
+
+ free_cm_id(cm_id_priv);
+}
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ ret = alloc_work_entries(cm_id_priv, backlog);
+ if (ret)
+ return ret;
+
+ mtx_lock(&cm_id_priv->lock);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ cm_id_priv->state = IW_CM_STATE_LISTEN;
+ mtx_unlock(&cm_id_priv->lock);
+ ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+ if (ret)
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ mtx_lock(&cm_id_priv->lock);
+ break;
+ default:
+ ret = EINVAL;
+ }
+ mtx_unlock(&cm_id_priv->lock);
+
+ return ret;
+}
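+
+/*
+ * Editor's sketch (illustrative, not part of this change): a passive
+ * consumer creates a cm_id over its listening socket, then listens with
+ * a backlog that sizes the pre-allocated work pool described above.
+ * example_cm_handler, "so" and "ctx" are hypothetical.
+ *
+ *	cm_id = iw_create_cm_id(device, so, example_cm_handler, ctx);
+ *	if (IS_ERR(cm_id))
+ *		return (ENOMEM);
+ *	ret = iw_cm_listen(cm_id, 16);
+ */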
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ setbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+
+ mtx_lock(&cm_id_priv->lock);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+ return (EINVAL);
+ }
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ mtx_unlock(&cm_id_priv->lock);
+
+ ret = cm_id->device->iwcm->reject(cm_id, private_data,
+ private_data_len);
+
+ mtx_lock(&cm_id_priv->lock);
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ struct ib_qp *qp;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ setbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+
+ mtx_lock(&cm_id_priv->lock);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+
+ return (EINVAL);
+ }
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ mtx_unlock(&cm_id_priv->lock);
+ return (EINVAL);
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ mtx_unlock(&cm_id_priv->lock);
+
+ ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+ if (ret) {
+ /* An error on accept precludes provider events */
+ PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ mtx_lock(&cm_id_priv->lock);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+ }
+
+ return ret;
+}
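+
+/*
+ * Editor's sketch (illustrative, not part of this change): accepting
+ * from the cm_handler after a CONNECT_REQUEST event.  The field names
+ * follow struct iw_cm_conn_param in iw_cm.h; the values are
+ * hypothetical.
+ *
+ *	struct iw_cm_conn_param param;
+ *
+ *	param.qpn = qp->qp_num;
+ *	param.ord = 1;
+ *	param.ird = 1;
+ *	param.private_data = NULL;
+ *	param.private_data_len = 0;
+ *	ret = iw_cm_accept(cm_id, &param);
+ */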
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event.  iw_cm_disconnect and iw_destroy_cm_id will block until the
+ * CONNECT_REPLY event is received from the provider.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+ struct ib_qp *qp;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ ret = alloc_work_entries(cm_id_priv, 4);
+ if (ret)
+ return ret;
+
+ setbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ mtx_lock(&cm_id_priv->lock);
+
+ if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+
+ return (EINVAL);
+ }
+
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ mtx_unlock(&cm_id_priv->lock);
+ return (EINVAL);
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+ mtx_unlock(&cm_id_priv->lock);
+
+ ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+ if (ret) {
+ mtx_lock(&cm_id_priv->lock);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+
+ }
+
+ return ret;
+}
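+
+/*
+ * Editor's sketch (illustrative, not part of this change): the active
+ * side fills the same iw_cm_conn_param and then waits for the
+ * CONNECT_REPLY event in its cm_handler.
+ *
+ *	ret = iw_cm_connect(cm_id, &param);
+ *	if (ret)
+ *		(no CONNECT_REPLY will be delivered; the cm_id is
+ *		 back in the IDLE state)
+ */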
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device.  These are copied when the new cm_id is created.  The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ struct iw_cm_id *cm_id;
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ /*
+ * The provider should never generate a connection request
+ * event with a bad status.
+ */
+ PANIC_IF(iw_event->status);
+
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ mtx_lock(&listen_id_priv->lock);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ mtx_unlock(&listen_id_priv->lock);
+ goto out;
+ }
+ mtx_unlock(&listen_id_priv->lock);
+
+ cm_id = iw_create_cm_id(listen_id_priv->id.device,
+ iw_event->so,
+ listen_id_priv->id.cm_handler,
+ listen_id_priv->id.context);
+ /* If the cm_id could not be created, ignore the request */
+ if (IS_ERR(cm_id))
+ goto out;
+
+ cm_id->provider_data = iw_event->provider_data;
+ cm_id->local_addr = iw_event->local_addr;
+ cm_id->remote_addr = iw_event->remote_addr;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+ ret = alloc_work_entries(cm_id_priv, 3);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+
+ /* Call the client CM handler */
+ ret = cm_id->cm_handler(cm_id, iw_event);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ setbit(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY);
+
+ destroy_cm_id(cm_id);
+ if (atomic_load_acq_int(&cm_id_priv->refcount)==0)
+ free_cm_id(cm_id_priv);
+ }
+
+out:
+ if (iw_event->private_data_len)
+ free(iw_event->private_data, M_DEVBUF);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotiation has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret;
+
+ mtx_lock(&cm_id_priv->lock);
+
+ /*
+ * We clear the CONNECT_WAIT bit here to allow the callback
+ * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+ * from a callback handler is not allowed.
+ */
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+
+ return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post its requests to the server.  This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_cm_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret;
+
+ mtx_lock(&cm_id_priv->lock);
+ /*
+ * Clear the connect wait bit so a callback function calling
+ * iw_cm_disconnect will not wait and deadlock this thread
+ */
+ clrbit(&cm_id_priv->flags, IWCM_F_CONNECT_WAIT);
+ PANIC_IF(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) {
+ cm_id_priv->id.local_addr = iw_event->local_addr;
+ cm_id_priv->id.remote_addr = iw_event->remote_addr;
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ } else {
+ /* REJECTED or RESET */
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ }
+ mtx_unlock(&cm_id_priv->lock);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+ mtx_lock(&cm_id_priv->lock);
+ if (iw_event->private_data_len)
+ free(iw_event->private_data, M_DEVBUF);
+
+ /* Wake up waiters on connect complete */
+ wakeup(&cm_id_priv->connect_wait);
+ mtx_unlock(&cm_id_priv->lock);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+
+ mtx_lock(&cm_id_priv->lock);
+ if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+ mtx_unlock(&cm_id_priv->lock);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTABLISHED or CLOSING states, the QP will have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret = 0;
+ mtx_lock(&cm_id_priv->lock);
+
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ mtx_unlock(&cm_id_priv->lock);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ mtx_lock(&cm_id_priv->lock);
+ break;
+ case IW_CM_STATE_DESTROYING:
+ break;
+ default:
+		panic("cm_close_handler: unexpected state %d",
+		    cm_id_priv->state);
+ }
+ mtx_unlock(&cm_id_priv->lock);
+
+ return ret;
+}
+
+static int process_event(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret = 0;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CONNECT_REQUEST:
+ cm_conn_req_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ ret = cm_conn_est_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_DISCONNECT:
+ cm_disconnect_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CLOSE:
+ ret = cm_close_handler(cm_id_priv, iw_event);
+ break;
+ default:
+		panic("process_event: unexpected event %d",
+		    iw_event->event);
+ }
+
+ return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(void *context, int pending)
+{
+ struct iwcm_work *work = context;
+ struct iw_cm_event levent;
+ struct iwcm_id_private *cm_id_priv = work->cm_id;
+ int empty;
+ int ret = 0;
+
+ mtx_lock(&cm_id_priv->lock);
+ empty = TAILQ_EMPTY(&cm_id_priv->work_list);
+ while (!empty) {
+ work = TAILQ_FIRST(&cm_id_priv->work_list);
+ TAILQ_REMOVE(&cm_id_priv->work_list, work, list);
+ empty = TAILQ_EMPTY(&cm_id_priv->work_list);
+ levent = work->event;
+ put_work(work);
+ mtx_unlock(&cm_id_priv->lock);
+
+ ret = process_event(cm_id_priv, &levent);
+ if (ret) {
+ setbit(&cm_id_priv->flags, IWCM_F_CALLBACK_DESTROY);
+ destroy_cm_id(&cm_id_priv->id);
+ }
+ PANIC_IF(atomic_load_acq_int(&cm_id_priv->refcount)==0);
+ if (iwcm_deref_id(cm_id_priv)) {
+ if (isset(&cm_id_priv->flags,
+ IWCM_F_CALLBACK_DESTROY)) {
+ PANIC_IF(!TAILQ_EMPTY(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ }
+ return;
+ }
+ mtx_lock(&cm_id_priv->lock);
+ }
+ mtx_unlock(&cm_id_priv->lock);
+}
+
+/*
+ * This function is called in interrupt context.  Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block. Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * Returns:
+ * 0 - the event was handled.
+ * ENOMEM - the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct iwcm_work *work;
+ struct iwcm_id_private *cm_id_priv;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ mtx_lock(&cm_id_priv->lock);
+ work = get_work(cm_id_priv);
+ if (!work) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ TASK_INIT(&work->task, 0, cm_work_handler, work);
+ work->cm_id = cm_id_priv;
+ work->event = *iw_event;
+
+ if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+ work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+ work->event.private_data_len) {
+ ret = copy_private_data(&work->event);
+ if (ret) {
+ put_work(work);
+ goto out;
+ }
+ }
+
+ atomic_add_acq_int(&cm_id_priv->refcount, 1);
+ if (TAILQ_EMPTY(&cm_id_priv->work_list)) {
+ TAILQ_INSERT_TAIL(&cm_id_priv->work_list, work, list);
+ taskqueue_enqueue(iwcm_wq, &work->task);
+ } else
+ TAILQ_INSERT_TAIL(&cm_id_priv->work_list, work, list);
+out:
+ mtx_unlock(&cm_id_priv->lock);
+ return ret;
+}
+
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ int ret;
+
+ mtx_lock(&cm_id_priv->lock);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE|
+ IB_ACCESS_REMOTE_READ;
+ ret = 0;
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+ mtx_unlock(&cm_id_priv->lock);
+ return ret;
+}
+
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ int ret;
+
+ mtx_lock(&cm_id_priv->lock);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = 0;
+ ret = 0;
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+ mtx_unlock(&cm_id_priv->lock);
+ return ret;
+}
+
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ case IB_QPS_RTR:
+ ret = iwcm_init_qp_init_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = iwcm_init_qp_rts_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+ return ret;
+}
+
+static int iw_cm_init(void)
+{
+ iwcm_wq = taskqueue_create("iw_cm_wq", M_NOWAIT, taskqueue_thread_enqueue, &iwcm_wq);
+ if (!iwcm_wq)
+ return (ENOMEM);
+
+ taskqueue_start_threads(&iwcm_wq, 1, PI_NET, "iw_cm_wq thread");
+ return 0;
+}
+
+static void iw_cm_cleanup(void)
+{
+ taskqueue_free(iwcm_wq);
+}
+
+static int
+iw_cm_load(module_t mod, int cmd, void *arg)
+{
+ int err = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ printf("Loading rdma_iwcm.\n");
+
+ iw_cm_init();
+ break;
+ case MOD_QUIESCE:
+ break;
+ case MOD_UNLOAD:
+ printf("Unloading rdma_iwcm.\n");
+ iw_cm_cleanup();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+
+ return (err);
+}
+
+static moduledata_t mod_data = {
+ "rdma_iwcm",
+ iw_cm_load,
+ 0
+};
+
+MODULE_VERSION(rdma_iwcm, 1);
+MODULE_DEPEND(rdma_iwcm, rdma_core, 1, 1, 1);
+DECLARE_MODULE(rdma_iwcm, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/contrib/rdma/rdma_user_cm.h b/sys/contrib/rdma/rdma_user_cm.h
new file mode 100644
index 000000000000..0ffa4e5b3c24
--- /dev/null
+++ b/sys/contrib/rdma/rdma_user_cm.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef RDMA_USER_CM_H
+#define RDMA_USER_CM_H
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+#define RDMA_USER_CM_ABI_VERSION 4
+
+#define RDMA_MAX_PRIVATE_DATA 256
+
+enum {
+ RDMA_USER_CM_CMD_CREATE_ID,
+ RDMA_USER_CM_CMD_DESTROY_ID,
+ RDMA_USER_CM_CMD_BIND_ADDR,
+ RDMA_USER_CM_CMD_RESOLVE_ADDR,
+ RDMA_USER_CM_CMD_RESOLVE_ROUTE,
+ RDMA_USER_CM_CMD_QUERY_ROUTE,
+ RDMA_USER_CM_CMD_CONNECT,
+ RDMA_USER_CM_CMD_LISTEN,
+ RDMA_USER_CM_CMD_ACCEPT,
+ RDMA_USER_CM_CMD_REJECT,
+ RDMA_USER_CM_CMD_DISCONNECT,
+ RDMA_USER_CM_CMD_INIT_QP_ATTR,
+ RDMA_USER_CM_CMD_GET_EVENT,
+ RDMA_USER_CM_CMD_GET_OPTION,
+ RDMA_USER_CM_CMD_SET_OPTION,
+ RDMA_USER_CM_CMD_NOTIFY,
+ RDMA_USER_CM_CMD_JOIN_MCAST,
+ RDMA_USER_CM_CMD_LEAVE_MCAST
+};
+
+/*
+ * command ABI structures.
+ */
+struct rdma_ucm_cmd_hdr {
+ __u32 cmd;
+ __u16 in;
+ __u16 out;
+};
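+
+/*
+ * Editor's note (illustrative, not part of this change): each request
+ * written to the rdma_ucm device is a rdma_ucm_cmd_hdr immediately
+ * followed by the command-specific structure; "in" and "out" carry the
+ * payload and expected response sizes.  A create-id request, for
+ * example, would be laid out as:
+ *
+ *	struct {
+ *		struct rdma_ucm_cmd_hdr hdr;
+ *		struct rdma_ucm_create_id cmd;
+ *	} msg;
+ *
+ *	msg.hdr.cmd = RDMA_USER_CM_CMD_CREATE_ID;
+ *	msg.hdr.in  = sizeof msg.cmd;
+ *	msg.hdr.out = sizeof(struct rdma_ucm_create_id_resp);
+ */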
+
+struct rdma_ucm_create_id {
+ __u64 uid;
+ __u64 response;
+ __u16 ps;
+ __u8 reserved[6];
+};
+
+struct rdma_ucm_create_id_resp {
+ __u32 id;
+};
+
+struct rdma_ucm_destroy_id {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_destroy_id_resp {
+ __u32 events_reported;
+};
+
+struct rdma_ucm_bind_addr {
+ __u64 response;
+ struct sockaddr_in6 addr;
+ __u32 id;
+};
+
+struct rdma_ucm_resolve_addr {
+ struct sockaddr_in6 src_addr;
+ struct sockaddr_in6 dst_addr;
+ __u32 id;
+ __u32 timeout_ms;
+};
+
+struct rdma_ucm_resolve_route {
+ __u32 id;
+ __u32 timeout_ms;
+};
+
+struct rdma_ucm_query_route {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_query_route_resp {
+ __u64 node_guid;
+ struct ib_user_path_rec ib_route[2];
+ struct sockaddr_in6 src_addr;
+ struct sockaddr_in6 dst_addr;
+ __u32 num_paths;
+ __u8 port_num;
+ __u8 reserved[3];
+};
+
+struct rdma_ucm_conn_param {
+ __u32 qp_num;
+ __u32 reserved;
+ __u8 private_data[RDMA_MAX_PRIVATE_DATA];
+ __u8 private_data_len;
+ __u8 srq;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 flow_control;
+ __u8 retry_count;
+ __u8 rnr_retry_count;
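+	/* Nonzero when the structure carries valid connection parameters. */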
+ __u8 valid;
+};
+
+struct rdma_ucm_ud_param {
+ __u32 qp_num;
+ __u32 qkey;
+ struct ib_uverbs_ah_attr ah_attr;
+ __u8 private_data[RDMA_MAX_PRIVATE_DATA];
+ __u8 private_data_len;
+ __u8 reserved[7];
+};
+
+struct rdma_ucm_connect {
+ struct rdma_ucm_conn_param conn_param;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_listen {
+ __u32 id;
+ __u32 backlog;
+};
+
+struct rdma_ucm_accept {
+ __u64 uid;
+ struct rdma_ucm_conn_param conn_param;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_reject {
+ __u32 id;
+ __u8 private_data_len;
+ __u8 reserved[3];
+ __u8 private_data[RDMA_MAX_PRIVATE_DATA];
+};
+
+struct rdma_ucm_disconnect {
+ __u32 id;
+};
+
+struct rdma_ucm_init_qp_attr {
+ __u64 response;
+ __u32 id;
+ __u32 qp_state;
+};
+
+struct rdma_ucm_notify {
+ __u32 id;
+ __u32 event;
+};
+
+struct rdma_ucm_join_mcast {
+ __u64 response; /* rdma_ucm_create_id_resp */
+ __u64 uid;
+ struct sockaddr_in6 addr;
+ __u32 id;
+};
+
+struct rdma_ucm_get_event {
+ __u64 response;
+};
+
+struct rdma_ucm_event_resp {
+ __u64 uid;
+ __u32 id;
+ __u32 event;
+ __u32 status;
+ union {
+ struct rdma_ucm_conn_param conn;
+ struct rdma_ucm_ud_param ud;
+ } param;
+};
+
+#endif /* RDMA_USER_CM_H */
diff --git a/sys/contrib/rdma/rdma_verbs.c b/sys/contrib/rdma/rdma_verbs.c
new file mode 100644
index 000000000000..93821074b3c1
--- /dev/null
+++ b/sys/contrib/rdma/rdma_verbs.c
@@ -0,0 +1,822 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: verbs.c 1349 2004-12-16 21:09:43Z roland $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/module.h>
+#include <sys/endian.h>
+
+#include <contrib/rdma/ib_verbs.h>
+#include <contrib/rdma/ib_cache.h>
+
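+/*
+ * Convert the IB static rate enumeration to a multiple of the base
+ * rate of 2.5 Gb/s, or -1 if the rate is invalid.
+ */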
+int ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 1;
+ case IB_RATE_5_GBPS: return 2;
+ case IB_RATE_10_GBPS: return 4;
+ case IB_RATE_20_GBPS: return 8;
+ case IB_RATE_30_GBPS: return 12;
+ case IB_RATE_40_GBPS: return 16;
+ case IB_RATE_60_GBPS: return 24;
+ case IB_RATE_80_GBPS: return 32;
+ case IB_RATE_120_GBPS: return 48;
+ default: return -1;
+ }
+}
+
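+/*
+ * Convert a multiple of 2.5 Gb/s back to the corresponding IB rate
+ * enumeration, or IB_RATE_PORT_CURRENT if there is no exact match.
+ */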
+enum ib_rate mult_to_ib_rate(int mult)
+{
+ switch (mult) {
+ case 1: return IB_RATE_2_5_GBPS;
+ case 2: return IB_RATE_5_GBPS;
+ case 4: return IB_RATE_10_GBPS;
+ case 8: return IB_RATE_20_GBPS;
+ case 12: return IB_RATE_30_GBPS;
+ case 16: return IB_RATE_40_GBPS;
+ case 24: return IB_RATE_60_GBPS;
+ case 32: return IB_RATE_80_GBPS;
+ case 48: return IB_RATE_120_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+
+enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+ switch (node_type) {
+ case RDMA_NODE_IB_CA:
+ case RDMA_NODE_IB_SWITCH:
+ case RDMA_NODE_IB_ROUTER:
+ return RDMA_TRANSPORT_IB;
+ case RDMA_NODE_RNIC:
+ return RDMA_TRANSPORT_IWARP;
+ default:
+		panic("%s: unknown node type %d", __func__, node_type);
+ return 0;
+ }
+}
+
+/* Protection domains */
+
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+ struct ib_pd *pd;
+
+ pd = device->alloc_pd(device, NULL, NULL);
+
+ if (!IS_ERR(pd)) {
+ pd->device = device;
+ pd->uobject = NULL;
+ atomic_store_rel_int(&pd->usecnt, 0);
+ }
+
+ return pd;
+}
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+ if (atomic_load_acq_int(&pd->usecnt))
+ return (EBUSY);
+
+ return pd->device->dealloc_pd(pd);
+}
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct ib_ah *ah;
+
+ ah = pd->device->create_ah(pd, ah_attr);
+
+ if (!IS_ERR(ah)) {
+ ah->device = pd->device;
+ ah->pd = pd;
+ ah->uobject = NULL;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ }
+
+ return ah;
+}
+
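+/*
+ * Initialize address handle attributes from a received work completion
+ * so that the caller can respond to the sender; the path (and the GRH,
+ * if one is present) is reversed.
+ */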
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+ u32 flow_class;
+ u16 gid_index;
+ int ret;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = wc->slid;
+ ah_attr->sl = wc->sl;
+ ah_attr->src_path_bits = wc->dlid_path_bits;
+ ah_attr->port_num = port_num;
+
+ if (wc->wc_flags & IB_WC_GRH) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = grh->sgid;
+
+ ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
+ &gid_index);
+ if (ret)
+ return ret;
+
+ ah_attr->grh.sgid_index = (u8) gid_index;
+ flow_class = be32toh(grh->version_tclass_flow);
+ ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+ ah_attr->grh.hop_limit = 0xFF;
+ ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+ }
+ return 0;
+}
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+ struct ib_grh *grh, u8 port_num)
+{
+ struct ib_ah_attr ah_attr;
+ int ret;
+
+ ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return ib_create_ah(pd, &ah_attr);
+}
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->modify_ah ?
+ ah->device->modify_ah(ah, ah_attr) :
+ ENOSYS;
+}
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->query_ah ?
+ ah->device->query_ah(ah, ah_attr) :
+ ENOSYS;
+}
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = ah->pd;
+ ret = ah->device->destroy_ah(ah);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr)
+{
+ struct ib_srq *srq;
+
+ if (!pd->device->create_srq)
+		return ERR_PTR(-ENOSYS);
+
+ srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+ if (!IS_ERR(srq)) {
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = NULL;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ atomic_store_rel_int(&srq->usecnt, 0);
+ }
+
+ return srq;
+}
+
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask)
+{
+ return srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL);
+}
+
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr)
+{
+ return srq->device->query_srq ?
+ srq->device->query_srq(srq, srq_attr) : ENOSYS;
+}
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_load_acq_int(&srq->usecnt))
+ return (EBUSY);
+
+ pd = srq->pd;
+
+ ret = srq->device->destroy_srq(srq);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* Queue pairs */
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *qp;
+
+ qp = pd->device->create_qp(pd, qp_init_attr, NULL);
+
+ if (!IS_ERR(qp)) {
+ qp->device = pd->device;
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->recv_cq = qp_init_attr->recv_cq;
+ qp->srq = qp_init_attr->srq;
+ qp->uobject = NULL;
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ qp->qp_type = qp_init_attr->qp_type;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ atomic_add_acq_int(&qp_init_attr->send_cq->usecnt, 1);
+ atomic_add_acq_int(&qp_init_attr->recv_cq->usecnt, 1);
+ if (qp_init_attr->srq)
+ atomic_add_acq_int(&qp_init_attr->srq->usecnt, 1);
+ }
+
+ return qp;
+}
+
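+/*
+ * Table of legal QP state transitions, indexed by
+ * [current state][next state].  Each valid entry lists, per QP type,
+ * the attribute mask bits that are required and those that are
+ * optional for the transition.
+ */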
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETY + 1];
+ enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETY + 1];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask)
+{
+ enum ib_qp_attr_mask req_param, opt_param;
+
+ if (cur_state < 0 || cur_state > IB_QPS_ERR ||
+ next_state < 0 || next_state > IB_QPS_ERR)
+ return 0;
+
+ if (mask & IB_QP_CUR_STATE &&
+ cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+ return 0;
+
+ if (!qp_state_table[cur_state][next_state].valid)
+ return 0;
+
+ req_param = qp_state_table[cur_state][next_state].req_param[type];
+ opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+ if ((mask & req_param) != req_param)
+ return 0;
+
+ if (mask & ~(req_param | opt_param | IB_QP_STATE))
+ return 0;
+
+ return 1;
+}
+
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL);
+}
+
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ return qp->device->query_qp ?
+ qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) :
+ ENOSYS;
+}
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+ struct ib_pd *pd;
+ struct ib_cq *scq, *rcq;
+ struct ib_srq *srq;
+ int ret;
+
+ pd = qp->pd;
+ scq = qp->send_cq;
+ rcq = qp->recv_cq;
+ srq = qp->srq;
+
+ ret = qp->device->destroy_qp(qp);
+ if (!ret) {
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+ atomic_subtract_acq_int(&scq->usecnt, 1);
+ atomic_subtract_acq_int(&rcq->usecnt, 1);
+ if (srq)
+ atomic_subtract_acq_int(&srq->usecnt, 1);
+ }
+
+ return ret;
+}
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe, int comp_vector)
+{
+ struct ib_cq *cq;
+
+ cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+
+ if (!IS_ERR(cq)) {
+ cq->device = device;
+ cq->uobject = NULL;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ atomic_store_rel_int(&cq->usecnt, 0);
+ }
+
+ return cq;
+}
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+ if (atomic_load_acq_int(&cq->usecnt))
+ return (EBUSY);
+
+ return cq->device->destroy_cq(cq);
+}
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+ return cq->device->resize_cq ?
+ cq->device->resize_cq(cq, cqe, NULL) : ENOSYS;
+}
+
+/* Memory regions */
+
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+ struct ib_mr *mr;
+
+ mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ atomic_store_rel_int(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_mr *mr;
+
+ mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ atomic_store_rel_int(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_pd *old_pd;
+ int ret;
+
+ if (!mr->device->rereg_phys_mr)
+ return (ENOSYS);
+
+ if (atomic_load_acq_int(&mr->usecnt))
+ return (EBUSY);
+
+ old_pd = mr->pd;
+
+ ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+ phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+ atomic_subtract_acq_int(&old_pd->usecnt, 1);
+ atomic_add_acq_int(&pd->usecnt, 1);
+ }
+
+ return ret;
+}
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+ return mr->device->query_mr ?
+ mr->device->query_mr(mr, mr_attr) : ENOSYS;
+}
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_load_acq_int(&mr->usecnt))
+ return (EBUSY);
+
+ pd = mr->pd;
+ ret = mr->device->dereg_mr(mr);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+{
+ struct ib_mw *mw;
+
+ if (!pd->device->alloc_mw)
+		return ERR_PTR(-ENOSYS);
+
+ mw = pd->device->alloc_mw(pd);
+ if (!IS_ERR(mw)) {
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = NULL;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ }
+
+ return mw;
+}
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = mw->pd;
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *fmr;
+
+ if (!pd->device->alloc_fmr)
+		return ERR_PTR(-ENOSYS);
+
+ fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ if (!IS_ERR(fmr)) {
+ fmr->device = pd->device;
+ fmr->pd = pd;
+ atomic_add_acq_int(&pd->usecnt, 1);
+ }
+
+ return fmr;
+}
+
+int ib_unmap_fmr(struct ib_fmr_list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+
+ if (TAILQ_EMPTY(fmr_list))
+ return 0;
+
+ fmr = TAILQ_FIRST(fmr_list);
+ return fmr->device->unmap_fmr(fmr_list);
+}
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = fmr->pd;
+ ret = fmr->device->dealloc_fmr(fmr);
+ if (!ret)
+ atomic_subtract_acq_int(&pd->usecnt, 1);
+
+ return ret;
+}
+
+/* Multicast groups */
+
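+/*
+ * Multicast attach/detach is defined only for UD QPs, and the GID must
+ * be a multicast GID (first byte 0xff).
+ */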
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ if (!qp->device->attach_mcast)
+ return (ENOSYS);
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return (EINVAL);
+
+ return qp->device->attach_mcast(qp, gid, lid);
+}
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ if (!qp->device->detach_mcast)
+ return (ENOSYS);
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return (EINVAL);
+
+ return qp->device->detach_mcast(qp, gid, lid);
+}
diff --git a/sys/contrib/rdma/types.h b/sys/contrib/rdma/types.h
new file mode 100644
index 000000000000..33a1a62bd905
--- /dev/null
+++ b/sys/contrib/rdma/types.h
@@ -0,0 +1,121 @@
+/*
+ * $FreeBSD$
+ */
+#ifndef __RDMA_TYPES_H_
+#define __RDMA_TYPES_H_
+#include <sys/types.h>
+#include <sys/malloc.h>
+
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef uint8_t __u8;
+typedef uint16_t __u16;
+typedef uint32_t __u32;
+typedef uint64_t __u64;
+typedef uint8_t __be8;
+typedef uint16_t __be16;
+typedef uint32_t __be32;
+typedef uint64_t __be64;
+
+typedef int32_t __s32;
+
+
+#define LINUX_TYPES_DEFINED
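+/*
+ * Linux-style error-pointer helpers.  IS_ERR() only recognizes
+ * pointers that encode small negative values (the top 1000 addresses),
+ * so ERR_PTR() must be given a negated errno.
+ */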
+#define ERR_PTR(err) ((void *)((long)(err)))
+#define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000))
+#define PTR_ERR(ptr) ((long)(ptr))
+
+#define PANIC_IF(exp) do { \
+ if (exp) \
+ panic("BUG func %s line %u: %s", __FUNCTION__, __LINE__, #exp); \
+} while (0)
+
+#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field)))
+
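+/*
+ * Return the index of the first clear bit in a bitmap of `max' bits,
+ * or a value >= max if every bit is set.  The bitmap is scanned one
+ * 32-bit word at a time and is assumed to be padded to a multiple of
+ * 32 bits.
+ */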
+static __inline int
+find_first_zero_bit(volatile void *p, int max)
+{
+ int b;
+ volatile int *ptr = (volatile int *)p;
+
+ for (b = 0; b < max; b += 32) {
+ if (ptr[b >> 5] != ~0) {
+ for (;;) {
+ if ((ptr[b >> 5] & (1 << (b & 0x1f))) == 0)
+ return (b);
+ b++;
+ }
+ }
+ }
+
+ return (max);
+}
+
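+/*
+ * Minimal singly-linked key/value list, standing in for the Linux idr
+ * used by the upstream RDMA code.
+ */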
+struct kvl {
+ struct kvl *next;
+ unsigned int key;
+ void *value;
+};
+
+#define DEFINE_KVL(x) struct kvl x;
+
+static __inline void *
+kvl_lookup(struct kvl *x, uint32_t key)
+{
+	struct kvl *i;
+
+	for (i = x->next; i; i = i->next)
+		if (i->key == key)
+			return (i->value);
+	return (NULL);
+}
+
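+/*
+ * Despite its name, this does not search upward from starting_id for a
+ * free key the way the Linux idr does: it fails with -EEXIST when
+ * starting_id is already in use.
+ */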
+static __inline int
+kvl_alloc_above(struct kvl *idp, void *ptr, int starting_id, int *id)
+{
+	int newid = starting_id;
+	struct kvl *i;
+
+	for (i = idp->next; i; i = i->next)
+		if (i->key == newid)
+			return (-EEXIST);
+
+	i = malloc(sizeof(struct kvl), M_TEMP, M_NOWAIT);
+	if (i == NULL)
+		return (-ENOMEM);
+	i->key = newid;
+	i->value = ptr;
+	i->next = idp->next;
+	idp->next = i;
+	*id = newid;
+	return (0);
+}
+
+static __inline void
+kvl_delete(struct kvl *idp, int id)
+{
+	struct kvl *i, *prev = NULL;
+
+	/* Frees the list node only; the stored value stays with the caller. */
+	for (i = idp->next; i; prev = i, i = i->next)
+		if (i->key == id) {
+			if (prev == NULL)
+				idp->next = i->next;
+			else
+				prev->next = i->next;
+			free(i, M_TEMP);
+			return;
+		}
+}
+
+static __inline void
+kvl_free(struct kvl *idp)
+{
+	struct kvl *i, *tmp;
+
+	for (i = idp->next; i; i = tmp) {
+		tmp = i->next;
+		free(i, M_TEMP);
+	}
+	idp->next = NULL;
+}
+
+#endif /* __RDMA_TYPES_H_ */