author     Hans Petter Selasky <hselasky@FreeBSD.org>  2017-06-15 12:47:48 +0000
committer  Hans Petter Selasky <hselasky@FreeBSD.org>  2017-06-15 12:47:48 +0000
commit     478d3005721019319c11a37980f8464ac22f529a (patch)
tree       69c08fb159933baaa2776b9721046188e6202c51
parent     d4a7f90c0ac990e7ae2d862c388a2ea053097f0b (diff)
-rw-r--r--  sys/contrib/rdma/krping/krping.c | 2157
-rw-r--r--  sys/contrib/rdma/krping/krping.h | 3
-rw-r--r--  sys/contrib/rdma/krping/krping_dev.c | 3
-rw-r--r--  sys/modules/ibcore/Makefile | 45
-rw-r--r--  sys/modules/ipoib/Makefile | 1
-rw-r--r--  sys/modules/rdma/krping/Makefile | 1
-rw-r--r--  sys/ofed/drivers/infiniband/core/addr.c | 686
-rw-r--r--  sys/ofed/drivers/infiniband/core/agent.c | 45
-rw-r--r--  sys/ofed/drivers/infiniband/core/agent.h | 6
-rw-r--r--  sys/ofed/drivers/infiniband/core/cache.c | 467
-rw-r--r--  sys/ofed/drivers/infiniband/core/cm.c | 653
-rw-r--r--  sys/ofed/drivers/infiniband/core/cm_msgs.h | 4
-rw-r--r--  sys/ofed/drivers/infiniband/core/cma.c | 2859
-rw-r--r--  sys/ofed/drivers/infiniband/core/core_priv.h | 106
-rw-r--r--  sys/ofed/drivers/infiniband/core/device.c | 516
-rw-r--r--  sys/ofed/drivers/infiniband/core/fmr_pool.c | 58
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_addr.c | 760
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_cache.c | 1253
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_cq.c | 154
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c | 43
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_iwpm_util.c | 97
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c | 415
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_smi.c | 338
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_sysfs.c | 1327
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_umem.c | 353
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_umem_odp.c | 667
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c | 93
-rw-r--r--  sys/ofed/drivers/infiniband/core/ib_verbs.c | 2066
-rw-r--r--  sys/ofed/drivers/infiniband/core/iwcm.c | 381
-rw-r--r--  sys/ofed/drivers/infiniband/core/iwcm.h | 2
-rw-r--r--  sys/ofed/drivers/infiniband/core/iwpm_util.h | 75
-rw-r--r--  sys/ofed/drivers/infiniband/core/mad.c | 1809
-rw-r--r--  sys/ofed/drivers/infiniband/core/mad_priv.h | 55
-rw-r--r--  sys/ofed/drivers/infiniband/core/mad_rmpp.c | 33
-rw-r--r--  sys/ofed/drivers/infiniband/core/multicast.c | 118
-rw-r--r--  sys/ofed/drivers/infiniband/core/opa_smi.h | 78
-rw-r--r--  sys/ofed/drivers/infiniband/core/packer.c | 41
-rw-r--r--  sys/ofed/drivers/infiniband/core/peer_mem.c | 461
-rw-r--r--  sys/ofed/drivers/infiniband/core/sa_query.c | 490
-rw-r--r--  sys/ofed/drivers/infiniband/core/smi.c | 253
-rw-r--r--  sys/ofed/drivers/infiniband/core/smi.h | 4
-rw-r--r--  sys/ofed/drivers/infiniband/core/sysfs.c | 1026
-rw-r--r--  sys/ofed/drivers/infiniband/core/ucm.c | 83
-rw-r--r--  sys/ofed/drivers/infiniband/core/ucma.c | 610
-rw-r--r--  sys/ofed/drivers/infiniband/core/ud_header.c | 217
-rw-r--r--  sys/ofed/drivers/infiniband/core/umem.c | 445
-rw-r--r--  sys/ofed/drivers/infiniband/core/user_mad.c | 511
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs.h | 80
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs_cmd.c | 2873
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs_main.c | 1079
-rw-r--r--  sys/ofed/drivers/infiniband/core/uverbs_marshall.c | 6
-rw-r--r--  sys/ofed/drivers/infiniband/core/verbs.c | 1538
-rw-r--r--  sys/ofed/drivers/infiniband/debug/Makefile | 3
-rw-r--r--  sys/ofed/drivers/infiniband/debug/memtrack.c | 960
-rw-r--r--  sys/ofed/drivers/infiniband/debug/memtrack.h | 106
-rw-r--r--  sys/ofed/drivers/infiniband/debug/mtrack.h | 844
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig | 50
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h | 4
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c | 27
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c | 68
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 4
-rw-r--r--  sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 33
-rw-r--r--  sys/ofed/include/rdma/ib.h | 120
-rw-r--r--  sys/ofed/include/rdma/ib_addr.h | 116
-rw-r--r--  sys/ofed/include/rdma/ib_cache.h | 40
-rw-r--r--  sys/ofed/include/rdma/ib_cm.h | 26
-rw-r--r--  sys/ofed/include/rdma/ib_hdrs.h | 178
-rw-r--r--  sys/ofed/include/rdma/ib_mad.h | 209
-rw-r--r--  sys/ofed/include/rdma/ib_pack.h | 66
-rw-r--r--  sys/ofed/include/rdma/ib_peer_mem.h | 59
-rw-r--r--  sys/ofed/include/rdma/ib_pma.h | 20
-rw-r--r--  sys/ofed/include/rdma/ib_sa.h | 69
-rw-r--r--  sys/ofed/include/rdma/ib_smi.h | 50
-rw-r--r--  sys/ofed/include/rdma/ib_umem.h | 85
-rw-r--r--  sys/ofed/include/rdma/ib_umem_odp.h | 161
-rw-r--r--  sys/ofed/include/rdma/ib_user_mad.h | 47
-rw-r--r--  sys/ofed/include/rdma/ib_user_verbs.h | 333
-rw-r--r--  sys/ofed/include/rdma/ib_user_verbs_exp.h | 204
-rw-r--r--  sys/ofed/include/rdma/ib_verbs.h | 2007
-rw-r--r--  sys/ofed/include/rdma/ib_verbs_exp.h | 100
-rw-r--r--  sys/ofed/include/rdma/iw_cm.h | 24
-rw-r--r--  sys/ofed/include/rdma/iw_portmap.h | 70
-rw-r--r--  sys/ofed/include/rdma/opa_port_info.h | 417
-rw-r--r--  sys/ofed/include/rdma/opa_smi.h | 153
-rw-r--r--  sys/ofed/include/rdma/peer_mem.h | 73
-rw-r--r--  sys/ofed/include/rdma/rdma_cm.h | 60
-rw-r--r--  sys/ofed/include/rdma/rdma_user_cm.h | 87
-rw-r--r--  sys/ofed/include/rdma/rdma_vt.h | 500
-rw-r--r--  sys/ofed/include/rdma/rdmavt_cq.h | 99
-rw-r--r--  sys/ofed/include/rdma/rdmavt_mr.h | 140
-rw-r--r--  sys/ofed/include/rdma/rdmavt_qp.h | 535
-rw-r--r--  sys/ofed/include/rdma/sdp_socket.h | 23
-rw-r--r--  sys/ofed/include/uapi/rdma/mlx4-abi.h | 107
-rw-r--r--  sys/ofed/include/uapi/rdma/mlx5-abi.h | 249
94 files changed, 20180 insertions, 15790 deletions
diff --git a/sys/contrib/rdma/krping/krping.c b/sys/contrib/rdma/krping/krping.c
index 8a89f80e3ea3f..d270a1e7e8756 100644
--- a/sys/contrib/rdma/krping/krping.c
+++ b/sys/contrib/rdma/krping/krping.c
@@ -53,13 +53,14 @@ __FBSDID("$FreeBSD$");
#include "krping.h"
#include "getopt.h"
+#define PFX "krping: "
+
extern int krping_debug;
-#define DEBUG_LOG(cb, x...) if (krping_debug) log(LOG_INFO, x)
-#define PRINTF(cb, x...) log(LOG_INFO, x)
+#define DEBUG_LOG(...) do { if (krping_debug) log(LOG_INFO, __VA_ARGS__); } while (0)
#define BIND_INFO 1
MODULE_AUTHOR("Steve Wise");
-MODULE_DESCRIPTION("RDMA ping client/server");
+MODULE_DESCRIPTION("RDMA ping server");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(krping, 1);
MODULE_DEPEND(krping, linuxkpi, 1, 1, 1);
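
The hunk above collapses the old two-macro scheme (DEBUG_LOG(cb, ...) for debug output, PRINTF(cb, ...) for errors) into a single variadic DEBUG_LOG plus plain printk(KERN_ERR PFX ...) calls. A minimal usage sketch, assuming only the macros shown in the hunk; krping_log_example() is a hypothetical caller added here to illustrate why the do { ... } while (0) wrapper matters in unbraced if/else bodies:

	/* Hypothetical caller, not part of the patch. */
	static void krping_log_example(struct krping_cb *cb, int ret)
	{
		/*
		 * Without the do { } while (0) wrapper, the unbraced if/else
		 * below would bind the "else" to the macro's internal
		 * "if (krping_debug)" instead of the outer "if".
		 */
		if (ret == 0)
			DEBUG_LOG("send posted on cb %p\n", cb);   /* only when krping_debug is set */
		else
			printk(KERN_ERR PFX "post send error %d\n", ret);   /* always logged */
	}
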
@@ -76,21 +77,19 @@ typedef uint64_t cycles_t;
enum mem_type {
DMA = 1,
- FASTREG = 2,
- MW = 3,
- MR = 4
+ REG = 2,
};
static const struct krping_option krping_opts[] = {
{"count", OPT_INT, 'C'},
{"size", OPT_INT, 'S'},
{"addr", OPT_STRING, 'a'},
+ {"addr6", OPT_STRING, 'A'},
{"port", OPT_INT, 'p'},
{"verbose", OPT_NOPARAM, 'v'},
{"validate", OPT_NOPARAM, 'V'},
{"server", OPT_NOPARAM, 's'},
{"client", OPT_NOPARAM, 'c'},
- {"mem_mode", OPT_STRING, 'm'},
{"server_inv", OPT_NOPARAM, 'I'},
{"wlat", OPT_NOPARAM, 'l'},
{"rlat", OPT_NOPARAM, 'L'},
@@ -100,14 +99,14 @@ static const struct krping_option krping_opts[] = {
{"poll", OPT_NOPARAM, 'P'},
{"local_dma_lkey", OPT_NOPARAM, 'Z'},
{"read_inv", OPT_NOPARAM, 'R'},
- {"fr", OPT_INT, 'f'},
+ {"fr", OPT_NOPARAM, 'f'},
{NULL, 0, 0}
};
#define htonll(x) cpu_to_be64((x))
#define ntohll(x) cpu_to_be64((x))
-static struct mutex krping_mutex;
+static DEFINE_MUTEX(krping_mutex);
/*
* List of running krping threads.
@@ -115,6 +114,13 @@ static struct mutex krping_mutex;
static LIST_HEAD(krping_cbs);
/*
+ * Invoke like this, one on each side, using the server's address on
+ * the RDMA device (iw%d):
+ *
+ * /bin/echo server,port=9999,addr=192.168.69.142,validate > /proc/krping
+ * /bin/echo client,port=9999,addr=192.168.69.142,validate > /proc/krping
+ * /bin/echo client,port=9999,addr6=2001:db8:0:f101::1,validate > /proc/krping
+ *
* krping "ping/pong" loop:
* client sends source rkey/addr/len
* server receives source rkey/add/len
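
The rkey/addr/len advertisement described in this comment travels in struct krping_rdma_info. The structure definition sits outside the hunks shown here, so the layout below is an assumption reconstructed from the buf/rkey/size accesses and the htonll()/htonl() conversions elsewhere in the patch; the *_sketch names are illustrative only:

	struct krping_rdma_info_sketch {
		uint64_t buf;	/* address of the RDMA buffer being advertised */
		uint32_t rkey;	/* rkey granting the peer remote access to it */
		uint32_t size;	/* length of the buffer in bytes */
	};

	/* Filled in host-to-network order before the SEND, cf. krping_format_send(). */
	static void krping_advertise_sketch(struct krping_rdma_info_sketch *info,
	    uint64_t buf, uint32_t rkey, uint32_t size)
	{
		info->buf  = htonll(buf);
		info->rkey = htonl(rkey);
		info->size = htonl(size);
	}
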
@@ -163,42 +169,35 @@ struct krping_rdma_info {
* Control block struct.
*/
struct krping_cb {
- void *cookie;
int server; /* 0 iff client */
struct ib_cq *cq;
struct ib_pd *pd;
struct ib_qp *qp;
- enum mem_type mem;
struct ib_mr *dma_mr;
struct ib_fast_reg_page_list *page_list;
int page_list_len;
- struct ib_send_wr fastreg_wr;
+ struct ib_reg_wr reg_mr_wr;
struct ib_send_wr invalidate_wr;
- struct ib_mr *fastreg_mr;
+ struct ib_mr *reg_mr;
int server_invalidate;
int read_inv;
u8 key;
- struct ib_mw *mw;
- struct ib_mw_bind bind_attr;
-
struct ib_recv_wr rq_wr; /* recv work request record */
struct ib_sge recv_sgl; /* recv single SGE */
- struct krping_rdma_info recv_buf;/* malloc'd buffer */
+ struct krping_rdma_info recv_buf __aligned(16); /* malloc'd buffer */
u64 recv_dma_addr;
DECLARE_PCI_UNMAP_ADDR(recv_mapping)
- struct ib_mr *recv_mr;
struct ib_send_wr sq_wr; /* send work requrest record */
struct ib_sge send_sgl;
- struct krping_rdma_info send_buf;/* single send buf */
+ struct krping_rdma_info send_buf __aligned(16); /* single send buf */
u64 send_dma_addr;
DECLARE_PCI_UNMAP_ADDR(send_mapping)
- struct ib_mr *send_mr;
- struct ib_send_wr rdma_sq_wr; /* rdma work request record */
+ struct ib_rdma_wr rdma_sq_wr; /* rdma work request record */
struct ib_sge rdma_sgl; /* rdma single SGE */
char *rdma_buf; /* used as rdma sink */
u64 rdma_dma_addr;
@@ -219,8 +218,9 @@ struct krping_cb {
struct krping_stats stats;
uint16_t port; /* dst port in NBO */
- struct in_addr addr; /* dst addr in NBO */
+ u8 addr[16]; /* dst addr in NBO */
char *addr_str; /* dst addr string */
+ uint8_t addr_type; /* ADDR_FAMILY - IPv4/V6 */
int verbose; /* verbose logging */
int count; /* ping count */
int size; /* ping data size */
@@ -232,8 +232,7 @@ struct krping_cb {
int poll; /* poll or block for rlat test */
int txdepth; /* SQ depth */
int local_dma_lkey; /* use 0 for lkey */
- int frtest; /* fastreg test */
- int testnum;
+ int frtest; /* reg test */
/* CM stuff */
struct rdma_cm_id *cm_id; /* connection on client side,*/
@@ -248,39 +247,34 @@ static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
int ret;
struct krping_cb *cb = cma_id->context;
- DEBUG_LOG(cb, "cma_event type %d cma_id %p (%s)\n", event->event,
- cma_id, (cma_id == cb->cm_id) ? "parent" : "child");
+ DEBUG_LOG("cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
+ (cma_id == cb->cm_id) ? "parent" : "child");
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
cb->state = ADDR_RESOLVED;
ret = rdma_resolve_route(cma_id, 2000);
if (ret) {
- PRINTF(cb, "rdma_resolve_route error %d\n", ret);
+ printk(KERN_ERR PFX "rdma_resolve_route error %d\n",
+ ret);
wake_up_interruptible(&cb->sem);
}
break;
case RDMA_CM_EVENT_ROUTE_RESOLVED:
cb->state = ROUTE_RESOLVED;
- cb->child_cm_id = cma_id;
wake_up_interruptible(&cb->sem);
break;
case RDMA_CM_EVENT_CONNECT_REQUEST:
- if (cb->state == IDLE) {
- cb->state = CONNECT_REQUEST;
- cb->child_cm_id = cma_id;
- } else {
- PRINTF(cb, "Received connection request in wrong state"
- " (%d)\n", cb->state);
- }
- DEBUG_LOG(cb, "child cma %p\n", cb->child_cm_id);
+ cb->state = CONNECT_REQUEST;
+ cb->child_cm_id = cma_id;
+ DEBUG_LOG("child cma %p\n", cb->child_cm_id);
wake_up_interruptible(&cb->sem);
break;
case RDMA_CM_EVENT_ESTABLISHED:
- DEBUG_LOG(cb, "ESTABLISHED\n");
+ DEBUG_LOG("ESTABLISHED\n");
if (!cb->server) {
cb->state = CONNECTED;
}
@@ -292,24 +286,24 @@ static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
- PRINTF(cb, "cma event %d, error %d\n", event->event,
+ printk(KERN_ERR PFX "cma event %d, error %d\n", event->event,
event->status);
cb->state = ERROR;
wake_up_interruptible(&cb->sem);
break;
case RDMA_CM_EVENT_DISCONNECTED:
- PRINTF(cb, "DISCONNECT EVENT...\n");
+ printk(KERN_ERR PFX "DISCONNECT EVENT...\n");
cb->state = ERROR;
wake_up_interruptible(&cb->sem);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
- PRINTF(cb, "cma detected device removal!!!!\n");
+ printk(KERN_ERR PFX "cma detected device removal!!!!\n");
break;
default:
- PRINTF(cb, "oof bad type!\n");
+ printk(KERN_ERR PFX "oof bad type!\n");
wake_up_interruptible(&cb->sem);
break;
}
@@ -319,7 +313,7 @@ static int krping_cma_event_handler(struct rdma_cm_id *cma_id,
static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
{
if (wc->byte_len != sizeof(cb->recv_buf)) {
- PRINTF(cb, "Received bogus data, size %d\n",
+ printk(KERN_ERR PFX "Received bogus data, size %d\n",
wc->byte_len);
return -1;
}
@@ -327,7 +321,7 @@ static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
cb->remote_rkey = ntohl(cb->recv_buf.rkey);
cb->remote_addr = ntohll(cb->recv_buf.buf);
cb->remote_len = ntohl(cb->recv_buf.size);
- DEBUG_LOG(cb, "Received rkey %x addr %llx len %d from peer\n",
+ DEBUG_LOG("Received rkey %x addr %llx len %d from peer\n",
cb->remote_rkey, (unsigned long long)cb->remote_addr,
cb->remote_len);
@@ -342,7 +336,7 @@ static int server_recv(struct krping_cb *cb, struct ib_wc *wc)
static int client_recv(struct krping_cb *cb, struct ib_wc *wc)
{
if (wc->byte_len != sizeof(cb->recv_buf)) {
- PRINTF(cb, "Received bogus data, size %d\n",
+ printk(KERN_ERR PFX "Received bogus data, size %d\n",
wc->byte_len);
return -1;
}
@@ -364,18 +358,22 @@ static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
BUG_ON(cb->cq != cq);
if (cb->state == ERROR) {
- PRINTF(cb, "cq completion in ERROR state\n");
+ printk(KERN_ERR PFX "cq completion in ERROR state\n");
+ return;
+ }
+ if (cb->frtest) {
+ printk(KERN_ERR PFX "cq completion event in frtest!\n");
return;
}
- if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest)
+ if (!cb->wlat && !cb->rlat && !cb->bw)
ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
if (wc.status) {
if (wc.status == IB_WC_WR_FLUSH_ERR) {
- DEBUG_LOG(cb, "cq flushed\n");
+ DEBUG_LOG("cq flushed\n");
continue;
} else {
- PRINTF(cb, "cq completion failed with "
+ printk(KERN_ERR PFX "cq completion failed with "
"wr_id %jx status %d opcode %d vender_err %x\n",
(uintmax_t)wc.wr_id, wc.status, wc.opcode, wc.vendor_err);
goto error;
@@ -384,44 +382,44 @@ static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
switch (wc.opcode) {
case IB_WC_SEND:
- DEBUG_LOG(cb, "send completion\n");
+ DEBUG_LOG("send completion\n");
cb->stats.send_bytes += cb->send_sgl.length;
cb->stats.send_msgs++;
break;
case IB_WC_RDMA_WRITE:
- DEBUG_LOG(cb, "rdma write completion\n");
- cb->stats.write_bytes += cb->rdma_sq_wr.sg_list->length;
+ DEBUG_LOG("rdma write completion\n");
+ cb->stats.write_bytes += cb->rdma_sq_wr.wr.sg_list->length;
cb->stats.write_msgs++;
cb->state = RDMA_WRITE_COMPLETE;
wake_up_interruptible(&cb->sem);
break;
case IB_WC_RDMA_READ:
- DEBUG_LOG(cb, "rdma read completion\n");
- cb->stats.read_bytes += cb->rdma_sq_wr.sg_list->length;
+ DEBUG_LOG("rdma read completion\n");
+ cb->stats.read_bytes += cb->rdma_sq_wr.wr.sg_list->length;
cb->stats.read_msgs++;
cb->state = RDMA_READ_COMPLETE;
wake_up_interruptible(&cb->sem);
break;
case IB_WC_RECV:
- DEBUG_LOG(cb, "recv completion\n");
+ DEBUG_LOG("recv completion\n");
cb->stats.recv_bytes += sizeof(cb->recv_buf);
cb->stats.recv_msgs++;
- if (cb->wlat || cb->rlat || cb->bw || cb->frtest)
+ if (cb->wlat || cb->rlat || cb->bw)
ret = server_recv(cb, &wc);
else
ret = cb->server ? server_recv(cb, &wc) :
client_recv(cb, &wc);
if (ret) {
- PRINTF(cb, "recv wc error: %d\n", ret);
+ printk(KERN_ERR PFX "recv wc error: %d\n", ret);
goto error;
}
ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post recv error: %d\n",
+ printk(KERN_ERR PFX "post recv error: %d\n",
ret);
goto error;
}
@@ -429,14 +427,14 @@ static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
break;
default:
- PRINTF(cb,
+ printk(KERN_ERR PFX
"%s:%d Unexpected opcode %d, Shutting down\n",
__func__, __LINE__, wc.opcode);
goto error;
}
}
if (ret) {
- PRINTF(cb, "poll error %d\n", ret);
+ printk(KERN_ERR PFX "poll error %d\n", ret);
goto error;
}
return;
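
The completion handler in the hunk above follows the usual arm-then-drain pattern: re-arm CQ notification, then call ib_poll_cq() until it returns 0, treating flush errors as benign during teardown. A reduced sketch of that control flow, where handle_wc() is a hypothetical dispatcher standing in for the opcode switch:

	static void handle_wc(struct krping_cb *cb, const struct ib_wc *wc);	/* hypothetical */

	static void cq_handler_sketch(struct ib_cq *cq, void *ctx)
	{
		struct krping_cb *cb = ctx;
		struct ib_wc wc;
		int ret;

		/* Re-arm first so a completion arriving while we drain re-triggers us. */
		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);

		while ((ret = ib_poll_cq(cq, 1, &wc)) == 1) {
			if (wc.status != IB_WC_SUCCESS) {
				if (wc.status == IB_WC_WR_FLUSH_ERR)
					continue;	/* QP flushed on disconnect */
				printk(KERN_ERR PFX "wc status %d\n", wc.status);
				return;
			}
			handle_wc(cb, &wc);	/* dispatch on wc.opcode */
		}
		if (ret < 0)
			printk(KERN_ERR PFX "poll error %d\n", ret);
	}
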
@@ -450,7 +448,7 @@ static int krping_accept(struct krping_cb *cb)
struct rdma_conn_param conn_param;
int ret;
- DEBUG_LOG(cb, "accepting client connection request\n");
+ DEBUG_LOG("accepting client connection request\n");
memset(&conn_param, 0, sizeof conn_param);
conn_param.responder_resources = 1;
@@ -458,14 +456,14 @@ static int krping_accept(struct krping_cb *cb)
ret = rdma_accept(cb->child_cm_id, &conn_param);
if (ret) {
- PRINTF(cb, "rdma_accept error: %d\n", ret);
+ printk(KERN_ERR PFX "rdma_accept error: %d\n", ret);
return ret;
}
- if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
+ if (!cb->wlat && !cb->rlat && !cb->bw) {
wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
if (cb->state == ERROR) {
- PRINTF(cb, "wait for CONNECTED state %d\n",
+ printk(KERN_ERR PFX "wait for CONNECTED state %d\n",
cb->state);
return -1;
}
@@ -477,278 +475,120 @@ static void krping_setup_wr(struct krping_cb *cb)
{
cb->recv_sgl.addr = cb->recv_dma_addr;
cb->recv_sgl.length = sizeof cb->recv_buf;
- if (cb->local_dma_lkey)
- cb->recv_sgl.lkey = cb->qp->device->local_dma_lkey;
- else if (cb->mem == DMA)
- cb->recv_sgl.lkey = cb->dma_mr->lkey;
- else
- cb->recv_sgl.lkey = cb->recv_mr->lkey;
+ cb->recv_sgl.lkey = cb->pd->local_dma_lkey;
cb->rq_wr.sg_list = &cb->recv_sgl;
cb->rq_wr.num_sge = 1;
cb->send_sgl.addr = cb->send_dma_addr;
cb->send_sgl.length = sizeof cb->send_buf;
- if (cb->local_dma_lkey)
- cb->send_sgl.lkey = cb->qp->device->local_dma_lkey;
- else if (cb->mem == DMA)
- cb->send_sgl.lkey = cb->dma_mr->lkey;
- else
- cb->send_sgl.lkey = cb->send_mr->lkey;
+ cb->send_sgl.lkey = cb->pd->local_dma_lkey;
cb->sq_wr.opcode = IB_WR_SEND;
cb->sq_wr.send_flags = IB_SEND_SIGNALED;
cb->sq_wr.sg_list = &cb->send_sgl;
cb->sq_wr.num_sge = 1;
- if (cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
+ if (cb->server || cb->wlat || cb->rlat || cb->bw) {
cb->rdma_sgl.addr = cb->rdma_dma_addr;
- if (cb->mem == MR)
- cb->rdma_sgl.lkey = cb->rdma_mr->lkey;
- cb->rdma_sq_wr.send_flags = IB_SEND_SIGNALED;
- cb->rdma_sq_wr.sg_list = &cb->rdma_sgl;
- cb->rdma_sq_wr.num_sge = 1;
- }
-
- switch(cb->mem) {
- case FASTREG:
-
- /*
- * A chain of 2 WRs, INVALDATE_MR + FAST_REG_MR.
- * both unsignaled. The client uses them to reregister
- * the rdma buffers with a new key each iteration.
- */
- cb->fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- cb->fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- cb->fastreg_wr.wr.fast_reg.length = cb->size;
- cb->fastreg_wr.wr.fast_reg.page_list = cb->page_list;
- cb->fastreg_wr.wr.fast_reg.page_list_len = cb->page_list_len;
-
- cb->invalidate_wr.next = &cb->fastreg_wr;
- cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
- break;
- case MW:
- cb->bind_attr.wr_id = 0xabbaabba;
- cb->bind_attr.send_flags = 0; /* unsignaled */
-#ifdef BIND_INFO
- cb->bind_attr.bind_info.length = cb->size;
-#else
- cb->bind_attr.length = cb->size;
-#endif
- break;
- default:
- break;
+ cb->rdma_sq_wr.wr.send_flags = IB_SEND_SIGNALED;
+ cb->rdma_sq_wr.wr.sg_list = &cb->rdma_sgl;
+ cb->rdma_sq_wr.wr.num_sge = 1;
}
+
+ /*
+ * A chain of 2 WRs, INVALDATE_MR + REG_MR.
+ * both unsignaled. The client uses them to reregister
+ * the rdma buffers with a new key each iteration.
+ */
+ cb->reg_mr_wr.wr.opcode = IB_WR_REG_MR;
+ cb->reg_mr_wr.mr = cb->reg_mr;
+
+ cb->invalidate_wr.next = &cb->reg_mr_wr.wr;
+ cb->invalidate_wr.opcode = IB_WR_LOCAL_INV;
}
static int krping_setup_buffers(struct krping_cb *cb)
{
int ret;
- struct ib_phys_buf buf;
- u64 iovbase;
- DEBUG_LOG(cb, "krping_setup_buffers called on cb %p\n", cb);
+ DEBUG_LOG(PFX "krping_setup_buffers called on cb %p\n", cb);
- cb->recv_dma_addr = ib_dma_map_single(cb->pd->device,
+ cb->recv_dma_addr = ib_dma_map_single(cb->pd->device,
&cb->recv_buf,
sizeof(cb->recv_buf), DMA_BIDIRECTIONAL);
pci_unmap_addr_set(cb, recv_mapping, cb->recv_dma_addr);
- cb->send_dma_addr = ib_dma_map_single(cb->pd->device,
+ cb->send_dma_addr = ib_dma_map_single(cb->pd->device,
&cb->send_buf, sizeof(cb->send_buf),
DMA_BIDIRECTIONAL);
pci_unmap_addr_set(cb, send_mapping, cb->send_dma_addr);
- if (cb->mem == DMA) {
- cb->dma_mr = ib_get_dma_mr(cb->pd, IB_ACCESS_LOCAL_WRITE|
- IB_ACCESS_REMOTE_READ|
- IB_ACCESS_REMOTE_WRITE);
- if (IS_ERR(cb->dma_mr)) {
- DEBUG_LOG(cb, "reg_dmamr failed\n");
- ret = PTR_ERR(cb->dma_mr);
- goto bail;
- }
- } else {
- if (!cb->local_dma_lkey) {
- buf.addr = cb->recv_dma_addr;
- buf.size = sizeof cb->recv_buf;
- DEBUG_LOG(cb, "recv buf dma_addr %jx size %d\n",
- (uintmax_t)buf.addr, (int)buf.size);
- iovbase = cb->recv_dma_addr;
- cb->recv_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
- IB_ACCESS_LOCAL_WRITE,
- &iovbase);
-
- if (IS_ERR(cb->recv_mr)) {
- DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
- ret = PTR_ERR(cb->recv_mr);
- goto bail;
- }
-
- buf.addr = cb->send_dma_addr;
- buf.size = sizeof cb->send_buf;
- DEBUG_LOG(cb, "send buf dma_addr %jx size %d\n",
- (uintmax_t)buf.addr, (int)buf.size);
- iovbase = cb->send_dma_addr;
- cb->send_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
- 0, &iovbase);
-
- if (IS_ERR(cb->send_mr)) {
- DEBUG_LOG(cb, "send_buf reg_mr failed\n");
- ret = PTR_ERR(cb->send_mr);
- goto bail;
- }
- }
- }
-
- cb->rdma_buf = kmalloc(cb->size, GFP_KERNEL);
+ cb->rdma_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size,
+ &cb->rdma_dma_addr,
+ GFP_KERNEL);
if (!cb->rdma_buf) {
- DEBUG_LOG(cb, "rdma_buf malloc failed\n");
+ DEBUG_LOG(PFX "rdma_buf allocation failed\n");
ret = -ENOMEM;
goto bail;
}
-
- cb->rdma_dma_addr = ib_dma_map_single(cb->pd->device,
- cb->rdma_buf, cb->size,
- DMA_BIDIRECTIONAL);
pci_unmap_addr_set(cb, rdma_mapping, cb->rdma_dma_addr);
- if (cb->mem != DMA) {
- switch (cb->mem) {
- case FASTREG:
- cb->page_list_len = (((cb->size - 1) & PAGE_MASK) +
- PAGE_SIZE) >> PAGE_SHIFT;
- cb->page_list = ib_alloc_fast_reg_page_list(
- cb->pd->device,
- cb->page_list_len);
- if (IS_ERR(cb->page_list)) {
- DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
- ret = PTR_ERR(cb->page_list);
- goto bail;
- }
- cb->fastreg_mr = ib_alloc_fast_reg_mr(cb->pd,
- cb->page_list->max_page_list_len);
- if (IS_ERR(cb->fastreg_mr)) {
- DEBUG_LOG(cb, "recv_buf reg_mr failed\n");
- ret = PTR_ERR(cb->fastreg_mr);
- goto bail;
- }
- DEBUG_LOG(cb, "fastreg rkey 0x%x page_list %p"
- " page_list_len %u\n", cb->fastreg_mr->rkey,
- cb->page_list, cb->page_list_len);
- break;
- case MW:
- cb->mw = ib_alloc_mw(cb->pd,IB_MW_TYPE_1);
- if (IS_ERR(cb->mw)) {
- DEBUG_LOG(cb, "recv_buf alloc_mw failed\n");
- ret = PTR_ERR(cb->mw);
- goto bail;
- }
- DEBUG_LOG(cb, "mw rkey 0x%x\n", cb->mw->rkey);
- /*FALLTHROUGH*/
- case MR:
- buf.addr = cb->rdma_dma_addr;
- buf.size = cb->size;
- iovbase = cb->rdma_dma_addr;
- cb->rdma_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
- IB_ACCESS_LOCAL_WRITE|
- IB_ACCESS_REMOTE_READ|
- IB_ACCESS_REMOTE_WRITE,
- &iovbase);
- if (IS_ERR(cb->rdma_mr)) {
- DEBUG_LOG(cb, "rdma_buf reg_mr failed\n");
- ret = PTR_ERR(cb->rdma_mr);
- goto bail;
- }
- DEBUG_LOG(cb, "rdma buf dma_addr %jx size %d mr rkey 0x%x\n",
- (uintmax_t)buf.addr, (int)buf.size, cb->rdma_mr->rkey);
- break;
- default:
- ret = -EINVAL;
- goto bail;
- break;
- }
+ cb->page_list_len = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE)
+ >> PAGE_SHIFT;
+ cb->reg_mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG,
+ cb->page_list_len);
+ if (IS_ERR(cb->reg_mr)) {
+ ret = PTR_ERR(cb->reg_mr);
+ DEBUG_LOG(PFX "recv_buf reg_mr failed %d\n", ret);
+ goto bail;
}
+ DEBUG_LOG(PFX "reg rkey 0x%x page_list_len %u\n",
+ cb->reg_mr->rkey, cb->page_list_len);
- if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
+ if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
- cb->start_buf = kmalloc(cb->size, GFP_KERNEL);
+ cb->start_buf = ib_dma_alloc_coherent(cb->pd->device, cb->size,
+ &cb->start_dma_addr,
+ GFP_KERNEL);
if (!cb->start_buf) {
- DEBUG_LOG(cb, "start_buf malloc failed\n");
+ DEBUG_LOG(PFX "start_buf malloc failed\n");
ret = -ENOMEM;
goto bail;
}
-
- cb->start_dma_addr = ib_dma_map_single(cb->pd->device,
- cb->start_buf, cb->size,
- DMA_BIDIRECTIONAL);
pci_unmap_addr_set(cb, start_mapping, cb->start_dma_addr);
-
- if (cb->mem == MR || cb->mem == MW) {
- unsigned flags = IB_ACCESS_REMOTE_READ;
-
- if (cb->wlat || cb->rlat || cb->bw || cb->frtest) {
- flags |= IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE;
- }
-
- buf.addr = cb->start_dma_addr;
- buf.size = cb->size;
- DEBUG_LOG(cb, "start buf dma_addr %jx size %d\n",
- (uintmax_t)buf.addr, (int)buf.size);
- iovbase = cb->start_dma_addr;
- cb->start_mr = ib_reg_phys_mr(cb->pd, &buf, 1,
- flags,
- &iovbase);
-
- if (IS_ERR(cb->start_mr)) {
- DEBUG_LOG(cb, "start_buf reg_mr failed\n");
- ret = PTR_ERR(cb->start_mr);
- goto bail;
- }
- }
}
krping_setup_wr(cb);
- DEBUG_LOG(cb, "allocated & registered buffers...\n");
+ DEBUG_LOG(PFX "allocated & registered buffers...\n");
return 0;
bail:
- if (cb->fastreg_mr && !IS_ERR(cb->fastreg_mr))
- ib_dereg_mr(cb->fastreg_mr);
- if (cb->mw && !IS_ERR(cb->mw))
- ib_dealloc_mw(cb->mw);
+ if (cb->reg_mr && !IS_ERR(cb->reg_mr))
+ ib_dereg_mr(cb->reg_mr);
if (cb->rdma_mr && !IS_ERR(cb->rdma_mr))
ib_dereg_mr(cb->rdma_mr);
- if (cb->page_list && !IS_ERR(cb->page_list))
- ib_free_fast_reg_page_list(cb->page_list);
if (cb->dma_mr && !IS_ERR(cb->dma_mr))
ib_dereg_mr(cb->dma_mr);
- if (cb->recv_mr && !IS_ERR(cb->recv_mr))
- ib_dereg_mr(cb->recv_mr);
- if (cb->send_mr && !IS_ERR(cb->send_mr))
- ib_dereg_mr(cb->send_mr);
- if (cb->rdma_buf)
- kfree(cb->rdma_buf);
- if (cb->start_buf)
- kfree(cb->start_buf);
+ if (cb->rdma_buf) {
+ ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf,
+ cb->rdma_dma_addr);
+ }
+ if (cb->start_buf) {
+ ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf,
+ cb->start_dma_addr);
+ }
return ret;
}
static void krping_free_buffers(struct krping_cb *cb)
{
- DEBUG_LOG(cb, "krping_free_buffers called on cb %p\n", cb);
+ DEBUG_LOG("krping_free_buffers called on cb %p\n", cb);
if (cb->dma_mr)
ib_dereg_mr(cb->dma_mr);
- if (cb->send_mr)
- ib_dereg_mr(cb->send_mr);
- if (cb->recv_mr)
- ib_dereg_mr(cb->recv_mr);
if (cb->rdma_mr)
ib_dereg_mr(cb->rdma_mr);
if (cb->start_mr)
ib_dereg_mr(cb->start_mr);
- if (cb->fastreg_mr)
- ib_dereg_mr(cb->fastreg_mr);
- if (cb->mw)
- ib_dealloc_mw(cb->mw);
+ if (cb->reg_mr)
+ ib_dereg_mr(cb->reg_mr);
dma_unmap_single(cb->pd->device->dma_device,
pci_unmap_addr(cb, recv_mapping),
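
With ib_get_dma_mr() and the per-buffer send/recv MRs removed in the krping_setup_wr()/krping_setup_buffers() hunk above, every local SGE now uses the protection domain's built-in DMA lkey. A minimal sketch of posting a receive with it, assuming only fields already present in the patch; post_recv_sketch() itself is illustrative and not part of the commit:

	static int post_recv_sketch(struct krping_cb *cb)
	{
		struct ib_recv_wr wr, *bad_wr;
		struct ib_sge sge;

		sge.addr   = cb->recv_dma_addr;
		sge.length = sizeof(cb->recv_buf);
		sge.lkey   = cb->pd->local_dma_lkey;	/* PD-wide DMA lkey */

		memset(&wr, 0, sizeof(wr));
		wr.sg_list = &sge;
		wr.num_sge = 1;

		return ib_post_recv(cb->qp, &wr, &bad_wr);
	}
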
@@ -756,15 +596,13 @@ static void krping_free_buffers(struct krping_cb *cb)
dma_unmap_single(cb->pd->device->dma_device,
pci_unmap_addr(cb, send_mapping),
sizeof(cb->send_buf), DMA_BIDIRECTIONAL);
- dma_unmap_single(cb->pd->device->dma_device,
- pci_unmap_addr(cb, rdma_mapping),
- cb->size, DMA_BIDIRECTIONAL);
- kfree(cb->rdma_buf);
+
+ ib_dma_free_coherent(cb->pd->device, cb->size, cb->rdma_buf,
+ cb->rdma_dma_addr);
+
if (cb->start_buf) {
- dma_unmap_single(cb->pd->device->dma_device,
- pci_unmap_addr(cb, start_mapping),
- cb->size, DMA_BIDIRECTIONAL);
- kfree(cb->start_buf);
+ ib_dma_free_coherent(cb->pd->device, cb->size, cb->start_buf,
+ cb->start_dma_addr);
}
}
@@ -776,6 +614,11 @@ static int krping_create_qp(struct krping_cb *cb)
memset(&init_attr, 0, sizeof(init_attr));
init_attr.cap.max_send_wr = cb->txdepth;
init_attr.cap.max_recv_wr = 2;
+
+ /* For flush_qp() */
+ init_attr.cap.max_send_wr++;
+ init_attr.cap.max_recv_wr++;
+
init_attr.cap.max_recv_sge = 1;
init_attr.cap.max_send_sge = 1;
init_attr.qp_type = IB_QPT_RC;
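
For context, a sketch of the full queue pair setup around this hunk. The CQ wiring and the rdma_create_qp() call sit outside the diff and are assumptions here (one shared CQ for send and receive, QP attached to the cm_id); the +1 head-room for the drain work requests posted by flush_qp() is what the hunk itself adds:

	static int create_qp_sketch(struct krping_cb *cb)
	{
		struct ib_qp_init_attr init_attr;
		int ret;

		memset(&init_attr, 0, sizeof(init_attr));
		init_attr.cap.max_send_wr = cb->txdepth + 1;	/* +1 for flush_qp() */
		init_attr.cap.max_recv_wr = 2 + 1;		/* +1 for flush_qp() */
		init_attr.cap.max_send_sge = 1;
		init_attr.cap.max_recv_sge = 1;
		init_attr.qp_type = IB_QPT_RC;
		init_attr.send_cq = cb->cq;	/* assumed: one CQ for both queues */
		init_attr.recv_cq = cb->cq;

		ret = rdma_create_qp(cb->cm_id, cb->pd, &init_attr);	/* assumed call */
		if (ret == 0)
			cb->qp = cb->cm_id->qp;
		return ret;
	}
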
@@ -806,38 +649,42 @@ static void krping_free_qp(struct krping_cb *cb)
static int krping_setup_qp(struct krping_cb *cb, struct rdma_cm_id *cm_id)
{
int ret;
- cb->pd = ib_alloc_pd(cm_id->device);
+ struct ib_cq_init_attr attr = {0};
+
+ cb->pd = ib_alloc_pd(cm_id->device, 0);
if (IS_ERR(cb->pd)) {
- PRINTF(cb, "ib_alloc_pd failed\n");
+ printk(KERN_ERR PFX "ib_alloc_pd failed\n");
return PTR_ERR(cb->pd);
}
- DEBUG_LOG(cb, "created pd %p\n", cb->pd);
+ DEBUG_LOG("created pd %p\n", cb->pd);
strlcpy(cb->stats.name, cb->pd->device->name, sizeof(cb->stats.name));
+ attr.cqe = cb->txdepth * 2;
+ attr.comp_vector = 0;
cb->cq = ib_create_cq(cm_id->device, krping_cq_event_handler, NULL,
- cb, cb->txdepth * 2, 0);
+ cb, &attr);
if (IS_ERR(cb->cq)) {
- PRINTF(cb, "ib_create_cq failed\n");
+ printk(KERN_ERR PFX "ib_create_cq failed\n");
ret = PTR_ERR(cb->cq);
goto err1;
}
- DEBUG_LOG(cb, "created cq %p\n", cb->cq);
+ DEBUG_LOG("created cq %p\n", cb->cq);
if (!cb->wlat && !cb->rlat && !cb->bw && !cb->frtest) {
ret = ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
if (ret) {
- PRINTF(cb, "ib_create_cq failed\n");
+ printk(KERN_ERR PFX "ib_create_cq failed\n");
goto err2;
}
}
ret = krping_create_qp(cb);
if (ret) {
- PRINTF(cb, "krping_create_qp failed: %d\n", ret);
+ printk(KERN_ERR PFX "krping_create_qp failed: %d\n", ret);
goto err2;
}
- DEBUG_LOG(cb, "created qp %p\n", cb->qp);
+ DEBUG_LOG("created qp %p\n", cb->qp);
return 0;
err2:
ib_destroy_cq(cb->cq);
@@ -848,115 +695,54 @@ err1:
/*
* return the (possibly rebound) rkey for the rdma buffer.
- * FASTREG mode: invalidate and rebind via fastreg wr.
- * MW mode: rebind the MW.
+ * REG mode: invalidate and rebind via reg wr.
* other modes: just return the mr rkey.
*/
static u32 krping_rdma_rkey(struct krping_cb *cb, u64 buf, int post_inv)
{
- u32 rkey = 0xffffffff;
- u64 p;
+ u32 rkey;
struct ib_send_wr *bad_wr;
- int i;
int ret;
+ struct scatterlist sg = {0};
- switch (cb->mem) {
- case FASTREG:
- cb->invalidate_wr.ex.invalidate_rkey = cb->fastreg_mr->rkey;
-
- /*
- * Update the fastreg key.
- */
- ib_update_fast_reg_key(cb->fastreg_mr, ++cb->key);
- cb->fastreg_wr.wr.fast_reg.rkey = cb->fastreg_mr->rkey;
-
- /*
- * Update the fastreg WR with new buf info.
- */
- if (buf == (u64)cb->start_dma_addr)
- cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_READ;
- else
- cb->fastreg_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
- cb->fastreg_wr.wr.fast_reg.iova_start = buf;
- p = (u64)(buf & PAGE_MASK);
- for (i=0; i < cb->fastreg_wr.wr.fast_reg.page_list_len;
- i++, p += PAGE_SIZE) {
- cb->page_list->page_list[i] = p;
- DEBUG_LOG(cb, "page_list[%d] 0x%jx\n", i, (uintmax_t)p);
- }
+ cb->invalidate_wr.ex.invalidate_rkey = cb->reg_mr->rkey;
- DEBUG_LOG(cb, "post_inv = %d, fastreg new rkey 0x%x shift %u len %u"
- " iova_start %jx page_list_len %u\n",
- post_inv,
- cb->fastreg_wr.wr.fast_reg.rkey,
- cb->fastreg_wr.wr.fast_reg.page_shift,
- (unsigned)cb->fastreg_wr.wr.fast_reg.length,
- (uintmax_t)cb->fastreg_wr.wr.fast_reg.iova_start,
- cb->fastreg_wr.wr.fast_reg.page_list_len);
-
- if (post_inv)
- ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
- else
- ret = ib_post_send(cb->qp, &cb->fastreg_wr, &bad_wr);
- if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
- cb->state = ERROR;
- }
- rkey = cb->fastreg_mr->rkey;
- break;
- case MW:
- /*
- * Update the MW with new buf info.
- */
- if (buf == (u64)cb->start_dma_addr) {
-#ifdef BIND_INFO
- cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_READ;
- cb->bind_attr.bind_info.mr = cb->start_mr;
-#else
- cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_READ;
- cb->bind_attr.mr = cb->start_mr;
-#endif
- } else {
-#ifdef BIND_INFO
- cb->bind_attr.bind_info.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
- cb->bind_attr.bind_info.mr = cb->rdma_mr;
-#else
- cb->bind_attr.mw_access_flags = IB_ACCESS_REMOTE_WRITE;
- cb->bind_attr.mr = cb->rdma_mr;
-#endif
- }
-#ifdef BIND_INFO
- cb->bind_attr.bind_info.addr = buf;
-#else
- cb->bind_attr.addr = buf;
-#endif
- DEBUG_LOG(cb, "binding mw rkey 0x%x to buf %jx mr rkey 0x%x\n",
-#ifdef BIND_INFO
- cb->mw->rkey, (uintmax_t)buf, cb->bind_attr.bind_info.mr->rkey);
-#else
- cb->mw->rkey, buf, cb->bind_attr.mr->rkey);
-#endif
- ret = ib_bind_mw(cb->qp, cb->mw, &cb->bind_attr);
- if (ret) {
- PRINTF(cb, "bind mw error %d\n", ret);
- cb->state = ERROR;
- } else
- rkey = cb->mw->rkey;
- break;
- case MR:
- if (buf == (u64)cb->start_dma_addr)
- rkey = cb->start_mr->rkey;
- else
- rkey = cb->rdma_mr->rkey;
- break;
- case DMA:
- rkey = cb->dma_mr->rkey;
- break;
- default:
- PRINTF(cb, "%s:%d case ERROR\n", __func__, __LINE__);
+ /*
+ * Update the reg key.
+ */
+ ib_update_fast_reg_key(cb->reg_mr, ++cb->key);
+ cb->reg_mr_wr.key = cb->reg_mr->rkey;
+
+ /*
+ * Update the reg WR with new buf info.
+ */
+ if (buf == (u64)cb->start_dma_addr)
+ cb->reg_mr_wr.access = IB_ACCESS_REMOTE_READ;
+ else
+ cb->reg_mr_wr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
+ sg_dma_address(&sg) = buf;
+ sg_dma_len(&sg) = cb->size;
+
+ ret = ib_map_mr_sg(cb->reg_mr, &sg, 1, NULL, PAGE_SIZE);
+ BUG_ON(ret <= 0 || ret > cb->page_list_len);
+
+ DEBUG_LOG(PFX "post_inv = %d, reg_mr new rkey 0x%x pgsz %u len %u"
+ " iova_start %llx\n",
+ post_inv,
+ cb->reg_mr_wr.key,
+ cb->reg_mr->page_size,
+ cb->reg_mr->length,
+ (unsigned long long)cb->reg_mr->iova);
+
+ if (post_inv)
+ ret = ib_post_send(cb->qp, &cb->invalidate_wr, &bad_wr);
+ else
+ ret = ib_post_send(cb->qp, &cb->reg_mr_wr.wr, &bad_wr);
+ if (ret) {
+ printk(KERN_ERR PFX "post send error %d\n", ret);
cb->state = ERROR;
- break;
}
+ rkey = cb->reg_mr->rkey;
return rkey;
}
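
krping_rdma_rkey() above is the heart of the new fast-registration path: note the old rkey for invalidation, bump the key, map the flat DMA buffer into the MR with ib_map_mr_sg(), and post LOCAL_INV chained into REG_MR (both unsignaled) so the buffer comes back under a fresh rkey before it is advertised. The same sequence reduced to its essentials, using only structures and calls from the patch; error handling is trimmed and the access flags show only the write-sink case:

	static u32 rebind_rkey_sketch(struct krping_cb *cb, u64 dma_addr, int post_inv)
	{
		struct ib_send_wr *bad_wr;
		struct scatterlist sg = {0};
		int n;

		/* Invalidate the key currently covering the buffer... */
		cb->invalidate_wr.ex.invalidate_rkey = cb->reg_mr->rkey;

		/* ...then derive a new one and describe the buffer to the MR. */
		ib_update_fast_reg_key(cb->reg_mr, ++cb->key);
		sg_dma_address(&sg) = dma_addr;
		sg_dma_len(&sg) = cb->size;
		n = ib_map_mr_sg(cb->reg_mr, &sg, 1, NULL, PAGE_SIZE);
		if (n <= 0)
			return 0;

		cb->reg_mr_wr.key = cb->reg_mr->rkey;
		cb->reg_mr_wr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;

		/* LOCAL_INV -> REG_MR chain was linked up in krping_setup_wr(). */
		if (ib_post_send(cb->qp, post_inv ? &cb->invalidate_wr :
		    &cb->reg_mr_wr.wr, &bad_wr))
			cb->state = ERROR;

		return cb->reg_mr->rkey;
	}
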
@@ -966,16 +752,16 @@ static void krping_format_send(struct krping_cb *cb, u64 buf)
u32 rkey;
/*
- * Client side will do fastreg or mw bind before
+ * Client side will do reg or mw bind before
* advertising the rdma buffer. Server side
* sends have no data.
*/
- if (!cb->server || cb->wlat || cb->rlat || cb->bw || cb->frtest) {
+ if (!cb->server || cb->wlat || cb->rlat || cb->bw) {
rkey = krping_rdma_rkey(cb, buf, !cb->server_invalidate);
info->buf = htonll(buf);
info->rkey = htonl(rkey);
info->size = htonl(cb->size);
- DEBUG_LOG(cb, "RDMA addr %llx rkey %x len %d\n",
+ DEBUG_LOG("RDMA addr %llx rkey %x len %d\n",
(unsigned long long)buf, rkey, cb->size);
}
}
@@ -989,111 +775,102 @@ static void krping_test_server(struct krping_cb *cb)
/* Wait for client's Start STAG/TO/Len */
wait_event_interruptible(cb->sem, cb->state >= RDMA_READ_ADV);
if (cb->state != RDMA_READ_ADV) {
- PRINTF(cb, "wait for RDMA_READ_ADV state %d\n",
+ printk(KERN_ERR PFX "wait for RDMA_READ_ADV state %d\n",
cb->state);
break;
}
- DEBUG_LOG(cb, "server received sink adv\n");
+ DEBUG_LOG("server received sink adv\n");
- cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
- cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
- cb->rdma_sq_wr.sg_list->length = cb->remote_len;
- cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 1);
+ cb->rdma_sq_wr.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.wr.sg_list->length = cb->remote_len;
+ cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, !cb->read_inv);
+ cb->rdma_sq_wr.wr.next = NULL;
/* Issue RDMA Read. */
if (cb->read_inv)
- cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
else {
- cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
- if (cb->mem == FASTREG) {
- /*
- * Immediately follow the read with a
- * fenced LOCAL_INV.
- */
- cb->rdma_sq_wr.next = &inv;
- memset(&inv, 0, sizeof inv);
- inv.opcode = IB_WR_LOCAL_INV;
- inv.ex.invalidate_rkey = cb->fastreg_mr->rkey;
- inv.send_flags = IB_SEND_FENCE;
- }
+ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ;
+ /*
+ * Immediately follow the read with a
+ * fenced LOCAL_INV.
+ */
+ cb->rdma_sq_wr.wr.next = &inv;
+ memset(&inv, 0, sizeof inv);
+ inv.opcode = IB_WR_LOCAL_INV;
+ inv.ex.invalidate_rkey = cb->reg_mr->rkey;
+ inv.send_flags = IB_SEND_FENCE;
}
- ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
+ ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
break;
}
- cb->rdma_sq_wr.next = NULL;
+ cb->rdma_sq_wr.wr.next = NULL;
- DEBUG_LOG(cb, "server posted rdma read req \n");
+ DEBUG_LOG("server posted rdma read req \n");
/* Wait for read completion */
wait_event_interruptible(cb->sem,
cb->state >= RDMA_READ_COMPLETE);
if (cb->state != RDMA_READ_COMPLETE) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"wait for RDMA_READ_COMPLETE state %d\n",
cb->state);
break;
}
- DEBUG_LOG(cb, "server received read complete\n");
+ DEBUG_LOG("server received read complete\n");
/* Display data in recv buf */
- if (cb->verbose) {
- if (strlen(cb->rdma_buf) > 128) {
- char msgbuf[128];
-
- strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
- PRINTF(cb, "server ping data stripped: %s\n",
- msgbuf);
- } else
- PRINTF(cb, "server ping data: %s\n",
- cb->rdma_buf);
- }
+ if (cb->verbose)
+ printk(KERN_INFO PFX "server ping data: %s\n",
+ cb->rdma_buf);
/* Tell client to continue */
if (cb->server && cb->server_invalidate) {
cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
- DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
+ DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey);
}
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
break;
}
- DEBUG_LOG(cb, "server posted go ahead\n");
+ DEBUG_LOG("server posted go ahead\n");
/* Wait for client's RDMA STAG/TO/Len */
wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
if (cb->state != RDMA_WRITE_ADV) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"wait for RDMA_WRITE_ADV state %d\n",
cb->state);
break;
}
- DEBUG_LOG(cb, "server received sink adv\n");
+ DEBUG_LOG("server received sink adv\n");
/* RDMA Write echo data */
- cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
- cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
- cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
- cb->rdma_sq_wr.sg_list->length = strlen(cb->rdma_buf) + 1;
+ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.wr.sg_list->length = strlen(cb->rdma_buf) + 1;
if (cb->local_dma_lkey)
- cb->rdma_sgl.lkey = cb->qp->device->local_dma_lkey;
+ cb->rdma_sgl.lkey = cb->pd->local_dma_lkey;
else
cb->rdma_sgl.lkey = krping_rdma_rkey(cb, cb->rdma_dma_addr, 0);
- DEBUG_LOG(cb, "rdma write from lkey %x laddr %llx len %d\n",
- cb->rdma_sq_wr.sg_list->lkey,
- (unsigned long long)cb->rdma_sq_wr.sg_list->addr,
- cb->rdma_sq_wr.sg_list->length);
+ DEBUG_LOG("rdma write from lkey %x laddr %llx len %d\n",
+ cb->rdma_sq_wr.wr.sg_list->lkey,
+ (unsigned long long)cb->rdma_sq_wr.wr.sg_list->addr,
+ cb->rdma_sq_wr.wr.sg_list->length);
- ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
+ ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
break;
}
@@ -1101,12 +878,12 @@ static void krping_test_server(struct krping_cb *cb)
ret = wait_event_interruptible(cb->sem, cb->state >=
RDMA_WRITE_COMPLETE);
if (cb->state != RDMA_WRITE_COMPLETE) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"wait for RDMA_WRITE_COMPLETE state %d\n",
cb->state);
break;
}
- DEBUG_LOG(cb, "server rdma write complete \n");
+ DEBUG_LOG("server rdma write complete \n");
cb->state = CONNECTED;
@@ -1114,14 +891,14 @@ static void krping_test_server(struct krping_cb *cb)
if (cb->server && cb->server_invalidate) {
cb->sq_wr.ex.invalidate_rkey = cb->remote_rkey;
cb->sq_wr.opcode = IB_WR_SEND_WITH_INV;
- DEBUG_LOG(cb, "send-w-inv rkey 0x%x\n", cb->remote_rkey);
+ DEBUG_LOG("send-w-inv rkey 0x%x\n", cb->remote_rkey);
}
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
break;
}
- DEBUG_LOG(cb, "server posted go ahead\n");
+ DEBUG_LOG("server posted go ahead\n");
}
}
@@ -1136,10 +913,10 @@ static void rlat_test(struct krping_cb *cb)
int ne;
scnt = 0;
- cb->rdma_sq_wr.opcode = IB_WR_RDMA_READ;
- cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
- cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
- cb->rdma_sq_wr.sg_list->length = cb->size;
+ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_READ;
+ cb->rdma_sq_wr.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.wr.sg_list->length = cb->size;
microtime(&start_tv);
if (!cb->poll) {
@@ -1149,9 +926,9 @@ static void rlat_test(struct krping_cb *cb)
while (scnt < iters) {
cb->state = RDMA_READ_ADV;
- ret = ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr);
+ ret = ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr);
if (ret) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"Couldn't post send: ret=%d scnt %d\n",
ret, scnt);
return;
@@ -1171,7 +948,7 @@ static void rlat_test(struct krping_cb *cb)
} else
ne = ib_poll_cq(cb->cq, 1, &wc);
if (cb->state == ERROR) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"state == ERROR...bailing scnt %d\n",
scnt);
return;
@@ -1179,13 +956,13 @@ static void rlat_test(struct krping_cb *cb)
} while (ne == 0);
if (ne < 0) {
- PRINTF(cb, "poll CQ failed %d\n", ne);
+ printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
return;
}
if (cb->poll && wc.status != IB_WC_SUCCESS) {
- PRINTF(cb, "Completion wth error at %s:\n",
+ printk(KERN_ERR PFX "Completion wth error at %s:\n",
cb->server ? "server" : "client");
- PRINTF(cb, "Failed status %d: wr_id %d\n",
+ printk(KERN_ERR PFX "Failed status %d: wr_id %d\n",
wc.status, (int) wc.wr_id);
return;
}
@@ -1198,7 +975,7 @@ static void rlat_test(struct krping_cb *cb)
stop_tv.tv_sec -= 1;
}
- PRINTF(cb, "delta sec %lu delta usec %lu iter %d size %d\n",
+ printk(KERN_ERR PFX "delta sec %lu delta usec %lu iter %d size %d\n",
(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
(unsigned long)(stop_tv.tv_usec - start_tv.tv_usec),
scnt, cb->size);
@@ -1224,34 +1001,34 @@ static void wlat_test(struct krping_cb *cb)
post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!post_cycles_start) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!post_cycles_stop) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!poll_cycles_start) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!poll_cycles_stop) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
GFP_KERNEL);
if (!last_poll_cycles_start) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
- cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
- cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
- cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
- cb->rdma_sq_wr.sg_list->length = cb->size;
+ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.wr.sg_list->length = cb->size;
if (cycle_iters > iters)
cycle_iters = iters;
@@ -1263,7 +1040,7 @@ static void wlat_test(struct krping_cb *cb)
++rcnt;
while (*poll_buf != (char)rcnt) {
if (cb->state == ERROR) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"state = ERROR, bailing\n");
return;
}
@@ -1276,8 +1053,8 @@ static void wlat_test(struct krping_cb *cb)
*buf = (char)scnt+1;
if (scnt < cycle_iters)
post_cycles_start[scnt] = get_cycles();
- if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
- PRINTF(cb,
+ if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
+ printk(KERN_ERR PFX
"Couldn't post send: scnt=%d\n",
scnt);
return;
@@ -1304,17 +1081,17 @@ static void wlat_test(struct krping_cb *cb)
++ccnt;
if (ne < 0) {
- PRINTF(cb, "poll CQ failed %d\n", ne);
+ printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
return;
}
if (wc.status != IB_WC_SUCCESS) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"Completion wth error at %s:\n",
cb->server ? "server" : "client");
- PRINTF(cb,
+ printk(KERN_ERR PFX
"Failed status %d: wr_id %d\n",
wc.status, (int) wc.wr_id);
- PRINTF(cb,
+ printk(KERN_ERR PFX
"scnt=%d, rcnt=%d, ccnt=%d\n",
scnt, rcnt, ccnt);
return;
@@ -1333,7 +1110,7 @@ static void wlat_test(struct krping_cb *cb)
sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
}
- PRINTF(cb,
+ printk(KERN_ERR PFX
"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
@@ -1366,34 +1143,34 @@ static void bw_test(struct krping_cb *cb)
post_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!post_cycles_start) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
post_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!post_cycles_stop) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!poll_cycles_start) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
poll_cycles_stop = kmalloc(cycle_iters * sizeof(cycles_t), GFP_KERNEL);
if (!poll_cycles_stop) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
last_poll_cycles_start = kmalloc(cycle_iters * sizeof(cycles_t),
GFP_KERNEL);
if (!last_poll_cycles_start) {
- PRINTF(cb, "%s kmalloc failed\n", __FUNCTION__);
+ printk(KERN_ERR PFX "%s kmalloc failed\n", __FUNCTION__);
return;
}
- cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
- cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
- cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
- cb->rdma_sq_wr.sg_list->length = cb->size;
+ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.wr.sg_list->length = cb->size;
if (cycle_iters > iters)
cycle_iters = iters;
@@ -1405,8 +1182,8 @@ static void bw_test(struct krping_cb *cb)
if (scnt < cycle_iters)
post_cycles_start[scnt] = get_cycles();
- if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
- PRINTF(cb,
+ if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
+ printk(KERN_ERR PFX
"Couldn't post send: scnt=%d\n",
scnt);
return;
@@ -1433,14 +1210,14 @@ static void bw_test(struct krping_cb *cb)
ccnt += 1;
if (ne < 0) {
- PRINTF(cb, "poll CQ failed %d\n", ne);
+ printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
return;
}
if (wc.status != IB_WC_SUCCESS) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"Completion wth error at %s:\n",
cb->server ? "server" : "client");
- PRINTF(cb,
+ printk(KERN_ERR PFX
"Failed status %d: wr_id %d\n",
wc.status, (int) wc.wr_id);
return;
@@ -1459,7 +1236,7 @@ static void bw_test(struct krping_cb *cb)
sum_poll += poll_cycles_stop[i] - poll_cycles_start[i];
sum_last_poll += poll_cycles_stop[i]-last_poll_cycles_start[i];
}
- PRINTF(cb,
+ printk(KERN_ERR PFX
"delta sec %lu delta usec %lu iter %d size %d cycle_iters %d"
" sum_post %llu sum_poll %llu sum_last_poll %llu\n",
(unsigned long)(stop_tv.tv_sec - start_tv.tv_sec),
@@ -1489,20 +1266,21 @@ static void krping_rlat_test_server(struct krping_cb *cb)
krping_format_send(cb, cb->start_dma_addr);
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
return;
}
/* Spin waiting for send completion */
while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
+ printk(KERN_ERR PFX "poll error %d\n", ret);
return;
}
if (wc.status) {
- PRINTF(cb, "send completiong error %d\n", wc.status);
+ printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
return;
}
+
wait_event_interruptible(cb->sem, cb->state == ERROR);
}
@@ -1521,18 +1299,18 @@ static void krping_wlat_test_server(struct krping_cb *cb)
krping_format_send(cb, cb->start_dma_addr);
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
return;
}
/* Spin waiting for send completion */
while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
+ printk(KERN_ERR PFX "poll error %d\n", ret);
return;
}
if (wc.status) {
- PRINTF(cb, "send completiong error %d\n", wc.status);
+ printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
return;
}
@@ -1555,18 +1333,18 @@ static void krping_bw_test_server(struct krping_cb *cb)
krping_format_send(cb, cb->start_dma_addr);
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
return;
}
/* Spin waiting for send completion */
while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
+ printk(KERN_ERR PFX "poll error %d\n", ret);
return;
}
if (wc.status) {
- PRINTF(cb, "send completiong error %d\n", wc.status);
+ printk(KERN_ERR PFX "send completiong error %d\n", wc.status);
return;
}
@@ -1575,701 +1353,73 @@ static void krping_bw_test_server(struct krping_cb *cb)
wait_event_interruptible(cb->sem, cb->state == ERROR);
}
-static int fastreg_supported(struct krping_cb *cb, int server)
+static int reg_supported(struct ib_device *dev)
{
- struct ib_device *dev = server?cb->child_cm_id->device:
- cb->cm_id->device;
- struct ib_device_attr attr;
- int ret;
+ u64 needed_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
- ret = ib_query_device(dev, &attr);
- if (ret) {
- PRINTF(cb, "ib_query_device failed ret %d\n", ret);
+ if ((dev->attrs.device_cap_flags & needed_flags) != needed_flags) {
+ printk(KERN_ERR PFX
+ "Fastreg not supported - device_cap_flags 0x%llx\n",
+ (unsigned long long)dev->attrs.device_cap_flags);
return 0;
}
- if (!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
- PRINTF(cb, "Fastreg not supported - device_cap_flags 0x%llx\n",
- (unsigned long long)attr.device_cap_flags);
- return 0;
- }
- DEBUG_LOG(cb, "Fastreg supported - device_cap_flags 0x%jx\n",
- (uintmax_t)attr.device_cap_flags);
+ DEBUG_LOG("Fastreg supported - device_cap_flags 0x%llx\n",
+ (unsigned long long)dev->attrs.device_cap_flags);
return 1;
}
+static void fill_sockaddr(struct sockaddr_storage *sin, struct krping_cb *cb)
+{
+ memset(sin, 0, sizeof(*sin));
+
+ if (cb->addr_type == AF_INET) {
+ struct sockaddr_in *sin4 = (struct sockaddr_in *)sin;
+ sin4->sin_family = AF_INET;
+ memcpy((void *)&sin4->sin_addr.s_addr, cb->addr, 4);
+ sin4->sin_port = cb->port;
+ } else if (cb->addr_type == AF_INET6) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
+ sin6->sin6_family = AF_INET6;
+ memcpy((void *)&sin6->sin6_addr, cb->addr, 16);
+ sin6->sin6_port = cb->port;
+ }
+}
+
static int krping_bind_server(struct krping_cb *cb)
{
- struct sockaddr_in sin;
+ struct sockaddr_storage sin;
int ret;
- memset(&sin, 0, sizeof(sin));
- sin.sin_len = sizeof sin;
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = cb->addr.s_addr;
- sin.sin_port = cb->port;
- ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *) &sin);
+ fill_sockaddr(&sin, cb);
+
+ ret = rdma_bind_addr(cb->cm_id, (struct sockaddr *)&sin);
if (ret) {
- PRINTF(cb, "rdma_bind_addr error %d\n", ret);
+ printk(KERN_ERR PFX "rdma_bind_addr error %d\n", ret);
return ret;
}
- DEBUG_LOG(cb, "rdma_bind_addr successful\n");
+ DEBUG_LOG("rdma_bind_addr successful\n");
- DEBUG_LOG(cb, "rdma_listen\n");
+ DEBUG_LOG("rdma_listen\n");
ret = rdma_listen(cb->cm_id, 3);
if (ret) {
- PRINTF(cb, "rdma_listen failed: %d\n", ret);
+ printk(KERN_ERR PFX "rdma_listen failed: %d\n", ret);
return ret;
}
wait_event_interruptible(cb->sem, cb->state >= CONNECT_REQUEST);
if (cb->state != CONNECT_REQUEST) {
- PRINTF(cb, "wait for CONNECT_REQUEST state %d\n",
+ printk(KERN_ERR PFX "wait for CONNECT_REQUEST state %d\n",
cb->state);
return -1;
}
- if (cb->mem == FASTREG && !fastreg_supported(cb, 1))
+ if (!reg_supported(cb->child_cm_id->device))
return -EINVAL;
return 0;
}
-/*
- * sq-depth worth of fastreg + 0B read-inv pairs, reposting them as the reads
- * complete.
- * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
- */
-static void krping_fr_test5(struct krping_cb *cb)
-{
- struct ib_fast_reg_page_list **pl;
- struct ib_send_wr *fr, *read, *bad;
- struct ib_wc wc;
- struct ib_sge *sgl;
- u8 key = 0;
- struct ib_mr **mr;
- u8 **buf;
- dma_addr_t *dma_addr;
- int i;
- int ret;
- int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
- time_t start;
- int count = 0;
- int scnt;
- int depth = cb->txdepth >> 1;
-
- if (!depth) {
- PRINTF(cb, "txdepth must be > 1 for this test!\n");
- return;
- }
-
- pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
- mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
- fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
- sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
- read = kzalloc(sizeof *read * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, read, sizeof *read * depth);
- buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
- dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
- if (!pl || !mr || !fr || !read || !sgl || !buf || !dma_addr) {
- PRINTF(cb, "kzalloc failed\n");
- goto err1;
- }
-
- for (scnt = 0; scnt < depth; scnt++) {
- pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
- if (IS_ERR(pl[scnt])) {
- PRINTF(cb, "alloc_fr_page_list failed %ld\n",
- PTR_ERR(pl[scnt]));
- goto err2;
- }
- DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
-
- mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
- if (IS_ERR(mr[scnt])) {
- PRINTF(cb, "alloc_fr failed %ld\n",
- PTR_ERR(mr[scnt]));
- goto err2;
- }
- DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
- ib_update_fast_reg_key(mr[scnt], ++key);
-
- buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
- if (!buf[scnt]) {
- PRINTF(cb, "kmalloc failed\n");
- ret = -ENOMEM;
- goto err2;
- }
- DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
- dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
- buf[scnt], cb->size,
- DMA_BIDIRECTIONAL);
- if (dma_mapping_error(cb->pd->device->dma_device,
- dma_addr[scnt])) {
- PRINTF(cb, "dma_map failed\n");
- ret = -ENOMEM;
- goto err2;
- }
- DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
- for (i=0; i<plen; i++) {
- pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
- DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
- __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]);
- }
-
- sgl[scnt].lkey = mr[scnt]->rkey;
- sgl[scnt].length = cb->size;
- sgl[scnt].addr = (u64)buf[scnt];
- DEBUG_LOG(cb, "%s sgl[%u].lkey 0x%x length %u addr 0x%jx\n",
- __func__, scnt, sgl[scnt].lkey, sgl[scnt].length,
- (uintmax_t)sgl[scnt].addr);
-
- fr[scnt].opcode = IB_WR_FAST_REG_MR;
- fr[scnt].wr_id = scnt;
- fr[scnt].send_flags = 0;
- fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
- fr[scnt].wr.fast_reg.length = cb->size;
- fr[scnt].wr.fast_reg.page_list = pl[scnt];
- fr[scnt].wr.fast_reg.page_list_len = plen;
- fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
- fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
- fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
- fr[scnt].next = &read[scnt];
- read[scnt].opcode = IB_WR_RDMA_READ_WITH_INV;
- read[scnt].wr_id = scnt;
- read[scnt].send_flags = IB_SEND_SIGNALED;
- read[scnt].wr.rdma.rkey = cb->remote_rkey;
- read[scnt].wr.rdma.remote_addr = cb->remote_addr;
- read[scnt].num_sge = 1;
- read[scnt].sg_list = &sgl[scnt];
- ret = ib_post_send(cb->qp, &fr[scnt], &bad);
- if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
- goto err2;
- }
- }
-
- start = time_uptime;
- DEBUG_LOG(cb, "%s starting IO.\n", __func__);
- while (!cb->count || cb->server || count < cb->count) {
- if ((time_uptime - start) >= 9) {
- DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
- count);
- wait_event_interruptible_timeout(cb->sem,
- cb->state == ERROR,
- 1);
- if (cb->state == ERROR)
- break;
- start = time_uptime;
- }
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n",
- ret);
- goto err2;
- }
- if (ret == 1) {
- if (wc.status) {
- PRINTF(cb,
- "completion error %u wr_id %ju "
- "opcode %d\n", wc.status,
- (uintmax_t)wc.wr_id, wc.opcode);
- goto err2;
- }
- count++;
- if (count == cb->count)
- break;
- ib_update_fast_reg_key(mr[wc.wr_id], ++key);
- fr[wc.wr_id].wr.fast_reg.rkey =
- mr[wc.wr_id]->rkey;
- sgl[wc.wr_id].lkey = mr[wc.wr_id]->rkey;
- ret = ib_post_send(cb->qp, &fr[wc.wr_id], &bad);
- if (ret) {
- PRINTF(cb,
- "ib_post_send failed %d\n", ret);
- goto err2;
- }
- } else if (krping_sigpending()) {
- PRINTF(cb, "signal!\n");
- goto err2;
- }
- } while (ret == 1);
- }
- DEBUG_LOG(cb, "%s done!\n", __func__);
-err2:
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- DEBUG_LOG(cb, "draining the cq...\n");
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- break;
- }
- if (ret == 1) {
- if (wc.status) {
- PRINTF(cb, "completion error %u "
- "opcode %u\n", wc.status, wc.opcode);
- }
- }
- } while (ret == 1);
-
- DEBUG_LOG(cb, "destroying fr mrs!\n");
- for (scnt = 0; scnt < depth; scnt++) {
- if (mr[scnt]) {
- ib_dereg_mr(mr[scnt]);
- DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
- }
- }
- DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
- for (scnt = 0; scnt < depth; scnt++) {
- if (buf[scnt]) {
- dma_unmap_single(cb->pd->device->dma_device,
- dma_addr[scnt], cb->size,
- DMA_BIDIRECTIONAL);
- kfree(buf[scnt]);
- DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
- }
- }
- DEBUG_LOG(cb, "destroying fr page lists!\n");
- for (scnt = 0; scnt < depth; scnt++) {
- if (pl[scnt]) {
- DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
- ib_free_fast_reg_page_list(pl[scnt]);
- }
- }
-err1:
- if (pl)
- kfree(pl);
- if (mr)
- kfree(mr);
- if (fr)
- kfree(fr);
- if (read)
- kfree(read);
- if (sgl)
- kfree(sgl);
- if (buf)
- kfree(buf);
- if (dma_addr)
- kfree(dma_addr);
-}
-static void krping_fr_test_server(struct krping_cb *cb)
-{
- DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
- wait_event_interruptible(cb->sem, cb->state == ERROR);
-}
-
-static void krping_fr_test5_server(struct krping_cb *cb)
-{
- struct ib_send_wr *bad_wr;
- struct ib_wc wc;
- int ret;
-
- /* Spin waiting for client's Start STAG/TO/Len */
- while (cb->state < RDMA_READ_ADV) {
- krping_cq_event_handler(cb->cq, cb);
- }
- DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
- cb->remote_rkey, (uintmax_t)cb->remote_addr);
-
- /* Send STAG/TO/Len to client */
- krping_format_send(cb, cb->start_dma_addr);
- ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
- if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
- return;
- }
-
- /* Spin waiting for send completion */
- while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
- if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
- return;
- }
- if (wc.status) {
- PRINTF(cb, "send completiong error %d\n", wc.status);
- return;
- }
-
- if (cb->duplex)
- krping_fr_test5(cb);
- DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
- wait_event_interruptible(cb->sem, cb->state == ERROR);
-}
-
-static void krping_fr_test5_client(struct krping_cb *cb)
-{
- struct ib_send_wr *bad;
- struct ib_wc wc;
- int ret;
-
- cb->state = RDMA_READ_ADV;
-
- /* Send STAG/TO/Len to server */
- krping_format_send(cb, cb->start_dma_addr);
- if (cb->state == ERROR) {
- PRINTF(cb, "krping_format_send failed\n");
- return;
- }
- ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
- if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
- return;
- }
-
- /* Spin waiting for send completion */
- while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
- if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
- return;
- }
- if (wc.status) {
- PRINTF(cb, "send completion error %d\n", wc.status);
- return;
- }
-
- /* Spin waiting for server's Start STAG/TO/Len */
- while (cb->state < RDMA_WRITE_ADV) {
- krping_cq_event_handler(cb->cq, cb);
- }
- DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
- (uintmax_t)cb->remote_addr);
-
- return krping_fr_test5(cb);
-}
-
-/*
- * sq-depth worth of write + fastreg + inv, reposting them as the invs
- * complete.
- * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
- * If a count is given, then the last IO will have a bogus lkey in the
- * write work request. This reproduces a fw bug where the connection
- * will get stuck if a fastreg is processed while the ulptx is failing
- * the bad write.
- */
-static void krping_fr_test6(struct krping_cb *cb)
-{
- struct ib_fast_reg_page_list **pl;
- struct ib_send_wr *fr, *write, *inv, *bad;
- struct ib_wc wc;
- struct ib_sge *sgl;
- u8 key = 0;
- struct ib_mr **mr;
- u8 **buf;
- dma_addr_t *dma_addr;
- int i;
- int ret;
- int plen = (((cb->size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
- unsigned long start;
- int count = 0;
- int scnt;
- int depth = cb->txdepth / 3;
-
- if (!depth) {
- PRINTF(cb, "txdepth must be > 3 for this test!\n");
- return;
- }
-
- pl = kzalloc(sizeof *pl * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s pl %p size %zu\n", __func__, pl, sizeof *pl * depth);
-
- mr = kzalloc(sizeof *mr * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s mr %p size %zu\n", __func__, mr, sizeof *mr * depth);
-
- fr = kzalloc(sizeof *fr * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s fr %p size %zu\n", __func__, fr, sizeof *fr * depth);
-
- sgl = kzalloc(sizeof *sgl * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s sgl %p size %zu\n", __func__, sgl, sizeof *sgl * depth);
-
- write = kzalloc(sizeof *write * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s read %p size %zu\n", __func__, write, sizeof *write * depth);
-
- inv = kzalloc(sizeof *inv * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s inv %p size %zu\n", __func__, inv, sizeof *inv * depth);
-
- buf = kzalloc(sizeof *buf * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s buf %p size %zu\n", __func__, buf, sizeof *buf * depth);
-
- dma_addr = kzalloc(sizeof *dma_addr * depth, GFP_KERNEL);
- DEBUG_LOG(cb, "%s dma_addr %p size %zu\n", __func__, dma_addr, sizeof *dma_addr * depth);
-
- if (!pl || !mr || !fr || !write || !sgl || !buf || !dma_addr) {
- PRINTF(cb, "kzalloc failed\n");
- goto err1;
- }
-
- for (scnt = 0; scnt < depth; scnt++) {
- pl[scnt] = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
- if (IS_ERR(pl[scnt])) {
- PRINTF(cb, "alloc_fr_page_list failed %ld\n",
- PTR_ERR(pl[scnt]));
- goto err2;
- }
- DEBUG_LOG(cb, "%s pl[%u] %p\n", __func__, scnt, pl[scnt]);
-
- mr[scnt] = ib_alloc_fast_reg_mr(cb->pd, plen);
- if (IS_ERR(mr[scnt])) {
- PRINTF(cb, "alloc_fr failed %ld\n",
- PTR_ERR(mr[scnt]));
- goto err2;
- }
- DEBUG_LOG(cb, "%s mr[%u] %p\n", __func__, scnt, mr[scnt]);
- ib_update_fast_reg_key(mr[scnt], ++key);
-
- buf[scnt] = kmalloc(cb->size, GFP_KERNEL);
- if (!buf[scnt]) {
- PRINTF(cb, "kmalloc failed\n");
- ret = -ENOMEM;
- goto err2;
- }
- DEBUG_LOG(cb, "%s buf[%u] %p\n", __func__, scnt, buf[scnt]);
- dma_addr[scnt] = ib_dma_map_single(cb->pd->device,
- buf[scnt], cb->size,
- DMA_BIDIRECTIONAL);
- if (dma_mapping_error(cb->pd->device->dma_device,
- dma_addr[scnt])) {
- PRINTF(cb, "dma_map failed\n");
- ret = -ENOMEM;
- goto err2;
- }
- DEBUG_LOG(cb, "%s dma_addr[%u] %p\n", __func__, scnt, (void *)dma_addr[scnt]);
- for (i=0; i<plen; i++) {
- pl[scnt]->page_list[i] = ((unsigned long)dma_addr[scnt] & PAGE_MASK) + (i * PAGE_SIZE);
- DEBUG_LOG(cb, "%s pl[%u]->page_list[%u] 0x%jx\n",
- __func__, scnt, i, (uintmax_t)pl[scnt]->page_list[i]);
- }
-
- write[scnt].opcode = IB_WR_RDMA_WRITE;
- write[scnt].wr_id = scnt;
- write[scnt].wr.rdma.rkey = cb->remote_rkey;
- write[scnt].wr.rdma.remote_addr = cb->remote_addr;
- write[scnt].num_sge = 1;
- write[scnt].sg_list = &cb->rdma_sgl;
- write[scnt].sg_list->length = cb->size;
- write[scnt].next = &fr[scnt];
-
- fr[scnt].opcode = IB_WR_FAST_REG_MR;
- fr[scnt].wr_id = scnt;
- fr[scnt].wr.fast_reg.page_shift = PAGE_SHIFT;
- fr[scnt].wr.fast_reg.length = cb->size;
- fr[scnt].wr.fast_reg.page_list = pl[scnt];
- fr[scnt].wr.fast_reg.page_list_len = plen;
- fr[scnt].wr.fast_reg.iova_start = (u64)buf[scnt];
- fr[scnt].wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
- fr[scnt].wr.fast_reg.rkey = mr[scnt]->rkey;
- fr[scnt].next = &inv[scnt];
-
- inv[scnt].opcode = IB_WR_LOCAL_INV;
- inv[scnt].send_flags = IB_SEND_SIGNALED;
- inv[scnt].ex.invalidate_rkey = mr[scnt]->rkey;
-
- ret = ib_post_send(cb->qp, &write[scnt], &bad);
- if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
- goto err2;
- }
- }
-
- start = time_uptime;
- DEBUG_LOG(cb, "%s starting IO.\n", __func__);
- while (!cb->count || cb->server || count < cb->count) {
- if ((time_uptime - start) >= 9) {
- DEBUG_LOG(cb, "%s pausing 1 tick! count %u\n", __func__,
- count);
- wait_event_interruptible_timeout(cb->sem,
- cb->state == ERROR,
- 1);
- if (cb->state == ERROR)
- break;
- start = time_uptime;
- }
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n",
- ret);
- goto err2;
- }
- if (ret == 1) {
- if (wc.status) {
- PRINTF(cb,
- "completion error %u wr_id %ju "
- "opcode %d\n", wc.status,
- (uintmax_t)wc.wr_id, wc.opcode);
- goto err2;
- }
- count++;
- if (count == (cb->count -1))
- cb->rdma_sgl.lkey = 0x00dead;
- if (count == cb->count)
- break;
- ib_update_fast_reg_key(mr[wc.wr_id], ++key);
- fr[wc.wr_id].wr.fast_reg.rkey =
- mr[wc.wr_id]->rkey;
- inv[wc.wr_id].ex.invalidate_rkey =
- mr[wc.wr_id]->rkey;
- ret = ib_post_send(cb->qp, &write[wc.wr_id], &bad);
- if (ret) {
- PRINTF(cb,
- "ib_post_send failed %d\n", ret);
- goto err2;
- }
- } else if (krping_sigpending()){
- PRINTF(cb, "signal!\n");
- goto err2;
- }
- } while (ret == 1);
- }
- DEBUG_LOG(cb, "%s done!\n", __func__);
-err2:
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- DEBUG_LOG(cb, "draining the cq...\n");
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- break;
- }
- if (ret == 1) {
- if (wc.status) {
- PRINTF(cb, "completion error %u "
- "opcode %u\n", wc.status, wc.opcode);
- }
- }
- } while (ret == 1);
-
- DEBUG_LOG(cb, "destroying fr mrs!\n");
- for (scnt = 0; scnt < depth; scnt++) {
- if (mr[scnt]) {
- ib_dereg_mr(mr[scnt]);
- DEBUG_LOG(cb, "%s dereg mr %p\n", __func__, mr[scnt]);
- }
- }
- DEBUG_LOG(cb, "unmapping/freeing bufs!\n");
- for (scnt = 0; scnt < depth; scnt++) {
- if (buf[scnt]) {
- dma_unmap_single(cb->pd->device->dma_device,
- dma_addr[scnt], cb->size,
- DMA_BIDIRECTIONAL);
- kfree(buf[scnt]);
- DEBUG_LOG(cb, "%s unmap/free buf %p dma_addr %p\n", __func__, buf[scnt], (void *)dma_addr[scnt]);
- }
- }
- DEBUG_LOG(cb, "destroying fr page lists!\n");
- for (scnt = 0; scnt < depth; scnt++) {
- if (pl[scnt]) {
- DEBUG_LOG(cb, "%s free pl %p\n", __func__, pl[scnt]);
- ib_free_fast_reg_page_list(pl[scnt]);
- }
- }
-err1:
- if (pl)
- kfree(pl);
- if (mr)
- kfree(mr);
- if (fr)
- kfree(fr);
- if (write)
- kfree(write);
- if (inv)
- kfree(inv);
- if (sgl)
- kfree(sgl);
- if (buf)
- kfree(buf);
- if (dma_addr)
- kfree(dma_addr);
-}
-
-static void krping_fr_test6_server(struct krping_cb *cb)
-{
- struct ib_send_wr *bad_wr;
- struct ib_wc wc;
- int ret;
-
- /* Spin waiting for client's Start STAG/TO/Len */
- while (cb->state < RDMA_READ_ADV) {
- krping_cq_event_handler(cb->cq, cb);
- }
- DEBUG_LOG(cb, "%s client STAG %x TO 0x%jx\n", __func__,
- cb->remote_rkey, (uintmax_t)cb->remote_addr);
-
- /* Send STAG/TO/Len to client */
- krping_format_send(cb, cb->start_dma_addr);
- ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
- if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
- return;
- }
-
- /* Spin waiting for send completion */
- while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
- if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
- return;
- }
- if (wc.status) {
- PRINTF(cb, "send completiong error %d\n", wc.status);
- return;
- }
-
- if (cb->duplex)
- krping_fr_test6(cb);
- DEBUG_LOG(cb, "%s waiting for disconnect...\n", __func__);
- wait_event_interruptible(cb->sem, cb->state == ERROR);
-}
-
-static void krping_fr_test6_client(struct krping_cb *cb)
-{
- struct ib_send_wr *bad;
- struct ib_wc wc;
- int ret;
-
- cb->state = RDMA_READ_ADV;
-
- /* Send STAG/TO/Len to server */
- krping_format_send(cb, cb->start_dma_addr);
- if (cb->state == ERROR) {
- PRINTF(cb, "krping_format_send failed\n");
- return;
- }
- ret = ib_post_send(cb->qp, &cb->sq_wr, &bad);
- if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
- return;
- }
-
- /* Spin waiting for send completion */
- while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
- if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
- return;
- }
- if (wc.status) {
- PRINTF(cb, "send completion error %d\n", wc.status);
- return;
- }
-
- /* Spin waiting for server's Start STAG/TO/Len */
- while (cb->state < RDMA_WRITE_ADV) {
- krping_cq_event_handler(cb->cq, cb);
- }
- DEBUG_LOG(cb, "%s server STAG %x TO 0x%jx\n", __func__, cb->remote_rkey,
- (uintmax_t)cb->remote_addr);
-
- return krping_fr_test6(cb);
-}
-
static void krping_run_server(struct krping_cb *cb)
{
struct ib_recv_wr *bad_wr;
@@ -2281,25 +1431,25 @@ static void krping_run_server(struct krping_cb *cb)
ret = krping_setup_qp(cb, cb->child_cm_id);
if (ret) {
- PRINTF(cb, "setup_qp failed: %d\n", ret);
+ printk(KERN_ERR PFX "setup_qp failed: %d\n", ret);
goto err0;
}
ret = krping_setup_buffers(cb);
if (ret) {
- PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
+ printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret);
goto err1;
}
ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "ib_post_recv failed: %d\n", ret);
+ printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
goto err2;
}
ret = krping_accept(cb);
if (ret) {
- PRINTF(cb, "connect error %d\n", ret);
+ printk(KERN_ERR PFX "connect error %d\n", ret);
goto err2;
}
@@ -2309,26 +1459,7 @@ static void krping_run_server(struct krping_cb *cb)
krping_rlat_test_server(cb);
else if (cb->bw)
krping_bw_test_server(cb);
- else if (cb->frtest) {
- switch (cb->testnum) {
- case 1:
- case 2:
- case 3:
- case 4:
- krping_fr_test_server(cb);
- break;
- case 5:
- krping_fr_test5_server(cb);
- break;
- case 6:
- krping_fr_test6_server(cb);
- break;
- default:
- PRINTF(cb, "unknown fr test %d\n", cb->testnum);
- goto err2;
- break;
- }
- } else
+ else
krping_test_server(cb);
rdma_disconnect(cb->child_cm_id);
err2:
@@ -2364,19 +1495,19 @@ static void krping_test_client(struct krping_cb *cb)
krping_format_send(cb, cb->start_dma_addr);
if (cb->state == ERROR) {
- PRINTF(cb, "krping_format_send failed\n");
+ printk(KERN_ERR PFX "krping_format_send failed\n");
break;
}
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
break;
}
/* Wait for server to ACK */
wait_event_interruptible(cb->sem, cb->state >= RDMA_WRITE_ADV);
if (cb->state != RDMA_WRITE_ADV) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"wait for RDMA_WRITE_ADV state %d\n",
cb->state);
break;
@@ -2385,7 +1516,7 @@ static void krping_test_client(struct krping_cb *cb)
krping_format_send(cb, cb->rdma_dma_addr);
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
break;
}
@@ -2393,7 +1524,7 @@ static void krping_test_client(struct krping_cb *cb)
wait_event_interruptible(cb->sem,
cb->state >= RDMA_WRITE_COMPLETE);
if (cb->state != RDMA_WRITE_COMPLETE) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"wait for RDMA_WRITE_COMPLETE state %d\n",
cb->state);
break;
@@ -2401,20 +1532,12 @@ static void krping_test_client(struct krping_cb *cb)
if (cb->validate)
if (memcmp(cb->start_buf, cb->rdma_buf, cb->size)) {
- PRINTF(cb, "data mismatch!\n");
+ printk(KERN_ERR PFX "data mismatch!\n");
break;
}
- if (cb->verbose) {
- if (strlen(cb->rdma_buf) > 128) {
- char msgbuf[128];
-
- strlcpy(msgbuf, cb->rdma_buf, sizeof(msgbuf));
- PRINTF(cb, "ping data stripped: %s\n",
- msgbuf);
- } else
- PRINTF(cb, "ping data: %s\n", cb->rdma_buf);
- }
+ if (cb->verbose)
+ printk(KERN_INFO PFX "ping data: %s\n", cb->rdma_buf);
#ifdef SLOW_KRPING
wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
#endif
@@ -2432,23 +1555,23 @@ static void krping_rlat_test_client(struct krping_cb *cb)
/* Send STAG/TO/Len to client */
krping_format_send(cb, cb->start_dma_addr);
if (cb->state == ERROR) {
- PRINTF(cb, "krping_format_send failed\n");
+ printk(KERN_ERR PFX "krping_format_send failed\n");
return;
}
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
return;
}
/* Spin waiting for send completion */
while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
+ printk(KERN_ERR PFX "poll error %d\n", ret);
return;
}
if (wc.status) {
- PRINTF(cb, "send completion error %d\n", wc.status);
+ printk(KERN_ERR PFX "send completion error %d\n", wc.status);
return;
}
@@ -2468,29 +1591,29 @@ static void krping_rlat_test_client(struct krping_cb *cb)
struct ib_send_wr *bad_wr;
int ne;
- cb->rdma_sq_wr.opcode = IB_WR_RDMA_WRITE;
- cb->rdma_sq_wr.wr.rdma.rkey = cb->remote_rkey;
- cb->rdma_sq_wr.wr.rdma.remote_addr = cb->remote_addr;
- cb->rdma_sq_wr.sg_list->length = 0;
- cb->rdma_sq_wr.num_sge = 0;
+ cb->rdma_sq_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ cb->rdma_sq_wr.rkey = cb->remote_rkey;
+ cb->rdma_sq_wr.remote_addr = cb->remote_addr;
+ cb->rdma_sq_wr.wr.sg_list->length = 0;
+ cb->rdma_sq_wr.wr.num_sge = 0;
microtime(&start);
for (i=0; i < 100000; i++) {
- if (ib_post_send(cb->qp, &cb->rdma_sq_wr, &bad_wr)) {
- PRINTF(cb, "Couldn't post send\n");
+ if (ib_post_send(cb->qp, &cb->rdma_sq_wr.wr, &bad_wr)) {
+ printk(KERN_ERR PFX "Couldn't post send\n");
return;
}
do {
ne = ib_poll_cq(cb->cq, 1, &wc);
} while (ne == 0);
if (ne < 0) {
- PRINTF(cb, "poll CQ failed %d\n", ne);
+ printk(KERN_ERR PFX "poll CQ failed %d\n", ne);
return;
}
if (wc.status != IB_WC_SUCCESS) {
- PRINTF(cb, "Completion wth error at %s:\n",
+ printk(KERN_ERR PFX "Completion with error at %s:\n",
cb->server ? "server" : "client");
- PRINTF(cb, "Failed status %d: wr_id %d\n",
+ printk(KERN_ERR PFX "Failed status %d: wr_id %d\n",
wc.status, (int) wc.wr_id);
return;
}
@@ -2504,7 +1627,7 @@ static void krping_rlat_test_client(struct krping_cb *cb)
sec = stop.tv_sec - start.tv_sec;
usec = stop.tv_usec - start.tv_usec;
elapsed = sec * 1000000 + usec;
- PRINTF(cb, "0B-write-lat iters 100000 usec %llu\n", elapsed);
+ printk(KERN_ERR PFX "0B-write-lat iters 100000 usec %llu\n", elapsed);
}
#endif
@@ -2522,23 +1645,23 @@ static void krping_wlat_test_client(struct krping_cb *cb)
/* Send STAG/TO/Len to client */
krping_format_send(cb, cb->start_dma_addr);
if (cb->state == ERROR) {
- PRINTF(cb, "krping_format_send failed\n");
+ printk(KERN_ERR PFX "krping_format_send failed\n");
return;
}
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
return;
}
/* Spin waiting for send completion */
while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
+ printk(KERN_ERR PFX "poll error %d\n", ret);
return;
}
if (wc.status) {
- PRINTF(cb, "send completion error %d\n", wc.status);
+ printk(KERN_ERR PFX "send completion error %d\n", wc.status);
return;
}
@@ -2561,23 +1684,23 @@ static void krping_bw_test_client(struct krping_cb *cb)
/* Send STAG/TO/Len to client */
krping_format_send(cb, cb->start_dma_addr);
if (cb->state == ERROR) {
- PRINTF(cb, "krping_format_send failed\n");
+ printk(KERN_ERR PFX "krping_format_send failed\n");
return;
}
ret = ib_post_send(cb->qp, &cb->sq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "post send error %d\n", ret);
+ printk(KERN_ERR PFX "post send error %d\n", ret);
return;
}
/* Spin waiting for send completion */
while ((ret = ib_poll_cq(cb->cq, 1, &wc) == 0));
if (ret < 0) {
- PRINTF(cb, "poll error %d\n", ret);
+ printk(KERN_ERR PFX "poll error %d\n", ret);
return;
}
if (wc.status) {
- PRINTF(cb, "send completion error %d\n", wc.status);
+ printk(KERN_ERR PFX "send completion error %d\n", wc.status);
return;
}
@@ -2589,261 +1712,101 @@ static void krping_bw_test_client(struct krping_cb *cb)
bw_test(cb);
}
-
/*
- * fastreg 2 valid different mrs and verify the completions.
+ * Manual qp flush test
*/
-static void krping_fr_test1(struct krping_cb *cb)
+static void flush_qp(struct krping_cb *cb)
{
- struct ib_fast_reg_page_list *pl;
- struct ib_send_wr fr, *bad;
+ struct ib_send_wr wr = { 0 }, *bad;
+ struct ib_recv_wr recv_wr = { 0 }, *recv_bad;
struct ib_wc wc;
- struct ib_mr *mr1, *mr2;
- int i;
int ret;
- int size = cb->size;
- int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
- int count = 0;
+ int flushed = 0;
+ int ccnt = 0;
- pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
- if (IS_ERR(pl)) {
- PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
- return;
- }
-
- mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
- if (IS_ERR(mr1)) {
- PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
- goto err1;
- }
- mr2 = ib_alloc_fast_reg_mr(cb->pd, plen);
- if (IS_ERR(mr2)) {
- PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
- goto err2;
- }
-
-
- for (i=0; i<plen; i++)
- pl->page_list[i] = i * PAGE_SIZE;
+ rdma_disconnect(cb->cm_id);
+ DEBUG_LOG("disconnected!\n");
- memset(&fr, 0, sizeof fr);
- fr.opcode = IB_WR_FAST_REG_MR;
- fr.wr_id = 1;
- fr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fr.wr.fast_reg.length = size;
- fr.wr.fast_reg.page_list = pl;
- fr.wr.fast_reg.page_list_len = plen;
- fr.wr.fast_reg.iova_start = 0;
- fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
- fr.send_flags = IB_SEND_SIGNALED;
- fr.wr.fast_reg.rkey = mr1->rkey;
- DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
- ret = ib_post_send(cb->qp, &fr, &bad);
- if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
- goto err3;
- }
- fr.wr.fast_reg.rkey = mr2->rkey;
- DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
- ret = ib_post_send(cb->qp, &fr, &bad);
+ wr.opcode = IB_WR_SEND;
+ wr.wr_id = 0xdeadbeefcafebabe;
+ ret = ib_post_send(cb->qp, &wr, &bad);
if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
- goto err3;
- }
-
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- goto err3;
- }
- if (ret == 1) {
- DEBUG_LOG(cb, "completion status %u wr %s\n",
- wc.status, wc.wr_id == 1 ? "fr" : "inv");
- count++;
- } else if (krping_sigpending()) {
- PRINTF(cb, "signal!\n");
- goto err3;
- }
-
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- } while (count != 2);
-err3:
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- DEBUG_LOG(cb, "draining the cq...\n");
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- break;
- }
- if (ret == 1) {
- PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
- }
- } while (ret == 1);
- DEBUG_LOG(cb, "destroying fr mr2!\n");
-
- ib_dereg_mr(mr2);
-err2:
- DEBUG_LOG(cb, "destroying fr mr1!\n");
- ib_dereg_mr(mr1);
-err1:
- DEBUG_LOG(cb, "destroying fr page list!\n");
- ib_free_fast_reg_page_list(pl);
- DEBUG_LOG(cb, "%s done!\n", __func__);
-}
-
-/*
- * fastreg the same mr twice, 2nd one should produce error cqe.
- */
-static void krping_fr_test2(struct krping_cb *cb)
-{
- struct ib_fast_reg_page_list *pl;
- struct ib_send_wr fr, *bad;
- struct ib_wc wc;
- struct ib_mr *mr1;
- int i;
- int ret;
- int size = cb->size;
- int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
- int count = 0;
-
- pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
- if (IS_ERR(pl)) {
- PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
+ printk(KERN_ERR PFX "%s post_send failed ret %d\n", __func__, ret);
return;
}
- mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
- if (IS_ERR(mr1)) {
- PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
- goto err1;
- }
-
- for (i=0; i<plen; i++)
- pl->page_list[i] = i * PAGE_SIZE;
-
- memset(&fr, 0, sizeof fr);
- fr.opcode = IB_WR_FAST_REG_MR;
- fr.wr_id = 1;
- fr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fr.wr.fast_reg.length = size;
- fr.wr.fast_reg.page_list = pl;
- fr.wr.fast_reg.page_list_len = plen;
- fr.wr.fast_reg.iova_start = 0;
- fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
- fr.send_flags = IB_SEND_SIGNALED;
- fr.wr.fast_reg.rkey = mr1->rkey;
- DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
- ret = ib_post_send(cb->qp, &fr, &bad);
+ recv_wr.wr_id = 0xcafebabedeadbeef;
+ ret = ib_post_recv(cb->qp, &recv_wr, &recv_bad);
if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
- goto err3;
- }
- DEBUG_LOG(cb, "%s fr2: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
- ret = ib_post_send(cb->qp, &fr, &bad);
- if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
- goto err3;
+ printk(KERN_ERR PFX "%s post_recv failed ret %d\n", __func__, ret);
+ return;
}
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
+ /* poll until the flush WRs complete */
do {
ret = ib_poll_cq(cb->cq, 1, &wc);
if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- goto err3;
- }
- if (ret == 1) {
- DEBUG_LOG(cb, "completion status %u wr %s\n",
- wc.status, wc.wr_id == 1 ? "fr" : "inv");
- count++;
- } else if (krping_sigpending()) {
- PRINTF(cb, "signal!\n");
- goto err3;
- }
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- } while (count != 2);
-err3:
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- DEBUG_LOG(cb, "draining the cq...\n");
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- break;
- }
- if (ret == 1) {
- PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
+ printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret);
+ return;
}
- } while (ret == 1);
- DEBUG_LOG(cb, "destroying fr mr1!\n");
- ib_dereg_mr(mr1);
-err1:
- DEBUG_LOG(cb, "destroying fr page list!\n");
- ib_free_fast_reg_page_list(pl);
- DEBUG_LOG(cb, "%s done!\n", __func__);
+ if (ret == 0)
+ continue;
+ ccnt++;
+ if (wc.wr_id == 0xdeadbeefcafebabe ||
+ wc.wr_id == 0xcafebabedeadbeef)
+ flushed++;
+ } while (flushed != 2);
+ DEBUG_LOG("qp_flushed! ccnt %u\n", ccnt);
}
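For reference, the manual flush above relies on rdma_disconnect() having moved the QP into the error state: the two marker work requests, and anything still queued behind them, then complete with a flush status, and polling until both wr_id cookies come back shows that the send and receive queues are drained. A minimal sketch of where such a flush fits into a teardown path follows; the example_client_teardown() name and the exact ordering of the krping_free_buffers()/krping_free_qp() calls are illustrative assumptions, not something this patch prescribes.

/*
 * Illustrative teardown ordering only: flush_qp() already calls
 * rdma_disconnect() and reaps the two marker completions, after which
 * no posted WR can still reference the registered buffers.  The
 * krping_free_buffers()/krping_free_qp() names follow this file, but
 * treat the exact ordering as an assumption, not part of this patch.
 */
static void example_client_teardown(struct krping_cb *cb)
{
	flush_qp(cb);			/* disconnect + drain SQ and RQ */
	krping_free_buffers(cb);	/* safe: no WR references them now */
	krping_free_qp(cb);		/* tear down QP, CQ and PD */
}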
-/*
- * fastreg pipelined in a loop as fast as we can until the user interrupts.
- * NOTE: every 9 seconds we sleep for 1 second to keep the kernel happy.
- */
-static void krping_fr_test3(struct krping_cb *cb)
+static void krping_fr_test(struct krping_cb *cb)
{
- struct ib_fast_reg_page_list *pl;
- struct ib_send_wr fr, inv, *bad;
+ struct ib_send_wr inv, *bad;
+ struct ib_reg_wr fr;
struct ib_wc wc;
u8 key = 0;
struct ib_mr *mr;
- int i;
int ret;
int size = cb->size;
int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
unsigned long start;
int count = 0;
int scnt = 0;
+ struct scatterlist sg = {0};
-
- pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
- if (IS_ERR(pl)) {
- PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
+ mr = ib_alloc_mr(cb->pd, IB_MR_TYPE_MEM_REG, plen);
+ if (IS_ERR(mr)) {
+ printk(KERN_ERR PFX "ib_alloc_mr failed %ld\n", PTR_ERR(mr));
return;
}
-
- mr = ib_alloc_fast_reg_mr(cb->pd, plen);
- if (IS_ERR(mr)) {
- PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
- goto err1;
+
+ sg_dma_address(&sg) = 0xcafebabe0000UL;
+ sg_dma_len(&sg) = size;
+ ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE);
+ if (ret <= 0) {
+ printk(KERN_ERR PFX "ib_map_mr_sg err %d\n", ret);
+ goto err2;
}
- for (i=0; i<plen; i++)
- pl->page_list[i] = i * PAGE_SIZE;
-
memset(&fr, 0, sizeof fr);
- fr.opcode = IB_WR_FAST_REG_MR;
- fr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fr.wr.fast_reg.length = size;
- fr.wr.fast_reg.page_list = pl;
- fr.wr.fast_reg.page_list_len = plen;
- fr.wr.fast_reg.iova_start = 0;
- fr.send_flags = IB_SEND_SIGNALED;
- fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
- fr.next = &inv;
+ fr.wr.opcode = IB_WR_REG_MR;
+ fr.access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
+ fr.mr = mr;
+ fr.wr.next = &inv;
+
memset(&inv, 0, sizeof inv);
inv.opcode = IB_WR_LOCAL_INV;
inv.send_flags = IB_SEND_SIGNALED;
- DEBUG_LOG(cb, "fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
+ DEBUG_LOG("fr_test: stag index 0x%x plen %u size %u depth %u\n", mr->rkey >> 8, plen, cb->size, cb->txdepth);
start = time_uptime;
- while (1) {
+ while (!cb->count || count <= cb->count) {
+ if (SIGPENDING(curthread)) {
+ printk(KERN_ERR PFX "signal!\n");
+ break;
+ }
if ((time_uptime - start) >= 9) {
- DEBUG_LOG(cb, "fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
+ DEBUG_LOG("fr_test: pausing 1 second! count %u latest size %u plen %u\n", count, size, plen);
wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
if (cb->state == ERROR)
break;
@@ -2851,183 +1814,44 @@ static void krping_fr_test3(struct krping_cb *cb)
}
while (scnt < (cb->txdepth>>1)) {
ib_update_fast_reg_key(mr, ++key);
- fr.wr.fast_reg.rkey = mr->rkey;
+ fr.key = mr->rkey;
inv.ex.invalidate_rkey = mr->rkey;
+
size = arc4random() % cb->size;
if (size == 0)
size = cb->size;
- plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
- fr.wr.fast_reg.length = size;
- fr.wr.fast_reg.page_list_len = plen;
- ret = ib_post_send(cb->qp, &fr, &bad);
+ sg_dma_len(&sg) = size;
+ ret = ib_map_mr_sg(mr, &sg, 1, NULL, PAGE_SIZE);
+ if (ret <= 0) {
+ printk(KERN_ERR PFX "ib_map_mr_sg err %d\n", ret);
+ goto err2;
+ }
+ ret = ib_post_send(cb->qp, &fr.wr, &bad);
if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
+ printk(KERN_ERR PFX "ib_post_send failed %d\n", ret);
goto err2;
}
- scnt+=2;
+ scnt++;
}
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- goto err2;
- }
- if (ret == 1) {
- if (wc.status) {
- PRINTF(cb, "completion error %u\n", wc.status);
- goto err2;
- }
- count++;
- scnt--;
- }
- else if (krping_sigpending()) {
- PRINTF(cb, "signal!\n");
- goto err2;
- }
- } while (ret == 1);
- }
-err2:
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- DEBUG_LOG(cb, "draining the cq...\n");
- do {
ret = ib_poll_cq(cb->cq, 1, &wc);
if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- break;
+ printk(KERN_ERR PFX "ib_poll_cq failed %d\n", ret);
+ goto err2;
}
if (ret == 1) {
if (wc.status) {
- PRINTF(cb, "completion error %u opcode %u\n", wc.status, wc.opcode);
+ printk(KERN_ERR PFX "completion error %u\n", wc.status);
+ goto err2;
}
- }
- } while (ret == 1);
- DEBUG_LOG(cb, "fr_test: done!\n");
- ib_dereg_mr(mr);
-err1:
- DEBUG_LOG(cb, "destroying fr page list!\n");
- ib_free_fast_reg_page_list(pl);
- DEBUG_LOG(cb, "%s done!\n", __func__);
-}
-
-/*
- * fastreg 1 and invalidate 1 mr and verify completion.
- */
-static void krping_fr_test4(struct krping_cb *cb)
-{
- struct ib_fast_reg_page_list *pl;
- struct ib_send_wr fr, inv, *bad;
- struct ib_wc wc;
- struct ib_mr *mr1;
- int i;
- int ret;
- int size = cb->size;
- int plen = (((size - 1) & PAGE_MASK) + PAGE_SIZE) >> PAGE_SHIFT;
- int count = 0;
-
- pl = ib_alloc_fast_reg_page_list(cb->qp->device, plen);
- if (IS_ERR(pl)) {
- PRINTF(cb, "ib_alloc_fast_reg_page_list failed %ld\n", PTR_ERR(pl));
- return;
- }
-
- mr1 = ib_alloc_fast_reg_mr(cb->pd, plen);
- if (IS_ERR(mr1)) {
- PRINTF(cb, "ib_alloc_fast_reg_mr failed %ld\n", PTR_ERR(pl));
- goto err1;
- }
-
- for (i=0; i<plen; i++)
- pl->page_list[i] = i * PAGE_SIZE;
-
- memset(&fr, 0, sizeof fr);
- fr.opcode = IB_WR_FAST_REG_MR;
- fr.wr_id = 1;
- fr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fr.wr.fast_reg.length = size;
- fr.wr.fast_reg.page_list = pl;
- fr.wr.fast_reg.page_list_len = plen;
- fr.wr.fast_reg.iova_start = 0;
- fr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE;
- fr.send_flags = IB_SEND_SIGNALED;
- fr.wr.fast_reg.rkey = mr1->rkey;
- fr.next = &inv;
- memset(&inv, 0, sizeof inv);
- inv.opcode = IB_WR_LOCAL_INV;
- inv.ex.invalidate_rkey = mr1->rkey;
-
- DEBUG_LOG(cb, "%s fr1: stag 0x%x plen %u size %u depth %u\n", __func__, fr.wr.fast_reg.rkey, plen, cb->size, cb->txdepth);
- ret = ib_post_send(cb->qp, &fr, &bad);
- if (ret) {
- PRINTF(cb, "ib_post_send failed %d\n", ret);
- goto err3;
- }
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- goto err3;
- }
- if (ret == 1) {
- DEBUG_LOG(cb, "completion status %u wr %s\n",
- wc.status, wc.wr_id == 1 ? "fr" : "inv");
count++;
- } else if (krping_sigpending()) {
- PRINTF(cb, "signal!\n");
- goto err3;
+ scnt--;
}
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- } while (count != 1);
-err3:
- DEBUG_LOG(cb, "sleeping 1 second\n");
- wait_event_interruptible_timeout(cb->sem, cb->state == ERROR, HZ);
- DEBUG_LOG(cb, "draining the cq...\n");
- do {
- ret = ib_poll_cq(cb->cq, 1, &wc);
- if (ret < 0) {
- PRINTF(cb, "ib_poll_cq failed %d\n", ret);
- break;
- }
- if (ret == 1) {
- PRINTF(cb, "completion %u opcode %u\n", wc.status, wc.opcode);
- }
- } while (ret == 1);
- DEBUG_LOG(cb, "destroying fr mr1!\n");
- ib_dereg_mr(mr1);
-err1:
- DEBUG_LOG(cb, "destroying fr page list!\n");
- ib_free_fast_reg_page_list(pl);
- DEBUG_LOG(cb, "%s done!\n", __func__);
-}
-
-static void krping_fr_test(struct krping_cb *cb)
-{
- switch (cb->testnum) {
- case 1:
- krping_fr_test1(cb);
- break;
- case 2:
- krping_fr_test2(cb);
- break;
- case 3:
- krping_fr_test3(cb);
- break;
- case 4:
- krping_fr_test4(cb);
- break;
- case 5:
- krping_fr_test5_client(cb);
- break;
- case 6:
- krping_fr_test6_client(cb);
- break;
- default:
- PRINTF(cb, "Unkown frtest num %u\n", cb->testnum);
- break;
}
+err2:
+ flush_qp(cb);
+ DEBUG_LOG("fr_test: done!\n");
+ ib_dereg_mr(mr);
}
static int krping_connect_client(struct krping_cb *cb)
@@ -3042,50 +1866,45 @@ static int krping_connect_client(struct krping_cb *cb)
ret = rdma_connect(cb->cm_id, &conn_param);
if (ret) {
- PRINTF(cb, "rdma_connect error %d\n", ret);
+ printk(KERN_ERR PFX "rdma_connect error %d\n", ret);
return ret;
}
wait_event_interruptible(cb->sem, cb->state >= CONNECTED);
if (cb->state == ERROR) {
- PRINTF(cb, "wait for CONNECTED state %d\n", cb->state);
+ printk(KERN_ERR PFX "wait for CONNECTED state %d\n", cb->state);
return -1;
}
- DEBUG_LOG(cb, "rdma_connect successful\n");
+ DEBUG_LOG("rdma_connect successful\n");
return 0;
}
static int krping_bind_client(struct krping_cb *cb)
{
- struct sockaddr_in sin;
+ struct sockaddr_storage sin;
int ret;
- memset(&sin, 0, sizeof(sin));
- sin.sin_len = sizeof sin;
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = cb->addr.s_addr;
- sin.sin_port = cb->port;
+ fill_sockaddr(&sin, cb);
- ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *) &sin,
- 2000);
+ ret = rdma_resolve_addr(cb->cm_id, NULL, (struct sockaddr *)&sin, 2000);
if (ret) {
- PRINTF(cb, "rdma_resolve_addr error %d\n", ret);
+ printk(KERN_ERR PFX "rdma_resolve_addr error %d\n", ret);
return ret;
}
wait_event_interruptible(cb->sem, cb->state >= ROUTE_RESOLVED);
if (cb->state != ROUTE_RESOLVED) {
- PRINTF(cb,
+ printk(KERN_ERR PFX
"addr/route resolution did not resolve: state %d\n",
cb->state);
return -EINTR;
}
- if (cb->mem == FASTREG && !fastreg_supported(cb, 0))
+ if (!reg_supported(cb->cm_id->device))
return -EINVAL;
- DEBUG_LOG(cb, "rdma_resolve_addr - rdma_resolve_route successful\n");
+ DEBUG_LOG("rdma_resolve_addr - rdma_resolve_route successful\n");
return 0;
}
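krping_bind_client() now builds the address through a fill_sockaddr() helper (defined elsewhere in this patch) so a single sockaddr_storage can carry either an IPv4 or IPv6 endpoint. A hypothetical sketch of the shape such a helper needs, inferred only from the cb->addr_type, cb->addr and cb->port fields that krping_doit() below populates, follows; it is not the patch's actual implementation and assumes krping.c's existing includes and struct krping_cb definition.

/*
 * Hypothetical sketch only -- the real fill_sockaddr() lives elsewhere
 * in this patch and may differ.  It is inferred from the fields set in
 * krping_doit(): cb->addr_type (AF_INET/AF_INET6), cb->addr (raw
 * address bytes) and cb->port (already in network byte order).
 */
static void example_fill_sockaddr(struct sockaddr_storage *sin,
				  struct krping_cb *cb)
{
	memset(sin, 0, sizeof(*sin));
	if (cb->addr_type == AF_INET) {
		struct sockaddr_in *sin4 = (struct sockaddr_in *)sin;

		sin4->sin_len = sizeof(*sin4);
		sin4->sin_family = AF_INET;
		memcpy(&sin4->sin_addr.s_addr, cb->addr,
		    sizeof(sin4->sin_addr.s_addr));
		sin4->sin_port = cb->port;
	}
	/* An AF_INET6 branch would mirror this with a sockaddr_in6,
	 * which the 'A' option currently rejects anyway. */
}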
@@ -3100,25 +1919,25 @@ static void krping_run_client(struct krping_cb *cb)
ret = krping_setup_qp(cb, cb->cm_id);
if (ret) {
- PRINTF(cb, "setup_qp failed: %d\n", ret);
+ printk(KERN_ERR PFX "setup_qp failed: %d\n", ret);
return;
}
ret = krping_setup_buffers(cb);
if (ret) {
- PRINTF(cb, "krping_setup_buffers failed: %d\n", ret);
+ printk(KERN_ERR PFX "krping_setup_buffers failed: %d\n", ret);
goto err1;
}
ret = ib_post_recv(cb->qp, &cb->rq_wr, &bad_wr);
if (ret) {
- PRINTF(cb, "ib_post_recv failed: %d\n", ret);
+ printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
goto err2;
}
ret = krping_connect_client(cb);
if (ret) {
- PRINTF(cb, "connect error %d\n", ret);
+ printk(KERN_ERR PFX "connect error %d\n", ret);
goto err2;
}
@@ -3139,7 +1958,7 @@ err1:
krping_free_qp(cb);
}
-int krping_doit(char *cmd, void *cookie)
+int krping_doit(char *cmd)
{
struct krping_cb *cb;
int op;
@@ -3155,69 +1974,76 @@ int krping_doit(char *cmd, void *cookie)
list_add_tail(&cb->list, &krping_cbs);
mutex_unlock(&krping_mutex);
- cb->cookie = cookie;
cb->server = -1;
cb->state = IDLE;
cb->size = 64;
cb->txdepth = RPING_SQ_DEPTH;
- cb->mem = DMA;
init_waitqueue_head(&cb->sem);
while ((op = krping_getopt("krping", &cmd, krping_opts, NULL, &optarg,
&optint)) != 0) {
switch (op) {
+ struct in_addr in_addr;
case 'a':
cb->addr_str = optarg;
- DEBUG_LOG(cb, "ipaddr (%s)\n", optarg);
- if (!inet_aton(optarg, &cb->addr)) {
- PRINTF(cb, "bad addr string %s\n",
+ cb->addr_type = AF_INET;
+ DEBUG_LOG("ipaddr (%s)\n", optarg);
+ if (!inet_aton(optarg, &in_addr)) {
+ printk(KERN_ERR PFX "bad addr string %s\n",
optarg);
ret = EINVAL;
}
+ memcpy(cb->addr, &in_addr.s_addr, sizeof(in_addr.s_addr));
+ break;
+ case 'A':
+ cb->addr_str = optarg;
+ cb->addr_type = AF_INET6;
+ DEBUG_LOG("ipv6addr (%s)\n", optarg);
+ ret = EAFNOSUPPORT; /* XXX not supported */
break;
case 'p':
cb->port = htons(optint);
- DEBUG_LOG(cb, "port %d\n", (int)optint);
+ DEBUG_LOG("port %d\n", (int)optint);
break;
case 'P':
cb->poll = 1;
- DEBUG_LOG(cb, "server\n");
+ DEBUG_LOG("poll\n");
break;
case 's':
cb->server = 1;
- DEBUG_LOG(cb, "server\n");
+ DEBUG_LOG("server\n");
break;
case 'c':
cb->server = 0;
- DEBUG_LOG(cb, "client\n");
+ DEBUG_LOG("client\n");
break;
case 'S':
cb->size = optint;
if ((cb->size < 1) ||
(cb->size > RPING_BUFSIZE)) {
- PRINTF(cb, "Invalid size %d "
+ printk(KERN_ERR PFX "Invalid size %d "
"(valid range is 1 to %d)\n",
cb->size, RPING_BUFSIZE);
ret = EINVAL;
} else
- DEBUG_LOG(cb, "size %d\n", (int)optint);
+ DEBUG_LOG("size %d\n", (int)optint);
break;
case 'C':
cb->count = optint;
if (cb->count < 0) {
- PRINTF(cb, "Invalid count %d\n",
+ printk(KERN_ERR PFX "Invalid count %d\n",
cb->count);
ret = EINVAL;
} else
- DEBUG_LOG(cb, "count %d\n", (int) cb->count);
+ DEBUG_LOG("count %d\n", (int) cb->count);
break;
case 'v':
cb->verbose++;
- DEBUG_LOG(cb, "verbose\n");
+ DEBUG_LOG("verbose\n");
break;
case 'V':
cb->validate++;
- DEBUG_LOG(cb, "validate data\n");
+ DEBUG_LOG("validate data\n");
break;
case 'l':
cb->wlat++;
@@ -3231,45 +2057,27 @@ int krping_doit(char *cmd, void *cookie)
case 'd':
cb->duplex++;
break;
- case 'm':
- if (!strncmp(optarg, "dma", 3))
- cb->mem = DMA;
- else if (!strncmp(optarg, "fastreg", 7))
- cb->mem = FASTREG;
- else if (!strncmp(optarg, "mw", 2))
- cb->mem = MW;
- else if (!strncmp(optarg, "mr", 2))
- cb->mem = MR;
- else {
- PRINTF(cb, "unknown mem mode %s. "
- "Must be dma, fastreg, mw, or mr\n",
- optarg);
- ret = -EINVAL;
- break;
- }
- break;
case 'I':
cb->server_invalidate = 1;
break;
case 'T':
cb->txdepth = optint;
- DEBUG_LOG(cb, "txdepth %d\n", (int) cb->txdepth);
+ DEBUG_LOG("txdepth %d\n", (int) cb->txdepth);
break;
case 'Z':
cb->local_dma_lkey = 1;
- DEBUG_LOG(cb, "using local dma lkey\n");
+ DEBUG_LOG("using local dma lkey\n");
break;
case 'R':
cb->read_inv = 1;
- DEBUG_LOG(cb, "using read-with-inv\n");
+ DEBUG_LOG("using read-with-inv\n");
break;
case 'f':
cb->frtest = 1;
- cb->testnum = optint;
- DEBUG_LOG(cb, "fast-reg test!\n");
+ DEBUG_LOG("fast-reg test!\n");
break;
default:
- PRINTF(cb, "unknown opt %s\n", optarg);
+ printk(KERN_ERR PFX "unknown opt %s\n", optarg);
ret = -EINVAL;
break;
}
@@ -3278,48 +2086,43 @@ int krping_doit(char *cmd, void *cookie)
goto out;
if (cb->server == -1) {
- PRINTF(cb, "must be either client or server\n");
+ printk(KERN_ERR PFX "must be either client or server\n");
ret = -EINVAL;
goto out;
}
- if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
- PRINTF(cb, "Pick only one test: fr, bw, rlat, wlat\n");
- ret = -EINVAL;
- goto out;
- }
- if (cb->server_invalidate && cb->mem != FASTREG) {
- PRINTF(cb, "server_invalidate only valid with fastreg mem_mode\n");
+ if (cb->server && cb->frtest) {
+ printk(KERN_ERR PFX "must be client to run frtest\n");
ret = -EINVAL;
goto out;
}
- if (cb->read_inv && cb->mem != FASTREG) {
- PRINTF(cb, "read_inv only valid with fastreg mem_mode\n");
+ if ((cb->frtest + cb->bw + cb->rlat + cb->wlat) > 1) {
+ printk(KERN_ERR PFX "Pick only one test: fr, bw, rlat, wlat\n");
ret = -EINVAL;
goto out;
}
- if (cb->mem != MR && (cb->wlat || cb->rlat || cb->bw || cb->frtest)) {
- PRINTF(cb, "wlat, rlat, and bw tests only support mem_mode MR\n");
+ if (cb->wlat || cb->rlat || cb->bw) {
+ printk(KERN_ERR PFX "wlat, rlat, and bw tests only support mem_mode MR - which is no longer supported\n");
ret = -EINVAL;
goto out;
}
- cb->cm_id = rdma_create_id(krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
+ cb->cm_id = rdma_create_id(&init_net, krping_cma_event_handler, cb, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cb->cm_id)) {
ret = PTR_ERR(cb->cm_id);
- PRINTF(cb, "rdma_create_id error %d\n", ret);
+ printk(KERN_ERR PFX "rdma_create_id error %d\n", ret);
goto out;
}
- DEBUG_LOG(cb, "created cm_id %p\n", cb->cm_id);
+ DEBUG_LOG("created cm_id %p\n", cb->cm_id);
if (cb->server)
krping_run_server(cb);
else
krping_run_client(cb);
- DEBUG_LOG(cb, "destroy cm_id %p\n", cb->cm_id);
+ DEBUG_LOG("destroy cm_id %p\n", cb->cm_id);
rdma_destroy_id(cb->cm_id);
out:
mutex_lock(&krping_mutex);
@@ -3339,9 +2142,3 @@ krping_walk_cb_list(void (*f)(struct krping_stats *, void *), void *arg)
(*f)(cb->pd ? &cb->stats : NULL, arg);
mutex_unlock(&krping_mutex);
}
-
-void krping_init(void)
-{
-
- mutex_init(&krping_mutex);
-}
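The largest functional change in krping.c above is the move from the removed fast-register interface (ib_alloc_fast_reg_mr(), ib_alloc_fast_reg_page_list(), IB_WR_FAST_REG_MR) to the newer memory-registration verbs used by krping_fr_test(): ib_alloc_mr(), ib_map_mr_sg() and an IB_WR_REG_MR work request described by struct ib_reg_wr. A minimal sketch of that flow, with a chained local invalidate as the test posts, follows; the function and parameter names (pd, qp, sgl, nents, mrp) are illustrative and error handling is abbreviated.

/*
 * Sketch of the post-fast_reg registration flow this patch adopts.
 * All names here are illustrative; only the verbs calls and the
 * ib_reg_wr layout mirror what krping_fr_test() actually does.
 */
#include <linux/err.h>
#include <rdma/ib_verbs.h>

static int example_reg_mr(struct ib_pd *pd, struct ib_qp *qp,
			  struct scatterlist *sgl, int nents,
			  struct ib_mr **mrp)
{
	struct ib_reg_wr reg;
	struct ib_send_wr inv, *bad;
	struct ib_mr *mr;
	int n, ret;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	/* Build the MR's page list directly from the scatterlist. */
	n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
	if (n <= 0) {
		ret = n ? n : -EINVAL;
		goto err;
	}

	memset(&reg, 0, sizeof(reg));
	memset(&inv, 0, sizeof(inv));

	/* IB_WR_REG_MR replaces the old IB_WR_FAST_REG_MR request. */
	reg.wr.opcode = IB_WR_REG_MR;
	reg.mr = mr;
	reg.key = mr->rkey;
	reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;

	/* Chain a signaled local invalidate, as the fr test does. */
	reg.wr.next = &inv;
	inv.opcode = IB_WR_LOCAL_INV;
	inv.send_flags = IB_SEND_SIGNALED;
	inv.ex.invalidate_rkey = mr->rkey;

	ret = ib_post_send(qp, &reg.wr, &bad);
	if (ret)
		goto err;

	*mrp = mr;		/* caller releases it with ib_dereg_mr() */
	return 0;
err:
	ib_dereg_mr(mr);
	return ret;
}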
diff --git a/sys/contrib/rdma/krping/krping.h b/sys/contrib/rdma/krping/krping.h
index f201d10475f29..5987b1c8625f8 100644
--- a/sys/contrib/rdma/krping/krping.h
+++ b/sys/contrib/rdma/krping/krping.h
@@ -14,7 +14,6 @@ struct krping_stats {
char name[16];
};
-int krping_doit(char *, void *);
+int krping_doit(char *);
void krping_walk_cb_list(void (*)(struct krping_stats *, void *), void *);
-void krping_init(void);
int krping_sigpending(void);
diff --git a/sys/contrib/rdma/krping/krping_dev.c b/sys/contrib/rdma/krping/krping_dev.c
index 7902ebfa8f4d4..b1e009bf51b13 100644
--- a/sys/contrib/rdma/krping/krping_dev.c
+++ b/sys/contrib/rdma/krping/krping_dev.c
@@ -72,7 +72,6 @@ krping_loader(struct module *m, int what, void *arg)
switch (what) {
case MOD_LOAD: /* kldload */
- krping_init();
krping_dev = make_dev(&krping_cdevsw, 0, UID_ROOT, GID_WHEEL,
0600, "krping");
printf("Krping device loaded.\n");
@@ -204,7 +203,7 @@ krping_write(struct cdev *dev, struct uio *uio, int ioflag)
*cp = 0;
krpingmsg->len = (unsigned long)(cp - krpingmsg->msg);
uprintf("krping: write string = |%s|\n", krpingmsg->msg);
- err = krping_doit(krpingmsg->msg, curproc);
+ err = krping_doit(krpingmsg->msg);
free(krpingmsg, M_DEVBUF);
return(err);
}
diff --git a/sys/modules/ibcore/Makefile b/sys/modules/ibcore/Makefile
index 1845378d04afd..77a5f24ff549c 100644
--- a/sys/modules/ibcore/Makefile
+++ b/sys/modules/ibcore/Makefile
@@ -2,21 +2,40 @@
.PATH: ${SRCTOP}/sys/ofed/drivers/infiniband/core
KMOD= ibcore
-SRCS= addr.c iwcm.c sa_query.c ucma.c uverbs_cmd.c \
- agent.c multicast.c smi.c ud_header.c uverbs_main.c \
- mad.c peer_mem.c umem.c uverbs_marshall.c \
- cache.c device.c packer.c sysfs.c user_mad.c verbs.c \
- cm.c fmr_pool.c mad_rmpp.c ucm.c cma.c \
- vnode_if.h device_if.h bus_if.h pci_if.h \
- opt_inet.h opt_inet6.h
+SRCS= vnode_if.h device_if.h bus_if.h pci_if.h \
+ opt_inet.h opt_inet6.h \
+ ib_addr.c \
+ ib_agent.c \
+ ib_cache.c \
+ ib_cm.c \
+ ib_cma.c \
+ ib_cq.c \
+ ib_device.c \
+ ib_fmr_pool.c \
+ ib_iwcm.c \
+ ib_iwpm_msg.c \
+ ib_iwpm_util.c \
+ ib_mad.c \
+ ib_mad_rmpp.c \
+ ib_multicast.c \
+ ib_packer.c \
+ ib_roce_gid_mgmt.c \
+ ib_sa_query.c \
+ ib_smi.c \
+ ib_sysfs.c \
+ ib_ucm.c \
+ ib_ucma.c \
+ ib_ud_header.c \
+ ib_umem.c \
+ ib_user_mad.c \
+ ib_uverbs_cmd.c \
+ ib_uverbs_main.c \
+ ib_uverbs_marshall.c \
+ ib_verbs.c
-CFLAGS+= -I${SRCTOP}/sys/ofed/drivers/infiniband/core
CFLAGS+= -I${SRCTOP}/sys/ofed/include
+CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi
CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
-CFLAGS+= -DINET6 -DINET
+CFLAGS+= -DINET6 -DINET -DCONFIG_INFINIBAND_USER_MEM
.include <bsd.kmod.mk>
-
-CFLAGS+= -Wno-cast-qual -Wno-pointer-arith
-
-CWARNFLAGS.cm.c= -Wno-unused-function
diff --git a/sys/modules/ipoib/Makefile b/sys/modules/ipoib/Makefile
index 11751ec70faf8..5b1a771442abd 100644
--- a/sys/modules/ipoib/Makefile
+++ b/sys/modules/ipoib/Makefile
@@ -9,6 +9,7 @@ SRCS= device_if.h bus_if.h vnode_if.h pci_if.h \
CFLAGS+= -I${SRCTOP}/sys/ofed/drivers/infiniband/ulp/ipoib
CFLAGS+= -I${SRCTOP}/sys/ofed/include
+CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi
CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
.include <bsd.kmod.mk>
diff --git a/sys/modules/rdma/krping/Makefile b/sys/modules/rdma/krping/Makefile
index 9f530e0f818f5..2adff09689d1c 100644
--- a/sys/modules/rdma/krping/Makefile
+++ b/sys/modules/rdma/krping/Makefile
@@ -6,6 +6,7 @@ SRCS= krping.c krping_dev.c getopt.c
SRCS+= bus_if.h device_if.h pci_if.h pcib_if.h vnode_if.h
SRCS+= opt_sched.h opt_inet.h opt_inet6.h
CFLAGS+= -I${SRCTOP}/sys/ofed/include
+CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi
CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
.include <bsd.kmod.mk>
diff --git a/sys/ofed/drivers/infiniband/core/addr.c b/sys/ofed/drivers/infiniband/core/addr.c
deleted file mode 100644
index fc41bd863df94..0000000000000
--- a/sys/ofed/drivers/infiniband/core/addr.c
+++ /dev/null
@@ -1,686 +0,0 @@
-/*
- * Copyright (c) 2005 Voltaire Inc. All rights reserved.
- * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
- * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
- * Copyright (c) 2005 Intel Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/mutex.h>
-#include <linux/inetdevice.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include <linux/module.h>
-#include <linux/notifier.h>
-#include <net/route.h>
-#include <net/netevent.h>
-#include <rdma/ib_addr.h>
-#include <netinet/if_ether.h>
-#include <netinet6/scope6_var.h>
-
-
-MODULE_AUTHOR("Sean Hefty");
-MODULE_DESCRIPTION("IB Address Translation");
-MODULE_LICENSE("Dual BSD/GPL");
-
-struct addr_req {
- struct list_head list;
- struct sockaddr_storage src_addr;
- struct sockaddr_storage dst_addr;
- struct rdma_dev_addr *addr;
- struct rdma_addr_client *client;
- void *context;
- void (*callback)(int status, struct sockaddr *src_addr,
- struct rdma_dev_addr *addr, void *context);
- unsigned long timeout;
- int status;
-};
-
-static void process_req(struct work_struct *work);
-
-static DEFINE_MUTEX(lock);
-static LIST_HEAD(req_list);
-static struct delayed_work work;
-static struct workqueue_struct *addr_wq;
-
-static struct rdma_addr_client self;
-void rdma_addr_register_client(struct rdma_addr_client *client)
-{
- atomic_set(&client->refcount, 1);
- init_completion(&client->comp);
-}
-EXPORT_SYMBOL(rdma_addr_register_client);
-
-static inline void put_client(struct rdma_addr_client *client)
-{
- if (atomic_dec_and_test(&client->refcount))
- complete(&client->comp);
-}
-
-void rdma_addr_unregister_client(struct rdma_addr_client *client)
-{
- put_client(client);
- wait_for_completion(&client->comp);
-}
-EXPORT_SYMBOL(rdma_addr_unregister_client);
-
-int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
- const unsigned char *dst_dev_addr)
-{
- if (dev->if_type == IFT_INFINIBAND)
- dev_addr->dev_type = ARPHRD_INFINIBAND;
- else if (dev->if_type == IFT_ETHER)
- dev_addr->dev_type = ARPHRD_ETHER;
- else
- dev_addr->dev_type = 0;
- memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen);
- memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr),
- dev->if_addrlen);
- if (dst_dev_addr)
- memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen);
- dev_addr->bound_dev_if = dev->if_index;
- return 0;
-}
-EXPORT_SYMBOL(rdma_copy_addr);
-
-#define SCOPE_ID_CACHE(_scope_id, _addr6) do { \
- (_addr6)->sin6_addr.s6_addr[3] = (_scope_id); \
- (_addr6)->sin6_scope_id = 0; } while (0)
-
-#define SCOPE_ID_RESTORE(_scope_id, _addr6) do { \
- (_addr6)->sin6_scope_id = (_scope_id); \
- (_addr6)->sin6_addr.s6_addr[3] = 0; } while (0)
-
-int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
- u16 *vlan_id)
-{
- struct net_device *dev;
- int ret = -EADDRNOTAVAIL;
-
- if (dev_addr->bound_dev_if) {
- dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
- if (!dev)
- return -ENODEV;
- ret = rdma_copy_addr(dev_addr, dev, NULL);
- dev_put(dev);
- return ret;
- }
-
- switch (addr->sa_family) {
- case AF_INET:
- dev = ip_dev_find(&init_net,
- ((struct sockaddr_in *) addr)->sin_addr.s_addr);
-
- if (!dev)
- return ret;
-
- ret = rdma_copy_addr(dev_addr, dev, NULL);
- if (vlan_id)
- *vlan_id = rdma_vlan_dev_vlan_id(dev);
- dev_put(dev);
- break;
-
-#if defined(INET6)
- case AF_INET6:
- {
- struct sockaddr_in6 *sin6;
- struct ifaddr *ifa;
- in_port_t port;
- uint32_t scope_id;
-
- sin6 = (struct sockaddr_in6 *)addr;
- port = sin6->sin6_port;
- sin6->sin6_port = 0;
- scope_id = sin6->sin6_scope_id;
- if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr))
- SCOPE_ID_CACHE(scope_id, sin6);
- CURVNET_SET_QUIET(&init_net);
- ifa = ifa_ifwithaddr(addr);
- CURVNET_RESTORE();
- sin6->sin6_port = port;
- if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr))
- SCOPE_ID_RESTORE(scope_id, sin6);
- if (ifa == NULL) {
- ret = -ENODEV;
- break;
- }
- ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL);
- if (vlan_id)
- *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp);
- ifa_free(ifa);
- break;
- }
-#endif
- default:
- break;
- }
- return ret;
-}
-EXPORT_SYMBOL(rdma_translate_ip);
-
-static void set_timeout(unsigned long time)
-{
- unsigned long delay;
-
- delay = time - jiffies;
- if ((long)delay <= 0)
- delay = 1;
-
- mod_delayed_work(addr_wq, &work, delay);
-}
-
-static void queue_req(struct addr_req *req)
-{
- struct addr_req *temp_req;
-
- mutex_lock(&lock);
- list_for_each_entry_reverse(temp_req, &req_list, list) {
- if (time_after_eq(req->timeout, temp_req->timeout))
- break;
- }
-
- list_add(&req->list, &temp_req->list);
-
- if (req_list.next == &req->list)
- set_timeout(req->timeout);
- mutex_unlock(&lock);
-}
-
-static int addr_resolve(struct sockaddr *src_in,
- struct sockaddr *dst_in,
- struct rdma_dev_addr *addr)
-{
- struct sockaddr_in *sin;
- struct sockaddr_in6 *sin6;
- struct ifaddr *ifa;
- struct ifnet *ifp;
- struct rtentry *rte;
-#if defined(INET) || defined(INET6)
- in_port_t port;
-#endif
-#ifdef INET6
- uint32_t scope_id;
-#endif
- u_char edst[MAX_ADDR_LEN];
- int multi;
- int bcast;
- int is_gw = 0;
- int error = 0;
-
- CURVNET_SET_QUIET(&init_net);
-
- /*
- * Determine whether the address is unicast, multicast, or broadcast
- * and whether the source interface is valid.
- */
- multi = 0;
- bcast = 0;
- sin = NULL;
- sin6 = NULL;
- ifp = NULL;
- rte = NULL;
- ifa = NULL;
- ifp = NULL;
- memset(edst, 0, sizeof(edst));
-#ifdef INET6
- scope_id = -1U;
-#endif
-
- switch (dst_in->sa_family) {
-#ifdef INET
- case AF_INET:
- sin = (struct sockaddr_in *)dst_in;
- if (sin->sin_addr.s_addr == INADDR_BROADCAST)
- bcast = 1;
- if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
- multi = 1;
- sin = (struct sockaddr_in *)src_in;
- if (sin->sin_addr.s_addr != INADDR_ANY) {
- /*
- * Address comparison fails if the port is set
- * cache it here to be restored later.
- */
- port = sin->sin_port;
- sin->sin_port = 0;
- memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
-
- /*
- * If we have a source address to use look it
- * up first and verify that it is a local
- * interface:
- */
- CURVNET_SET_QUIET(&init_net);
- ifa = ifa_ifwithaddr(src_in);
- CURVNET_RESTORE();
- sin->sin_port = port;
- if (ifa == NULL) {
- error = ENETUNREACH;
- goto done;
- }
- ifp = ifa->ifa_ifp;
- ifa_free(ifa);
- if (bcast || multi)
- goto mcast;
- }
- break;
-#endif
-#ifdef INET6
- case AF_INET6:
- sin6 = (struct sockaddr_in6 *)dst_in;
- if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
- multi = 1;
- if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) {
- /*
- * The IB address comparison fails if the
- * scope ID is set and not part of the addr:
- */
- scope_id = sin6->sin6_scope_id;
- if (scope_id < 256)
- SCOPE_ID_CACHE(scope_id, sin6);
- }
- sin6 = (struct sockaddr_in6 *)src_in;
- if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
- port = sin6->sin6_port;
- sin6->sin6_port = 0;
- if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) {
- if (scope_id < 256)
- SCOPE_ID_CACHE(scope_id, sin6);
- }
-
- /*
- * If we have a source address to use look it
- * up first and verify that it is a local
- * interface:
- */
- CURVNET_SET_QUIET(&init_net);
- ifa = ifa_ifwithaddr(src_in);
- CURVNET_RESTORE();
- sin6->sin6_port = port;
- if (ifa == NULL) {
- error = ENETUNREACH;
- goto done;
- }
- ifp = ifa->ifa_ifp;
- ifa_free(ifa);
- if (bcast || multi)
- goto mcast;
- }
- break;
-#endif
- default:
- error = EINVAL;
- goto done;
- }
- /*
- * Make sure the route exists and has a valid link.
- */
- rte = rtalloc1(dst_in, 1, 0);
- if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) {
- if (rte)
- RTFREE_LOCKED(rte);
- error = EHOSTUNREACH;
- goto done;
- }
- if (rte->rt_flags & RTF_GATEWAY)
- is_gw = 1;
- /*
- * If it's not multicast or broadcast and the route doesn't match the
- * requested interface return unreachable. Otherwise fetch the
- * correct interface pointer and unlock the route.
- */
- if (multi || bcast) {
- if (ifp == NULL) {
- ifp = rte->rt_ifp;
- /* rt_ifa holds the route answer source address */
- ifa = rte->rt_ifa;
- }
- RTFREE_LOCKED(rte);
- } else if (ifp && ifp != rte->rt_ifp) {
- RTFREE_LOCKED(rte);
- error = ENETUNREACH;
- goto done;
- } else {
- if (ifp == NULL) {
- ifp = rte->rt_ifp;
- ifa = rte->rt_ifa;
- }
- RT_UNLOCK(rte);
- }
-#if defined(INET) || defined(INET6)
-mcast:
-#endif
- if (bcast) {
- memcpy(edst, ifp->if_broadcastaddr, ifp->if_addrlen);
- goto done;
- } else if (multi) {
- struct sockaddr *llsa;
- struct sockaddr_dl sdl;
-
- sdl.sdl_len = sizeof(sdl);
- llsa = (struct sockaddr *)&sdl;
-
- if (ifp->if_resolvemulti == NULL) {
- error = EOPNOTSUPP;
- goto done;
- }
- error = ifp->if_resolvemulti(ifp, &llsa, dst_in);
- if (error == 0) {
- memcpy(edst, LLADDR((struct sockaddr_dl *)llsa),
- ifp->if_addrlen);
- }
- goto done;
- }
- /*
- * Resolve the link local address.
- */
- switch (dst_in->sa_family) {
-#ifdef INET
- case AF_INET:
- error = arpresolve(ifp, is_gw, NULL,
- is_gw ? rte->rt_gateway : dst_in, edst, NULL, NULL);
- break;
-#endif
-#ifdef INET6
- case AF_INET6:
- error = nd6_resolve(ifp, is_gw, NULL,
- is_gw ? rte->rt_gateway : dst_in, edst, NULL, NULL);
- break;
-#endif
- default:
- KASSERT(0, ("rdma_addr_resolve: Unreachable"));
- error = EINVAL;
- break;
- }
- RTFREE(rte);
-done:
- if (error == 0)
- error = -rdma_copy_addr(addr, ifp, edst);
- if (error == 0)
- memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr));
-#ifdef INET6
- if (scope_id < 256) {
- sin6 = (struct sockaddr_in6 *)src_in;
- if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr))
- SCOPE_ID_RESTORE(scope_id, sin6);
- sin6 = (struct sockaddr_in6 *)dst_in;
- SCOPE_ID_RESTORE(scope_id, sin6);
- }
-#endif
- if (error == EWOULDBLOCK)
- error = ENODATA;
-
- CURVNET_RESTORE();
- return -error;
-}
-
-static void process_req(struct work_struct *work)
-{
- struct addr_req *req, *temp_req;
- struct sockaddr *src_in, *dst_in;
- struct list_head done_list;
-
- INIT_LIST_HEAD(&done_list);
-
- mutex_lock(&lock);
- list_for_each_entry_safe(req, temp_req, &req_list, list) {
- if (req->status == -ENODATA) {
- src_in = (struct sockaddr *) &req->src_addr;
- dst_in = (struct sockaddr *) &req->dst_addr;
- req->status = addr_resolve(src_in, dst_in, req->addr);
- if (req->status && time_after_eq(jiffies, req->timeout))
- req->status = -ETIMEDOUT;
- else if (req->status == -ENODATA)
- continue;
- }
- list_move_tail(&req->list, &done_list);
- }
-
- if (!list_empty(&req_list)) {
- req = list_entry(req_list.next, struct addr_req, list);
- set_timeout(req->timeout);
- }
- mutex_unlock(&lock);
-
- list_for_each_entry_safe(req, temp_req, &done_list, list) {
- list_del(&req->list);
- req->callback(req->status, (struct sockaddr *) &req->src_addr,
- req->addr, req->context);
- put_client(req->client);
- kfree(req);
- }
-}
-
-int rdma_resolve_ip(struct rdma_addr_client *client,
- struct sockaddr *src_addr, struct sockaddr *dst_addr,
- struct rdma_dev_addr *addr, int timeout_ms,
- void (*callback)(int status, struct sockaddr *src_addr,
- struct rdma_dev_addr *addr, void *context),
- void *context)
-{
- struct sockaddr *src_in, *dst_in;
- struct addr_req *req;
- int ret = 0;
-
- req = kzalloc(sizeof *req, GFP_KERNEL);
- if (!req)
- return -ENOMEM;
-
- src_in = (struct sockaddr *) &req->src_addr;
- dst_in = (struct sockaddr *) &req->dst_addr;
-
- if (src_addr) {
- if (src_addr->sa_family != dst_addr->sa_family) {
- ret = -EINVAL;
- goto err;
- }
-
- memcpy(src_in, src_addr, ip_addr_size(src_addr));
- } else {
- src_in->sa_family = dst_addr->sa_family;
- }
-
- memcpy(dst_in, dst_addr, ip_addr_size(dst_addr));
- req->addr = addr;
- req->callback = callback;
- req->context = context;
- req->client = client;
- atomic_inc(&client->refcount);
-
- req->status = addr_resolve(src_in, dst_in, addr);
- switch (req->status) {
- case 0:
- req->timeout = jiffies;
- queue_req(req);
- break;
- case -ENODATA:
- req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
- queue_req(req);
- break;
- default:
- ret = req->status;
- atomic_dec(&client->refcount);
- goto err;
- }
- return ret;
-err:
- kfree(req);
- return ret;
-}
-EXPORT_SYMBOL(rdma_resolve_ip);
-
-void rdma_addr_cancel(struct rdma_dev_addr *addr)
-{
- struct addr_req *req, *temp_req;
-
- mutex_lock(&lock);
- list_for_each_entry_safe(req, temp_req, &req_list, list) {
- if (req->addr == addr) {
- req->status = -ECANCELED;
- req->timeout = jiffies;
- list_move(&req->list, &req_list);
- set_timeout(req->timeout);
- break;
- }
- }
- mutex_unlock(&lock);
-}
-EXPORT_SYMBOL(rdma_addr_cancel);
-
-struct resolve_cb_context {
- struct rdma_dev_addr *addr;
- struct completion comp;
-};
-
-static void resolve_cb(int status, struct sockaddr *src_addr,
- struct rdma_dev_addr *addr, void *context)
-{
- memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct
- rdma_dev_addr));
- complete(&((struct resolve_cb_context *)context)->comp);
-}
-
-int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
- u16 *vlan_id, u32 scope_id)
-{
- int ret = 0;
- struct rdma_dev_addr dev_addr;
- struct resolve_cb_context ctx;
- struct net_device *dev;
-
- union {
- struct sockaddr _sockaddr;
- struct sockaddr_in _sockaddr_in;
- struct sockaddr_in6 _sockaddr_in6;
- } sgid_addr, dgid_addr;
-
-
- ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid, scope_id);
- if (ret)
- return ret;
-
- ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid, scope_id);
- if (ret)
- return ret;
-
- memset(&dev_addr, 0, sizeof(dev_addr));
-
- ctx.addr = &dev_addr;
- init_completion(&ctx.comp);
- ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
- &dev_addr, 1000, resolve_cb, &ctx);
- if (ret)
- return ret;
-
- wait_for_completion(&ctx.comp);
-
- memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
- dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
- if (!dev)
- return -ENODEV;
- if (vlan_id)
- *vlan_id = rdma_vlan_dev_vlan_id(dev);
- dev_put(dev);
- return ret;
-}
-EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh);
-
-u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num)
-{
-#ifdef INET6
- struct ifnet *ifp;
- if (ib->get_netdev == NULL)
- return (-1U);
- ifp = ib->get_netdev(ib, port_num);
- if (ifp == NULL)
- return (-1U);
- return (in6_getscopezone(ifp, IPV6_ADDR_SCOPE_LINKLOCAL));
-#else
- return (-1U);
-#endif
-}
-
-int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id,
- u32 scope_id)
-{
- int ret = 0;
- struct rdma_dev_addr dev_addr;
- union {
- struct sockaddr _sockaddr;
- struct sockaddr_in _sockaddr_in;
- struct sockaddr_in6 _sockaddr_in6;
- } gid_addr;
-
- ret = rdma_gid2ip(&gid_addr._sockaddr, sgid, scope_id);
- if (ret)
- return ret;
- memset(&dev_addr, 0, sizeof(dev_addr));
- ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
- if (ret)
- return ret;
-
- memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
- return ret;
-}
-EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid);
-
-static int netevent_callback(struct notifier_block *self, unsigned long event,
- void *ctx)
-{
- if (event == NETEVENT_NEIGH_UPDATE) {
- set_timeout(jiffies);
- }
- return 0;
-}
-
-static struct notifier_block nb = {
- .notifier_call = netevent_callback
-};
-
-static int __init addr_init(void)
-{
- INIT_DELAYED_WORK(&work, process_req);
- addr_wq = create_singlethread_workqueue("ib_addr");
- if (!addr_wq)
- return -ENOMEM;
-
- register_netevent_notifier(&nb);
- rdma_addr_register_client(&self);
- return 0;
-}
-
-static void __exit addr_cleanup(void)
-{
- rdma_addr_unregister_client(&self);
- unregister_netevent_notifier(&nb);
- destroy_workqueue(addr_wq);
-}
-
-module_init(addr_init);
-module_exit(addr_cleanup);
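For reference, a consumer of the asynchronous rdma_resolve_ip() interface above typically supplies a callback and waits on a completion, exactly as resolve_cb()/rdma_addr_find_dmac_by_grh() do. The sketch below shows only that pattern; the function and variable names are illustrative and not part of this commit.

/*
 * Minimal sketch (assumptions: a registered rdma_addr_client and
 * already-populated source/destination sockaddrs of the same family).
 */
struct example_resolve_ctx {
	struct completion comp;
	int status;
};

static void example_resolve_done(int status, struct sockaddr *src_addr,
    struct rdma_dev_addr *addr, void *context)
{
	struct example_resolve_ctx *ctx = context;

	ctx->status = status;	/* 0, -ETIMEDOUT or -ECANCELED */
	complete(&ctx->comp);
}

static int example_resolve(struct rdma_addr_client *client,
    struct sockaddr *src, struct sockaddr *dst, struct rdma_dev_addr *dev_addr)
{
	struct example_resolve_ctx ctx = { .status = 0 };
	int ret;

	init_completion(&ctx.comp);
	/* 1000 ms timeout, matching rdma_addr_find_dmac_by_grh() above */
	ret = rdma_resolve_ip(client, src, dst, dev_addr, 1000,
	    example_resolve_done, &ctx);
	if (ret)
		return ret;
	wait_for_completion(&ctx.comp);
	return ctx.status;
}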
diff --git a/sys/ofed/drivers/infiniband/core/agent.c b/sys/ofed/drivers/infiniband/core/agent.c
index 2bc7f5af64f42..4fa524dfb6cf2 100644
--- a/sys/ofed/drivers/infiniband/core/agent.c
+++ b/sys/ofed/drivers/infiniband/core/agent.c
@@ -54,7 +54,7 @@ static DEFINE_SPINLOCK(ib_agent_port_list_lock);
static LIST_HEAD(ib_agent_port_list);
static struct ib_agent_port_private *
-__ib_get_agent_port(struct ib_device *device, int port_num)
+__ib_get_agent_port(const struct ib_device *device, int port_num)
{
struct ib_agent_port_private *entry;
@@ -67,7 +67,7 @@ __ib_get_agent_port(struct ib_device *device, int port_num)
}
static struct ib_agent_port_private *
-ib_get_agent_port(struct ib_device *device, int port_num)
+ib_get_agent_port(const struct ib_device *device, int port_num)
{
struct ib_agent_port_private *entry;
unsigned long flags;
@@ -78,9 +78,9 @@ ib_get_agent_port(struct ib_device *device, int port_num)
return entry;
}
-void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
- struct ib_wc *wc, struct ib_device *device,
- int port_num, int qpn)
+void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa)
{
struct ib_agent_port_private *port_priv;
struct ib_mad_agent *agent;
@@ -88,44 +88,49 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
struct ib_ah *ah;
struct ib_mad_send_wr_private *mad_send_wr;
- if (device->node_type == RDMA_NODE_IB_SWITCH)
+ if (rdma_cap_ib_switch(device))
port_priv = ib_get_agent_port(device, 0);
else
port_priv = ib_get_agent_port(device, port_num);
if (!port_priv) {
- printk(KERN_ERR SPFX "Unable to find port agent\n");
+ dev_err(&device->dev, "Unable to find port agent\n");
return;
}
agent = port_priv->agent[qpn];
ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
if (IS_ERR(ah)) {
- printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n",
+ dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
PTR_ERR(ah));
return;
}
+ if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION)
+ resp_mad_len = IB_MGMT_MAD_SIZE;
+
send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
- IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_KERNEL);
+ IB_MGMT_MAD_HDR,
+ resp_mad_len - IB_MGMT_MAD_HDR,
+ GFP_KERNEL,
+ mad_hdr->base_version);
if (IS_ERR(send_buf)) {
- printk(KERN_ERR SPFX "ib_create_send_mad error\n");
+ dev_err(&device->dev, "ib_create_send_mad error\n");
goto err1;
}
- memcpy(send_buf->mad, mad, sizeof *mad);
+ memcpy(send_buf->mad, mad_hdr, resp_mad_len);
send_buf->ah = ah;
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ if (rdma_cap_ib_switch(device)) {
mad_send_wr = container_of(send_buf,
struct ib_mad_send_wr_private,
send_buf);
- mad_send_wr->send_wr.wr.ud.port_num = port_num;
+ mad_send_wr->send_wr.port_num = port_num;
}
if (ib_post_send_mad(send_buf, NULL)) {
- printk(KERN_ERR SPFX "ib_post_send_mad error\n");
+ dev_err(&device->dev, "ib_post_send_mad error\n");
goto err2;
}
return;
@@ -151,17 +156,17 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
/* Create new device info */
port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
if (!port_priv) {
- printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n");
+ dev_err(&device->dev, "No memory for ib_agent_port_private\n");
ret = -ENOMEM;
goto error1;
}
- if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+ if (rdma_cap_ib_smi(device, port_num)) {
/* Obtain send only MAD agent for SMI QP */
port_priv->agent[0] = ib_register_mad_agent(device, port_num,
IB_QPT_SMI, NULL, 0,
&agent_send_handler,
- NULL, NULL);
+ NULL, NULL, 0);
if (IS_ERR(port_priv->agent[0])) {
ret = PTR_ERR(port_priv->agent[0]);
goto error2;
@@ -172,7 +177,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num)
port_priv->agent[1] = ib_register_mad_agent(device, port_num,
IB_QPT_GSI, NULL, 0,
&agent_send_handler,
- NULL, NULL);
+ NULL, NULL, 0);
if (IS_ERR(port_priv->agent[1])) {
ret = PTR_ERR(port_priv->agent[1]);
goto error3;
@@ -202,7 +207,7 @@ int ib_agent_port_close(struct ib_device *device, int port_num)
port_priv = __ib_get_agent_port(device, port_num);
if (port_priv == NULL) {
spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
- printk(KERN_ERR SPFX "Port %d not found\n", port_num);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
return -ENODEV;
}
list_del(&port_priv->port_list);
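A caller adapting to the new agent_send_response() prototype now hands over the MAD header, the full response length and an OPA capability flag instead of a fixed-size struct ib_mad. The helper below is only a hypothetical sketch of such a call site; in the tree the real callers live in the MAD layer.

/*
 * Hypothetical call-site sketch: 'response' is assumed to point at a
 * fully built response MAD of 'mad_size' bytes belonging to the request
 * described by 'recv_wc'.
 */
static void example_reply(struct ib_mad_recv_wc *recv_wc,
    const struct ib_device *device, int port_num, int qpn,
    const struct ib_mad_hdr *response, size_t mad_size, bool opa)
{
	agent_send_response(response,
	    recv_wc->recv_buf.grh,	/* GRH of the incoming request */
	    recv_wc->wc,		/* work completion of the request */
	    device, port_num, qpn,
	    mad_size,			/* full response length, header included */
	    opa);			/* true when the port uses OPA MADs */
}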
diff --git a/sys/ofed/drivers/infiniband/core/agent.h b/sys/ofed/drivers/infiniband/core/agent.h
index 6669287009c2f..65f92bedae448 100644
--- a/sys/ofed/drivers/infiniband/core/agent.h
+++ b/sys/ofed/drivers/infiniband/core/agent.h
@@ -44,8 +44,8 @@ extern int ib_agent_port_open(struct ib_device *device, int port_num);
extern int ib_agent_port_close(struct ib_device *device, int port_num);
-extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
- struct ib_wc *wc, struct ib_device *device,
- int port_num, int qpn);
+extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa);
#endif /* __AGENT_H_ */
diff --git a/sys/ofed/drivers/infiniband/core/cache.c b/sys/ofed/drivers/infiniband/core/cache.c
deleted file mode 100644
index d11e7c2a88f7f..0000000000000
--- a/sys/ofed/drivers/infiniband/core/cache.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications. All rights reserved.
- * Copyright (c) 2005 Intel Corporation. All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-
-#include <rdma/ib_cache.h>
-
-#include "core_priv.h"
-
-struct ib_pkey_cache {
- int table_len;
- u16 table[0];
-};
-
-struct ib_gid_cache {
- int table_len;
- union ib_gid table[0];
-};
-
-struct ib_update_work {
- struct work_struct work;
- struct ib_device *device;
- u8 port_num;
-};
-
-static inline int start_port(struct ib_device *device)
-{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
-}
-
-static inline int end_port(struct ib_device *device)
-{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ?
- 0 : device->phys_port_cnt;
-}
-
-int ib_get_cached_gid(struct ib_device *device,
- u8 port_num,
- int index,
- union ib_gid *gid)
-{
- struct ib_gid_cache *cache;
- unsigned long flags;
- int ret = -EINVAL;
-
- if (port_num < start_port(device) || port_num > end_port(device))
- return -EINVAL;
-
- read_lock_irqsave(&device->cache.lock, flags);
-
- if (device->cache.gid_cache) {
- cache = device->cache.gid_cache[port_num - start_port(device)];
-
- if (cache && index >= 0 && index < cache->table_len) {
- *gid = cache->table[index];
- ret = 0;
- }
- }
-
- read_unlock_irqrestore(&device->cache.lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_get_cached_gid);
-
-int ib_find_cached_gid(struct ib_device *device,
- union ib_gid *gid,
- u8 *port_num,
- u16 *index)
-{
- struct ib_gid_cache *cache;
- unsigned long flags;
- int p, i;
- int ret = -ENOENT;
-
- *port_num = -1;
- if (index)
- *index = -1;
-
- read_lock_irqsave(&device->cache.lock, flags);
- if (!device->cache.gid_cache)
- goto out;
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- cache = device->cache.gid_cache[p];
- if (!cache)
- continue;
- for (i = 0; i < cache->table_len; ++i) {
- if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
- *port_num = p + start_port(device);
- if (index)
- *index = i;
- ret = 0;
- goto out;
- }
- }
- }
-out:
- read_unlock_irqrestore(&device->cache.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(ib_find_cached_gid);
-
-int ib_get_cached_pkey(struct ib_device *device,
- u8 port_num,
- int index,
- u16 *pkey)
-{
- struct ib_pkey_cache *cache;
- unsigned long flags;
- int ret = -EINVAL;
-
- if (port_num < start_port(device) || port_num > end_port(device))
- return -EINVAL;
-
- read_lock_irqsave(&device->cache.lock, flags);
-
- if (device->cache.pkey_cache) {
- cache = device->cache.pkey_cache[port_num - start_port(device)];
-
- if (cache && index >= 0 && index < cache->table_len) {
- *pkey = cache->table[index];
- ret = 0;
- }
- }
-
- read_unlock_irqrestore(&device->cache.lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_get_cached_pkey);
-
-int ib_find_cached_pkey(struct ib_device *device,
- u8 port_num,
- u16 pkey,
- u16 *index)
-{
- struct ib_pkey_cache *cache;
- unsigned long flags;
- int i;
- int ret = -ENOENT;
- int partial_ix = -1;
-
- if (port_num < start_port(device) || port_num > end_port(device))
- return -EINVAL;
-
- *index = -1;
-
- read_lock_irqsave(&device->cache.lock, flags);
-
- if (!device->cache.pkey_cache)
- goto out;
-
- cache = device->cache.pkey_cache[port_num - start_port(device)];
- if (!cache)
- goto out;
-
- for (i = 0; i < cache->table_len; ++i)
- if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
- if (cache->table[i] & 0x8000) {
- *index = i;
- ret = 0;
- break;
- } else
- partial_ix = i;
- }
-
- if (ret && partial_ix >= 0) {
- *index = partial_ix;
- ret = 0;
- }
-out:
- read_unlock_irqrestore(&device->cache.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(ib_find_cached_pkey);
-
-int ib_find_exact_cached_pkey(struct ib_device *device,
- u8 port_num,
- u16 pkey,
- u16 *index)
-{
- struct ib_pkey_cache *cache;
- unsigned long flags;
- int i;
- int ret = -ENOENT;
-
- if (port_num < start_port(device) || port_num > end_port(device))
- return -EINVAL;
-
- *index = -1;
-
- read_lock_irqsave(&device->cache.lock, flags);
-
- if (!device->cache.pkey_cache)
- goto out;
-
- cache = device->cache.pkey_cache[port_num - start_port(device)];
- if (!cache)
- goto out;
-
- for (i = 0; i < cache->table_len; ++i)
- if (cache->table[i] == pkey) {
- *index = i;
- ret = 0;
- break;
- }
-out:
- read_unlock_irqrestore(&device->cache.lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(ib_find_exact_cached_pkey);
-
-int ib_get_cached_lmc(struct ib_device *device,
- u8 port_num,
- u8 *lmc)
-{
- unsigned long flags;
- int ret = -EINVAL;
-
- if (port_num < start_port(device) || port_num > end_port(device))
- return -EINVAL;
-
- read_lock_irqsave(&device->cache.lock, flags);
- if (device->cache.lmc_cache) {
- *lmc = device->cache.lmc_cache[port_num - start_port(device)];
- ret = 0;
- }
- read_unlock_irqrestore(&device->cache.lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_get_cached_lmc);
-
-static void ib_cache_update(struct ib_device *device,
- u8 port)
-{
- struct ib_port_attr *tprops = NULL;
- struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
- struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
- int i;
- int ret;
-
- if (!(device->cache.pkey_cache && device->cache.gid_cache &&
- device->cache.lmc_cache))
- return;
-
- tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
- if (!tprops)
- return;
-
- ret = ib_query_port(device, port, tprops);
- if (ret) {
- printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
- ret, device->name);
- goto err;
- }
-
- pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
- sizeof *pkey_cache->table, GFP_KERNEL);
- if (!pkey_cache)
- goto err;
-
- pkey_cache->table_len = tprops->pkey_tbl_len;
-
- gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
- sizeof *gid_cache->table, GFP_KERNEL);
- if (!gid_cache)
- goto err;
-
- gid_cache->table_len = tprops->gid_tbl_len;
-
- for (i = 0; i < pkey_cache->table_len; ++i) {
- ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
- if (ret) {
- printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
- ret, device->name, i);
- goto err;
- }
- }
-
- for (i = 0; i < gid_cache->table_len; ++i) {
- ret = ib_query_gid(device, port, i, gid_cache->table + i);
- if (ret) {
- printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
- ret, device->name, i);
- goto err;
- }
- }
-
- write_lock_irq(&device->cache.lock);
-
- old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
- old_gid_cache = device->cache.gid_cache [port - start_port(device)];
-
- device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
- device->cache.gid_cache [port - start_port(device)] = gid_cache;
-
- device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
-
- write_unlock_irq(&device->cache.lock);
-
- kfree(old_pkey_cache);
- kfree(old_gid_cache);
- kfree(tprops);
- return;
-
-err:
- kfree(pkey_cache);
- kfree(gid_cache);
- kfree(tprops);
-}
-
-static void ib_cache_task(struct work_struct *_work)
-{
- struct ib_update_work *work =
- container_of(_work, struct ib_update_work, work);
-
- ib_cache_update(work->device, work->port_num);
- kfree(work);
-}
-
-static void ib_cache_event(struct ib_event_handler *handler,
- struct ib_event *event)
-{
- struct ib_update_work *work;
-
- if (event->event == IB_EVENT_PORT_ERR ||
- event->event == IB_EVENT_PORT_ACTIVE ||
- event->event == IB_EVENT_LID_CHANGE ||
- event->event == IB_EVENT_PKEY_CHANGE ||
- event->event == IB_EVENT_SM_CHANGE ||
- event->event == IB_EVENT_CLIENT_REREGISTER ||
- event->event == IB_EVENT_GID_CHANGE) {
- work = kmalloc(sizeof *work, GFP_ATOMIC);
- if (work) {
- INIT_WORK(&work->work, ib_cache_task);
- work->device = event->device;
- work->port_num = event->element.port_num;
- queue_work(ib_wq, &work->work);
- }
- }
-}
-
-static void ib_cache_setup_one(struct ib_device *device)
-{
- int p;
-
- rwlock_init(&device->cache.lock);
-
- device->cache.pkey_cache =
- kmalloc(sizeof *device->cache.pkey_cache *
- (end_port(device) - start_port(device) + 1), GFP_KERNEL);
- device->cache.gid_cache =
- kmalloc(sizeof *device->cache.gid_cache *
- (end_port(device) - start_port(device) + 1), GFP_KERNEL);
-
- device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
- (end_port(device) -
- start_port(device) + 1),
- GFP_KERNEL);
-
- if (!device->cache.pkey_cache || !device->cache.gid_cache ||
- !device->cache.lmc_cache) {
- printk(KERN_WARNING "Couldn't allocate cache "
- "for %s\n", device->name);
- goto err;
- }
-
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- device->cache.pkey_cache[p] = NULL;
- device->cache.gid_cache [p] = NULL;
- ib_cache_update(device, p + start_port(device));
- }
-
- INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
- device, ib_cache_event);
- if (ib_register_event_handler(&device->cache.event_handler))
- goto err_cache;
-
- return;
-
-err_cache:
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
-
-err:
- kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
- kfree(device->cache.lmc_cache);
- device->cache.pkey_cache = NULL;
- device->cache.gid_cache = NULL;
- device->cache.lmc_cache = NULL;
-}
-
-static void ib_cache_cleanup_one(struct ib_device *device)
-{
- int p;
-
- if (!(device->cache.pkey_cache && device->cache.gid_cache &&
- device->cache.lmc_cache))
- return;
-
- ib_unregister_event_handler(&device->cache.event_handler);
- flush_workqueue(ib_wq);
-
- for (p = 0; p <= end_port(device) - start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
-
- kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
- kfree(device->cache.lmc_cache);
-}
-
-static struct ib_client cache_client = {
- .name = "cache",
- .add = ib_cache_setup_one,
- .remove = ib_cache_cleanup_one
-};
-
-int __init ib_cache_setup(void)
-{
- return ib_register_client(&cache_client);
-}
-
-void __exit ib_cache_cleanup(void)
-{
- ib_unregister_client(&cache_client);
-}
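The P_Key search removed with ib_find_cached_pkey() above encodes a useful rule: bit 15 (0x8000) marks full membership, so candidates are compared on the low 15 bits and a full-membership entry is preferred over a limited-membership one. The standalone illustration below restates that rule; the function and table names are illustrative only.

/*
 * Illustration of the matching rule used by ib_find_cached_pkey():
 * compare the low 15 bits, prefer a full-membership entry (bit 15 set),
 * otherwise fall back to the last limited-membership match.
 */
static int example_find_pkey(const u16 *table, int table_len, u16 pkey)
{
	int i, partial_ix = -1;

	for (i = 0; i < table_len; ++i) {
		if ((table[i] & 0x7fff) != (pkey & 0x7fff))
			continue;
		if (table[i] & 0x8000)
			return i;	/* full-membership match wins */
		partial_ix = i;		/* remember a limited-membership match */
	}
	return partial_ix;		/* -1 when no entry matched */
}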
diff --git a/sys/ofed/drivers/infiniband/core/cm.c b/sys/ofed/drivers/infiniband/core/cm.c
index 07f6e08f29014..253881f89b12e 100644
--- a/sys/ofed/drivers/infiniband/core/cm.c
+++ b/sys/ofed/drivers/infiniband/core/cm.c
@@ -47,7 +47,6 @@
#include <linux/sysfs.h>
#include <linux/workqueue.h>
#include <linux/kdev_t.h>
-#include <linux/string.h>
#include <linux/etherdevice.h>
#include <asm/atomic-long.h>
@@ -60,13 +59,8 @@ MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
-
static void cm_add_one(struct ib_device *device);
-static void cm_remove_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cm_client = {
.name = "cm",
@@ -88,6 +82,8 @@ static struct ib_cm {
__be32 random_id_operand;
struct list_head timewait_list;
struct workqueue_struct *wq;
+ /* Synchronize changes to the CM port state */
+ spinlock_t state_lock;
} cm;
/* Counter indexes ordered by attribute ID */
@@ -169,6 +165,8 @@ struct cm_port {
struct ib_mad_agent *mad_agent;
struct kobject port_obj;
u8 port_num;
+ struct list_head cm_priv_prim_list;
+ struct list_head cm_priv_altr_list;
struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
};
@@ -177,6 +175,7 @@ struct cm_device {
struct ib_device *ib_device;
struct device *device;
u8 ack_delay;
+ int going_down;
struct cm_port *port[0];
};
@@ -186,8 +185,6 @@ struct cm_av {
struct ib_ah_attr ah_attr;
u16 pkey_index;
u8 timeout;
- u8 valid;
- u8 smac[ETH_ALEN];
};
struct cm_work {
@@ -220,13 +217,15 @@ struct cm_id_private {
spinlock_t lock; /* Do not acquire inside cm.lock */
struct completion comp;
atomic_t refcount;
+ /* Number of clients sharing this ib_cm_id. Only valid for listeners.
+ * Protected by the cm.lock spinlock. */
+ int listen_sharecount;
struct ib_mad_send_buf *msg;
struct cm_timewait_info *timewait_info;
/* todo: use alternate port on send failure */
struct cm_av av;
struct cm_av alt_av;
- struct ib_cm_compare_data *compare_data;
void *private_data;
__be64 tid;
@@ -248,6 +247,12 @@ struct cm_id_private {
u8 service_timeout;
u8 target_ack_delay;
+ struct list_head prim_list;
+ struct list_head altr_list;
+ /* Indicates that the send port MAD agent is registered and the av is set */
+ int prim_send_port_not_ready;
+ int altr_send_port_not_ready;
+
struct list_head work_list;
atomic_t work_count;
};
@@ -266,19 +271,47 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
struct ib_mad_agent *mad_agent;
struct ib_mad_send_buf *m;
struct ib_ah *ah;
+ struct cm_av *av;
+ unsigned long flags, flags2;
+ int ret = 0;
+ /* don't let the port be released until the agent is down */
+ spin_lock_irqsave(&cm.state_lock, flags2);
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_id_priv->prim_send_port_not_ready)
+ av = &cm_id_priv->av;
+ else if (!cm_id_priv->altr_send_port_not_ready &&
+ (cm_id_priv->alt_av.port))
+ av = &cm_id_priv->alt_av;
+ else {
+ pr_info("%s: not valid CM id\n", __func__);
+ ret = -ENODEV;
+ spin_unlock_irqrestore(&cm.lock, flags);
+ goto out;
+ }
+ spin_unlock_irqrestore(&cm.lock, flags);
+ /* Make sure the port hasn't released the mad agent yet */
mad_agent = cm_id_priv->av.port->mad_agent;
- ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr);
- if (IS_ERR(ah))
- return PTR_ERR(ah);
+ if (!mad_agent) {
+ pr_info("%s: not a valid MAD agent\n", __func__);
+ ret = -ENODEV;
+ goto out;
+ }
+ ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto out;
+ }
m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
- cm_id_priv->av.pkey_index,
+ av->pkey_index,
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_ATOMIC);
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(m)) {
ib_destroy_ah(ah);
- return PTR_ERR(m);
+ ret = PTR_ERR(m);
+ goto out;
}
/* Timeout set by caller if response is expected. */
@@ -288,7 +321,10 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
atomic_inc(&cm_id_priv->refcount);
m->context[0] = cm_id_priv;
*msg = m;
- return 0;
+
+out:
+ spin_unlock_irqrestore(&cm.state_lock, flags2);
+ return ret;
}
static int cm_alloc_response_msg(struct cm_port *port,
@@ -305,7 +341,8 @@ static int cm_alloc_response_msg(struct cm_port *port,
m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
- GFP_ATOMIC);
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(m)) {
ib_destroy_ah(ah);
return PTR_ERR(m);
@@ -357,41 +394,29 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
grh, &av->ah_attr);
}
-int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac)
-{
- struct cm_id_private *cm_id_priv;
-
- cm_id_priv = container_of(id, struct cm_id_private, id);
-
- if (smac != NULL)
- memcpy(cm_id_priv->av.smac, smac, sizeof(cm_id_priv->av.smac));
-
- if (alt_smac != NULL)
- memcpy(cm_id_priv->alt_av.smac, alt_smac,
- sizeof(cm_id_priv->alt_av.smac));
-
- return 0;
-}
-EXPORT_SYMBOL(ib_update_cm_av);
-
-static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av,
+ struct cm_id_private *cm_id_priv)
{
struct cm_device *cm_dev;
struct cm_port *port = NULL;
unsigned long flags;
int ret;
u8 p;
+ struct net_device *ndev = ib_get_ndev_from_path(path);
read_lock_irqsave(&cm.device_lock, flags);
list_for_each_entry(cm_dev, &cm.device_list, list) {
if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
- &p, NULL)) {
+ path->gid_type, ndev, &p, NULL)) {
port = cm_dev->port[p-1];
break;
}
}
read_unlock_irqrestore(&cm.device_lock, flags);
+ if (ndev)
+ dev_put(ndev);
+
if (!port)
return -EINVAL;
@@ -401,32 +426,41 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
return ret;
av->port = port;
- ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
- &av->ah_attr);
+ ret = ib_init_ah_from_path(cm_dev->ib_device, port->port_num,
+ path, &av->ah_attr);
+ if (ret)
+ return ret;
+
av->timeout = path->packet_life_time + 1;
- memcpy(av->smac, path->smac, sizeof(av->smac));
- av->valid = 1;
- return 0;
+ spin_lock_irqsave(&cm.lock, flags);
+ if (&cm_id_priv->av == av)
+ list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list);
+ else if (&cm_id_priv->alt_av == av)
+ list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list);
+ else
+ ret = -EINVAL;
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ return ret;
}
static int cm_alloc_id(struct cm_id_private *cm_id_priv)
{
unsigned long flags;
- int ret, id;
- static int next_id;
+ int id;
- do {
- spin_lock_irqsave(&cm.lock, flags);
- ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
- next_id, &id);
- if (!ret)
- next_id = ((unsigned) id + 1) & MAX_IDR_MASK;
- spin_unlock_irqrestore(&cm.lock, flags);
- } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&cm.lock, flags);
+
+ id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+ idr_preload_end();
cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
- return ret;
+ return id < 0 ? id : 0;
}
static void cm_free_id(__be32 local_id)
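The cm_alloc_id() change above replaces the old idr_get_new_above()/idr_pre_get() retry loop with the preload-based cyclic allocator: memory is preloaded outside the spinlock so the allocation under the lock can use GFP_NOWAIT. A minimal sketch of that pattern in isolation (the IDR and lock instances are illustrative):

/*
 * Sketch of the idr_preload()/idr_alloc_cyclic() pattern used by
 * cm_alloc_id(). Returns the allocated id (>= 0) or a negative errno.
 */
static DEFINE_IDR(example_idr);
static DEFINE_SPINLOCK(example_idr_lock);

static int example_alloc_id(void *ptr)
{
	unsigned long flags;
	int id;

	idr_preload(GFP_KERNEL);	/* may sleep; done before taking the lock */
	spin_lock_irqsave(&example_idr_lock, flags);
	id = idr_alloc_cyclic(&example_idr, ptr, 0, 0, GFP_NOWAIT);
	spin_unlock_irqrestore(&example_idr_lock, flags);
	idr_preload_end();

	return id;
}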
@@ -464,41 +498,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
return cm_id_priv;
}
-static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask)
-{
- int i;
-
- for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++)
- ((unsigned long *) dst)[i] = ((unsigned long *) src)[i] &
- ((unsigned long *) mask)[i];
-}
-
-static int cm_compare_data(struct ib_cm_compare_data *src_data,
- struct ib_cm_compare_data *dst_data)
-{
- u8 src[IB_CM_COMPARE_SIZE];
- u8 dst[IB_CM_COMPARE_SIZE];
-
- if (!src_data || !dst_data)
- return 0;
-
- cm_mask_copy(src, src_data->data, dst_data->mask);
- cm_mask_copy(dst, dst_data->data, src_data->mask);
- return memcmp(src, dst, IB_CM_COMPARE_SIZE);
-}
-
-static int cm_compare_private_data(u8 *private_data,
- struct ib_cm_compare_data *dst_data)
-{
- u8 src[IB_CM_COMPARE_SIZE];
-
- if (!dst_data)
- return 0;
-
- cm_mask_copy(src, private_data, dst_data->mask);
- return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE);
-}
-
/*
* Trivial helpers to strip endian annotation and compare; the
* endianness doesn't actually matter since we just need a stable
@@ -531,18 +530,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
struct cm_id_private *cur_cm_id_priv;
__be64 service_id = cm_id_priv->id.service_id;
__be64 service_mask = cm_id_priv->id.service_mask;
- int data_cmp;
while (*link) {
parent = *link;
cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
service_node);
- data_cmp = cm_compare_data(cm_id_priv->compare_data,
- cur_cm_id_priv->compare_data);
if ((cur_cm_id_priv->id.service_mask & service_id) ==
(service_mask & cur_cm_id_priv->id.service_id) &&
- (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
- !data_cmp)
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device))
return cur_cm_id_priv;
if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -553,8 +548,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
link = &(*link)->rb_left;
else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
link = &(*link)->rb_right;
- else if (data_cmp < 0)
- link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
}
@@ -564,20 +557,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
}
static struct cm_id_private * cm_find_listen(struct ib_device *device,
- __be64 service_id,
- u8 *private_data)
+ __be64 service_id)
{
struct rb_node *node = cm.listen_service_table.rb_node;
struct cm_id_private *cm_id_priv;
- int data_cmp;
while (node) {
cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
- data_cmp = cm_compare_private_data(private_data,
- cm_id_priv->compare_data);
if ((cm_id_priv->id.service_mask & service_id) ==
cm_id_priv->id.service_id &&
- (cm_id_priv->id.device == device) && !data_cmp)
+ (cm_id_priv->id.device == device))
return cm_id_priv;
if (device < cm_id_priv->id.device)
@@ -588,8 +577,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
node = node->rb_left;
else if (be64_gt(service_id, cm_id_priv->id.service_id))
node = node->rb_right;
- else if (data_cmp < 0)
- node = node->rb_left;
else
node = node->rb_right;
}
@@ -746,6 +733,8 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
spin_lock_init(&cm_id_priv->lock);
init_completion(&cm_id_priv->comp);
INIT_LIST_HEAD(&cm_id_priv->work_list);
+ INIT_LIST_HEAD(&cm_id_priv->prim_list);
+ INIT_LIST_HEAD(&cm_id_priv->altr_list);
atomic_set(&cm_id_priv->work_count, -1);
atomic_set(&cm_id_priv->refcount, 1);
return &cm_id_priv->id;
@@ -813,11 +802,11 @@ static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
}
}
-static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id, gfp_t flags)
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
{
struct cm_timewait_info *timewait_info;
- timewait_info = kzalloc(sizeof *timewait_info, flags);
+ timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
if (!timewait_info)
return ERR_PTR(-ENOMEM);
@@ -831,6 +820,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
{
int wait_time;
unsigned long flags;
+ struct cm_device *cm_dev;
+
+ cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client);
+ if (!cm_dev)
+ return;
spin_lock_irqsave(&cm.lock, flags);
cm_cleanup_timewait(cm_id_priv->timewait_info);
@@ -844,8 +838,14 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
*/
cm_id_priv->id.state = IB_CM_TIMEWAIT;
wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
- queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
- msecs_to_jiffies(wait_time));
+
+ /* Check if the device started its remove_one */
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_dev->going_down)
+ queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+ msecs_to_jiffies(wait_time));
+ spin_unlock_irqrestore(&cm.lock, flags);
+
cm_id_priv->timewait_info = NULL;
}
@@ -873,9 +873,15 @@ retest:
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id->state) {
case IB_CM_LISTEN:
- cm_id->state = IB_CM_IDLE;
spin_unlock_irq(&cm_id_priv->lock);
+
spin_lock_irq(&cm.lock);
+ if (--cm_id_priv->listen_sharecount > 0) {
+ /* The id is still shared. */
+ cm_deref_id(cm_id_priv);
+ spin_unlock_irq(&cm.lock);
+ return;
+ }
rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
spin_unlock_irq(&cm.lock);
break;
@@ -887,8 +893,14 @@ retest:
case IB_CM_SIDR_REQ_RCVD:
spin_unlock_irq(&cm_id_priv->lock);
cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+ spin_lock_irq(&cm.lock);
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
+ rb_erase(&cm_id_priv->sidr_id_node,
+ &cm.remote_sidr_table);
+ spin_unlock_irq(&cm.lock);
break;
case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
spin_unlock_irq(&cm_id_priv->lock);
ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
@@ -907,7 +919,6 @@ retest:
NULL, 0, NULL, 0);
}
break;
- case IB_CM_MRA_REQ_RCVD:
case IB_CM_REP_SENT:
case IB_CM_MRA_REP_RCVD:
ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
@@ -939,12 +950,20 @@ retest:
break;
}
+ spin_lock_irq(&cm.lock);
+ if (!list_empty(&cm_id_priv->altr_list) &&
+ (!cm_id_priv->altr_send_port_not_ready))
+ list_del(&cm_id_priv->altr_list);
+ if (!list_empty(&cm_id_priv->prim_list) &&
+ (!cm_id_priv->prim_send_port_not_ready))
+ list_del(&cm_id_priv->prim_list);
+ spin_unlock_irq(&cm.lock);
+
cm_free_id(cm_id->local_id);
cm_deref_id(cm_id_priv);
wait_for_completion(&cm_id_priv->comp);
while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
cm_free_work(work);
- kfree(cm_id_priv->compare_data);
kfree(cm_id_priv->private_data);
kfree(cm_id_priv);
}
@@ -955,11 +974,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
}
EXPORT_SYMBOL(ib_destroy_cm_id);
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
- struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+ __be64 service_mask)
{
struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
- unsigned long flags;
int ret = 0;
service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -972,20 +1003,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
if (cm_id->state != IB_CM_IDLE)
return -EINVAL;
- if (compare_data) {
- cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
- GFP_KERNEL);
- if (!cm_id_priv->compare_data)
- return -ENOMEM;
- cm_mask_copy(cm_id_priv->compare_data->data,
- compare_data->data, compare_data->mask);
- memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
- IB_CM_COMPARE_SIZE);
- }
-
cm_id->state = IB_CM_LISTEN;
+ ++cm_id_priv->listen_sharecount;
- spin_lock_irqsave(&cm.lock, flags);
if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
cm_id->service_mask = ~cpu_to_be64(0);
@@ -994,18 +1014,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
cm_id->service_mask = service_mask;
}
cur_cm_id_priv = cm_insert_listen(cm_id_priv);
- spin_unlock_irqrestore(&cm.lock, flags);
if (cur_cm_id_priv) {
cm_id->state = IB_CM_IDLE;
- kfree(cm_id_priv->compare_data);
- cm_id_priv->compare_data = NULL;
+ --cm_id_priv->listen_sharecount;
ret = -EBUSY;
}
return ret;
}
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ ret = __ib_cm_listen(cm_id, service_id, service_mask);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ return ret;
+}
EXPORT_SYMBOL(ib_cm_listen);
+/**
+ * Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_id *cm_id;
+ unsigned long flags;
+ int err = 0;
+
+ /* Create an ID in advance, since the creation may sleep */
+ cm_id = ib_create_cm_id(device, cm_handler, NULL);
+ if (IS_ERR(cm_id))
+ return cm_id;
+
+ spin_lock_irqsave(&cm.lock, flags);
+
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+ goto new_id;
+
+ /* Find an existing ID */
+ cm_id_priv = cm_find_listen(device, service_id);
+ if (cm_id_priv) {
+ if (cm_id->cm_handler != cm_handler || cm_id->context) {
+ /* Sharing an ib_cm_id with different handlers is not
+ * supported */
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return ERR_PTR(-EINVAL);
+ }
+ atomic_inc(&cm_id_priv->refcount);
+ ++cm_id_priv->listen_sharecount;
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ ib_destroy_cm_id(cm_id);
+ cm_id = &cm_id_priv->id;
+ return cm_id;
+ }
+
+new_id:
+ /* Use newly created ID */
+ err = __ib_cm_listen(cm_id, service_id, 0);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (err) {
+ ib_destroy_cm_id(cm_id);
+ return ERR_PTR(err);
+ }
+ return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
enum cm_msg_sequence msg_seq)
{
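The new ib_cm_insert_listen() above lets several callers share a single listening ib_cm_id per device and service ID as long as they use the same handler. A minimal caller sketch follows; the handler and service-ID value are illustrative, not part of this commit.

/* Illustrative handler; a real ULP dispatches on event->event here.
 * Returning non-zero asks the CM to destroy the id. */
static int example_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	return 0;
}

static struct ib_cm_id *example_listen(struct ib_device *device)
{
	struct ib_cm_id *id;

	/* Repeated calls with the same device, service ID and handler return
	 * the existing ib_cm_id with an extra reference. */
	id = ib_cm_insert_listen(device, example_cm_handler,
	    cpu_to_be64(0x1234ULL));	/* illustrative service ID */
	if (IS_ERR(id))
		return id;

	/* Every successful call must be balanced by ib_destroy_cm_id(). */
	return id;
}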
@@ -1058,7 +1155,7 @@ static void cm_format_req(struct cm_req_msg *req_msg,
cm_req_set_resp_res(req_msg, param->responder_resources);
cm_req_set_retry_count(req_msg, param->retry_count);
cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
- cm_req_set_srq(req_msg, param->srq);
+ cm_req_set_srq(req_msg, param->srq);
}
if (pri_path->hop_limit <= 1) {
@@ -1150,28 +1247,28 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_IDLE) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
- id.local_id,
- GFP_ATOMIC);
+ id.local_id);
if (IS_ERR(cm_id_priv->timewait_info)) {
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- return (PTR_ERR(cm_id_priv->timewait_info));
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ goto out;
}
- ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
- if (!ret && param->alternate_path) {
+ ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av,
+ cm_id_priv);
+ if (ret)
+ goto error1;
+ if (param->alternate_path) {
ret = cm_init_av_by_path(param->alternate_path,
- &cm_id_priv->alt_av);
- }
- if (ret) {
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ &cm_id_priv->alt_av, cm_id_priv);
+ if (ret)
goto error1;
}
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-
cm_id->service_id = param->service_id;
cm_id->service_mask = ~cpu_to_be64(0);
cm_id_priv->timeout_ms = cm_convert_to_ms(
@@ -1210,11 +1307,9 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
-error2:
- cm_free_msg(cm_id_priv->msg);
-error1:
- kfree(cm_id_priv->timewait_info);
- return ret;
+error2: cm_free_msg(cm_id_priv->msg);
+error1: kfree(cm_id_priv->timewait_info);
+out: return ret;
}
EXPORT_SYMBOL(ib_send_cm_req);
@@ -1254,14 +1349,6 @@ static int cm_issue_rej(struct cm_port *port,
return ret;
}
-static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
- __be32 local_qpn, __be32 remote_qpn)
-{
- return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) ||
- ((local_ca_guid == remote_ca_guid) &&
- (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn))));
-}
-
static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
struct ib_sa_path_rec *primary_path,
struct ib_sa_path_rec *alt_path)
@@ -1285,6 +1372,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
primary_path->packet_life_time =
cm_req_get_primary_local_ack_timeout(req_msg);
primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+ primary_path->service_id = req_msg->service_id;
if (req_msg->alt_local_lid) {
memset(alt_path, 0, sizeof *alt_path);
@@ -1306,7 +1394,26 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
alt_path->packet_life_time =
cm_req_get_alt_local_ack_timeout(req_msg);
alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ alt_path->service_id = req_msg->service_id;
+ }
+}
+
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+ struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+ u8 port_num = work->port->port_num;
+ u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+ u16 pkey;
+ int ret;
+
+ ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+ if (ret) {
+ dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+ port_num, pkey_index, ret);
+ return 0;
}
+
+ return pkey;
}
static void cm_format_req_event(struct cm_work *work,
@@ -1319,6 +1426,7 @@ static void cm_format_req_event(struct cm_work *work,
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.req_rcvd;
param->listen_id = listen_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = cm_id_priv->av.port->port_num;
param->primary_path = &work->path[0];
if (req_msg->alt_local_lid)
@@ -1501,8 +1609,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
/* Find matching listen request. */
listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
- req_msg->service_id,
- req_msg->private_data);
+ req_msg->service_id);
if (!listen_cm_id_priv) {
cm_cleanup_timewait(cm_id_priv->timewait_info);
spin_unlock_irq(&cm.lock);
@@ -1553,6 +1660,8 @@ static int cm_req_handler(struct cm_work *work)
struct ib_cm_id *cm_id;
struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
struct cm_req_msg *req_msg;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
int ret;
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
@@ -1567,8 +1676,7 @@ static int cm_req_handler(struct cm_work *work)
work->mad_recv_wc->recv_buf.grh,
&cm_id_priv->av);
cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
- id.local_id,
- GFP_KERNEL);
+ id.local_id);
if (IS_ERR(cm_id_priv->timewait_info)) {
ret = PTR_ERR(cm_id_priv->timewait_info);
goto destroy;
@@ -1592,20 +1700,41 @@ static int cm_req_handler(struct cm_work *work)
cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
- /* Workarround: path in req_msg doesn't contain MAC, take it from wc */
- memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, 6);
- work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
- ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+ memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
+ work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit;
+ ret = ib_get_cached_gid(work->port->cm_dev->ib_device,
+ work->port->port_num,
+ cm_id_priv->av.ah_attr.grh.sgid_index,
+ &gid, &gid_attr);
+ if (!ret) {
+ if (gid_attr.ndev) {
+ work->path[0].ifindex = gid_attr.ndev->if_index;
+ work->path[0].net = dev_net(gid_attr.ndev);
+ dev_put(gid_attr.ndev);
+ }
+ work->path[0].gid_type = gid_attr.gid_type;
+ ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av,
+ cm_id_priv);
+ }
if (ret) {
- ib_get_cached_gid(work->port->cm_dev->ib_device,
- work->port->port_num, 0, &work->path[0].sgid);
+ int err = ib_get_cached_gid(work->port->cm_dev->ib_device,
+ work->port->port_num, 0,
+ &work->path[0].sgid,
+ &gid_attr);
+ if (!err && gid_attr.ndev) {
+ work->path[0].ifindex = gid_attr.ndev->if_index;
+ work->path[0].net = dev_net(gid_attr.ndev);
+ dev_put(gid_attr.ndev);
+ }
+ work->path[0].gid_type = gid_attr.gid_type;
ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
&work->path[0].sgid, sizeof work->path[0].sgid,
NULL, 0);
goto rejected;
}
if (req_msg->alt_local_lid) {
- ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+ ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av,
+ cm_id_priv);
if (ret) {
ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
&work->path[0].sgid,
@@ -1687,7 +1816,6 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_REQ_RCVD &&
cm_id->state != IB_CM_MRA_REQ_SENT) {
- pr_debug("cm_id->state: %d\n", cm_id->state);
ret = -EINVAL;
goto out;
}
@@ -1754,7 +1882,6 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id,
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_REP_RCVD &&
cm_id->state != IB_CM_MRA_REP_SENT) {
- pr_debug("cm_id->state: %d\n", cm_id->state);
ret = -EINVAL;
goto error;
}
@@ -1859,7 +1986,6 @@ static int cm_rep_handler(struct cm_work *work)
cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
if (!cm_id_priv) {
cm_dup_rep_handler(work);
- pr_debug("no cm_id_priv\n");
return -EINVAL;
}
@@ -1873,7 +1999,6 @@ static int cm_rep_handler(struct cm_work *work)
default:
spin_unlock_irq(&cm_id_priv->lock);
ret = -EINVAL;
- pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state);
goto error;
}
@@ -1887,7 +2012,6 @@ static int cm_rep_handler(struct cm_work *work)
spin_unlock(&cm.lock);
spin_unlock_irq(&cm_id_priv->lock);
ret = -EINVAL;
- pr_debug("Failed to insert remote id\n");
goto error;
}
/* Check for a stale connection. */
@@ -1901,7 +2025,6 @@ static int cm_rep_handler(struct cm_work *work)
IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
NULL, 0);
ret = -EINVAL;
- pr_debug("Stale connection.\n");
goto error;
}
spin_unlock(&cm.lock);
@@ -2042,7 +2165,6 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id,
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state != IB_CM_ESTABLISHED) {
- pr_debug("cm_id->state: %d\n", cm_id->state);
ret = -EINVAL;
goto out;
}
@@ -2112,7 +2234,6 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id,
if (cm_id->state != IB_CM_DREQ_RCVD) {
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
kfree(data);
- pr_debug("cm_id->state(%d) != IB_CM_DREQ_RCVD\n", cm_id->state);
return -EINVAL;
}
@@ -2178,7 +2299,6 @@ static int cm_dreq_handler(struct cm_work *work)
atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
counter[CM_DREQ_COUNTER]);
cm_issue_drep(work->port, work->mad_recv_wc);
- pr_debug("no cm_id_priv\n");
return -EINVAL;
}
@@ -2219,7 +2339,6 @@ static int cm_dreq_handler(struct cm_work *work)
counter[CM_DREQ_COUNTER]);
goto unlock;
default:
- pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state);
goto unlock;
}
cm_id_priv->id.state = IB_CM_DREQ_RCVD;
@@ -2323,7 +2442,6 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
cm_enter_timewait(cm_id_priv);
break;
default:
- pr_debug("cm_id->state: 0x%x\n", cm_id->state);
ret = -EINVAL;
goto out;
}
@@ -2428,13 +2546,12 @@ static int cm_rej_handler(struct cm_work *work)
if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
ib_cancel_mad(cm_id_priv->av.port->mad_agent,
cm_id_priv->msg);
- cm_enter_timewait(cm_id_priv);
- break;
+ cm_enter_timewait(cm_id_priv);
+ break;
}
/* fall through */
default:
spin_unlock_irq(&cm_id_priv->lock);
- pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
goto out;
}
@@ -2497,7 +2614,6 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
break;
}
default:
- pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
goto error1;
}
@@ -2599,7 +2715,6 @@ static int cm_mra_handler(struct cm_work *work)
counter[CM_MRA_COUNTER]);
/* fall through */
default:
- pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
goto out;
}
@@ -2674,7 +2789,8 @@ int ib_send_cm_lap(struct ib_cm_id *cm_id,
goto out;
}
- ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+ ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av,
+ cm_id_priv);
if (ret)
goto out;
cm_id_priv->alt_av.timeout =
@@ -2786,8 +2902,8 @@ static int cm_lap_handler(struct cm_work *work)
cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
work->mad_recv_wc->recv_buf.grh,
&cm_id_priv->av);
- if (cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av))
- goto unlock;
+ cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av,
+ cm_id_priv);
ret = atomic_inc_and_test(&cm_id_priv->work_count);
if (!ret)
list_add_tail(&work->list, &cm_id_priv->work_list);
@@ -2979,10 +3095,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
return -EINVAL;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-
- spin_lock_irqsave(&cm_id_priv->lock, flags);
-
- ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
+ ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv);
if (ret)
goto out;
@@ -2999,19 +3112,21 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
msg->timeout_ms = cm_id_priv->timeout_ms;
msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id->state == IB_CM_IDLE)
ret = ib_post_send_mad(msg, NULL);
else
ret = -EINVAL;
if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
cm_free_msg(msg);
goto out;
}
cm_id->state = IB_CM_SIDR_REQ_SENT;
cm_id_priv->msg = msg;
-out:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+out:
return ret;
}
EXPORT_SYMBOL(ib_send_cm_sidr_req);
@@ -3027,6 +3142,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,
param = &work->cm_event.param.sidr_req_rcvd;
param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
param->listen_id = listen_id;
+ param->service_id = sidr_req_msg->service_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = work->port->port_num;
work->cm_event.private_data = &sidr_req_msg->private_data;
}
@@ -3066,8 +3183,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
}
cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
cur_cm_id_priv = cm_find_listen(cm_id->device,
- sidr_req_msg->service_id,
- sidr_req_msg->private_data);
+ sidr_req_msg->service_id);
if (!cur_cm_id_priv) {
spin_unlock_irq(&cm.lock);
cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
@@ -3147,7 +3263,10 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
spin_lock_irqsave(&cm.lock, flags);
- rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
+ rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+ }
spin_unlock_irqrestore(&cm.lock, flags);
return 0;
@@ -3339,7 +3458,6 @@ static void cm_work_handler(struct work_struct *_work)
ret = cm_timewait_handler(work);
break;
default:
- pr_debug("work->cm_event.event: 0x%x\n", work->cm_event.event);
ret = -EINVAL;
break;
}
@@ -3353,6 +3471,11 @@ static int cm_establish(struct ib_cm_id *cm_id)
struct cm_work *work;
unsigned long flags;
int ret = 0;
+ struct cm_device *cm_dev;
+
+ cm_dev = ib_get_client_data(cm_id->device, &cm_client);
+ if (!cm_dev)
+ return -ENODEV;
work = kmalloc(sizeof *work, GFP_ATOMIC);
if (!work)
@@ -3370,7 +3493,6 @@ static int cm_establish(struct ib_cm_id *cm_id)
ret = -EISCONN;
break;
default:
- pr_debug("cm_id->state: 0x%x\n", cm_id->state);
ret = -EINVAL;
break;
}
@@ -3392,7 +3514,17 @@ static int cm_establish(struct ib_cm_id *cm_id)
work->remote_id = cm_id->remote_id;
work->mad_recv_wc = NULL;
work->cm_event.event = IB_CM_USER_ESTABLISHED;
- queue_delayed_work(cm.wq, &work->work, 0);
+
+ /* Check if the device started its remove_one */
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_dev->going_down) {
+ queue_delayed_work(cm.wq, &work->work, 0);
+ } else {
+ kfree(work);
+ ret = -ENODEV;
+ }
+ spin_unlock_irqrestore(&cm.lock, flags);
+
out:
return ret;
}
@@ -3400,7 +3532,9 @@ out:
static int cm_migrate(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
+ struct cm_av tmp_av;
unsigned long flags;
+ int tmp_send_port_not_ready;
int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3409,7 +3543,14 @@ static int cm_migrate(struct ib_cm_id *cm_id)
(cm_id->lap_state == IB_CM_LAP_UNINIT ||
cm_id->lap_state == IB_CM_LAP_IDLE)) {
cm_id->lap_state = IB_CM_LAP_IDLE;
+ /* Swap address vector */
+ tmp_av = cm_id_priv->av;
cm_id_priv->av = cm_id_priv->alt_av;
+ cm_id_priv->alt_av = tmp_av;
+ /* Swap port send ready state */
+ tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready;
+ cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready;
+ cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready;
} else
ret = -EINVAL;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
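cm_migrate() now swaps both the address vector and the per-port send-ready flag, so MADs sent after an automatic path migration go out on the former alternate port. It is reached through ib_cm_notify(); a typical consumer forwards the QP's path-migration event, roughly as below (the handler name and its wiring into the QP are illustrative):

	/* Sketch: forward a QP path-migration event to the CM. */
	static void my_qp_event_handler(struct ib_event *event, void *context)
	{
		struct ib_cm_id *cm_id = context;

		if (event->event == IB_EVENT_PATH_MIG)
			ib_cm_notify(cm_id, IB_EVENT_PATH_MIG);	/* triggers the av/alt_av swap above */
	}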
@@ -3436,6 +3577,7 @@ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
EXPORT_SYMBOL(ib_cm_notify);
static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct cm_port *port = mad_agent->context;
@@ -3443,6 +3585,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
enum ib_cm_event_type event;
u16 attr_id;
int paths = 0;
+ int going_down = 0;
switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
case CM_REQ_ATTR_ID:
@@ -3501,7 +3644,19 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
work->cm_event.event = event;
work->mad_recv_wc = mad_recv_wc;
work->port = port;
- queue_delayed_work(cm.wq, &work->work, 0);
+
+ /* Check if the device started its remove_one */
+ spin_lock_irq(&cm.lock);
+ if (!port->cm_dev->going_down)
+ queue_delayed_work(cm.wq, &work->work, 0);
+ else
+ going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
+ if (going_down) {
+ kfree(work);
+ ib_free_recv_mad(mad_recv_wc);
+ }
}
static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
@@ -3533,7 +3688,6 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
ret = 0;
break;
default:
- pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
break;
}
@@ -3560,31 +3714,6 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
IB_QP_DEST_QPN | IB_QP_RQ_PSN;
qp_attr->ah_attr = cm_id_priv->av.ah_attr;
- if (!cm_id_priv->av.valid)
- return -EINVAL;
- if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) {
- qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id;
- *qp_attr_mask |= IB_QP_VID;
- }
- if (!is_zero_ether_addr(cm_id_priv->av.smac)) {
- memcpy(qp_attr->smac, cm_id_priv->av.smac,
- sizeof(qp_attr->smac));
- *qp_attr_mask |= IB_QP_SMAC;
- }
- if (cm_id_priv->alt_av.valid) {
- if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) {
- qp_attr->alt_vlan_id =
- cm_id_priv->alt_av.ah_attr.vlan_id;
- *qp_attr_mask |= IB_QP_ALT_VID;
- }
- if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) {
- memcpy(qp_attr->alt_smac,
- cm_id_priv->alt_av.smac,
- sizeof(qp_attr->alt_smac));
- *qp_attr_mask |= IB_QP_ALT_SMAC;
- }
- }
-
qp_attr->path_mtu = cm_id_priv->path_mtu;
qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
@@ -3606,7 +3735,6 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
ret = 0;
break;
default:
- pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
break;
}
@@ -3666,7 +3794,6 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
ret = 0;
break;
default:
- pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state);
ret = -EINVAL;
break;
}
@@ -3693,7 +3820,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
break;
default:
- pr_debug("qp_attr->qp_state: 0x%x\n", qp_attr->qp_state);
ret = -EINVAL;
break;
}
@@ -3701,16 +3827,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
}
EXPORT_SYMBOL(ib_cm_init_qp_attr);
-static void cm_get_ack_delay(struct cm_device *cm_dev)
-{
- struct ib_device_attr attr;
-
- if (ib_query_device(cm_dev->ib_device, &attr))
- cm_dev->ack_delay = 0; /* acks will rely on packet life time */
- else
- cm_dev->ack_delay = attr.local_ca_ack_delay;
-}
-
static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
char *buf)
{
@@ -3806,26 +3922,24 @@ static void cm_add_one(struct ib_device *ib_device)
struct cm_port *port;
struct ib_mad_reg_req reg_req = {
.mgmt_class = IB_MGMT_CLASS_CM,
- .mgmt_class_version = IB_CM_CLASS_VERSION
+ .mgmt_class_version = IB_CM_CLASS_VERSION,
};
struct ib_port_modify port_modify = {
.set_port_cap_mask = IB_PORT_CM_SUP
};
unsigned long flags;
int ret;
+ int count = 0;
u8 i;
- if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
ib_device->phys_port_cnt, GFP_KERNEL);
if (!cm_dev)
return;
cm_dev->ib_device = ib_device;
- cm_get_ack_delay(cm_dev);
-
+ cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
+ cm_dev->going_down = 0;
cm_dev->device = device_create(&cm_class, &ib_device->dev,
MKDEV(0, 0), NULL,
"%s", ib_device->name);
@@ -3836,6 +3950,9 @@ static void cm_add_one(struct ib_device *ib_device)
set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
port = kzalloc(sizeof *port, GFP_KERNEL);
if (!port)
goto error1;
@@ -3844,6 +3961,9 @@ static void cm_add_one(struct ib_device *ib_device)
port->cm_dev = cm_dev;
port->port_num = i;
+ INIT_LIST_HEAD(&port->cm_priv_prim_list);
+ INIT_LIST_HEAD(&port->cm_priv_altr_list);
+
ret = cm_create_port_fs(port);
if (ret)
goto error1;
@@ -3854,14 +3974,21 @@ static void cm_add_one(struct ib_device *ib_device)
0,
cm_send_handler,
cm_recv_handler,
- port);
+ port,
+ 0);
if (IS_ERR(port->mad_agent))
goto error2;
ret = ib_modify_port(ib_device, i, 0, &port_modify);
if (ret)
goto error3;
+
+ count++;
}
+
+ if (!count)
+ goto free;
+
ib_set_client_data(ib_device, &cm_client, cm_dev);
write_lock_irqsave(&cm.device_lock, flags);
@@ -3877,26 +4004,31 @@ error1:
port_modify.set_port_cap_mask = 0;
port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
while (--i) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
ib_unregister_mad_agent(port->mad_agent);
cm_remove_port_fs(port);
}
+free:
device_unregister(cm_dev->device);
kfree(cm_dev);
}
-static void cm_remove_one(struct ib_device *ib_device)
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
{
- struct cm_device *cm_dev;
+ struct cm_device *cm_dev = client_data;
struct cm_port *port;
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_agent *cur_mad_agent;
struct ib_port_modify port_modify = {
.clr_port_cap_mask = IB_PORT_CM_SUP
};
unsigned long flags;
int i;
- cm_dev = ib_get_client_data(ib_device, &cm_client);
if (!cm_dev)
return;
@@ -3904,13 +4036,37 @@ static void cm_remove_one(struct ib_device *ib_device)
list_del(&cm_dev->list);
write_unlock_irqrestore(&cm.device_lock, flags);
+ spin_lock_irq(&cm.lock);
+ cm_dev->going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
port = cm_dev->port[i-1];
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
- ib_unregister_mad_agent(port->mad_agent);
+ /* Mark all the cm_ids on this port as not valid for sending */
+ spin_lock_irq(&cm.lock);
+ list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list)
+ cm_id_priv->altr_send_port_not_ready = 1;
+ list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list)
+ cm_id_priv->prim_send_port_not_ready = 1;
+ spin_unlock_irq(&cm.lock);
+ /*
+ * Flush the queue after going_down has been set; this ensures
+ * that no new work can be queued by the receive handler, after
+ * which it is safe to call ib_unregister_mad_agent().
+ */
flush_workqueue(cm.wq);
+ spin_lock_irq(&cm.state_lock);
+ cur_mad_agent = port->mad_agent;
+ port->mad_agent = NULL;
+ spin_unlock_irq(&cm.state_lock);
+ ib_unregister_mad_agent(cur_mad_agent);
cm_remove_port_fs(port);
}
+
device_unregister(cm_dev->device);
kfree(cm_dev);
}
@@ -3923,6 +4079,7 @@ static int __init ib_cm_init(void)
INIT_LIST_HEAD(&cm.device_list);
rwlock_init(&cm.device_lock);
spin_lock_init(&cm.lock);
+ spin_lock_init(&cm.state_lock);
cm.listen_service_table = RB_ROOT;
cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
cm.remote_id_table = RB_ROOT;
@@ -3930,8 +4087,6 @@ static int __init ib_cm_init(void)
cm.remote_sidr_table = RB_ROOT;
idr_init(&cm.local_id_table);
get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
- if (!idr_pre_get(&cm.local_id_table, GFP_KERNEL))
- return -ENOMEM;
INIT_LIST_HEAD(&cm.timewait_list);
ret = class_register(&cm_class);
diff --git a/sys/ofed/drivers/infiniband/core/cm_msgs.h b/sys/ofed/drivers/infiniband/core/cm_msgs.h
index be068f47e47e8..8b76f0ef965e8 100644
--- a/sys/ofed/drivers/infiniband/core/cm_msgs.h
+++ b/sys/ofed/drivers/infiniband/core/cm_msgs.h
@@ -103,7 +103,7 @@ struct cm_req_msg {
/* local ACK timeout:5, rsvd:3 */
u8 alt_offset139;
- u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE];
+ u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
} __attribute__ ((packed));
@@ -801,7 +801,7 @@ struct cm_sidr_req_msg {
__be16 rsvd;
__be64 service_id;
- u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE];
+ u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
} __attribute__ ((packed));
struct cm_sidr_rep_msg {
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c
index 2427be8811d59..8ac661fc4ef8a 100644
--- a/sys/ofed/drivers/infiniband/core/cma.c
+++ b/sys/ofed/drivers/infiniband/core/cma.c
@@ -3,7 +3,6 @@
* Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
* Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
- * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -45,71 +44,67 @@
#include <linux/inetdevice.h>
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/string.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/ipv6.h>
+#include <netinet6/scope6_var.h>
+#include <netinet6/ip6_var.h>
+
#include <rdma/rdma_cm.h>
#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib.h>
+#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_cm.h>
#include <rdma/ib_sa.h>
#include <rdma/iw_cm.h>
+#include <sys/priv.h>
+
+#include "core_priv.h"
+
MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("Generic RDMA CM Agent");
MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000
#define CMA_MAX_CM_RETRIES 15
#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 18
-static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
-module_param_named(cma_response_timeout, cma_response_timeout, int, 0644);
-MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT (default=20)");
-
-static int def_prec2sl = 3;
-module_param_named(def_prec2sl, def_prec2sl, int, 0644);
-MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
-
-static int unify_tcp_port_space = 1;
-module_param(unify_tcp_port_space, int, 0644);
-MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
- "space allocation (default=1)");
-
-static int debug_level = 0;
-#define cma_pr(level, priv, format, arg...) \
- printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg)
-
-#define cma_dbg(priv, format, arg...) \
- do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0)
-
-#define cma_warn(priv, format, arg...) \
- cma_pr(KERN_WARNING, priv, format, ## arg)
-
-#define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x"
-#define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\
- ((u8 *)(gid))[13],\
- ((u8 *)(gid))[14],\
- ((u8 *)(gid))[15]
-
-#define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw)
-#define cma_debug_path(priv, pfx, p) \
- cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" \
- CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), \
- CMA_GID_ARG(p.dgid))
+static const char * const cma_events[] = {
+ [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved",
+ [RDMA_CM_EVENT_ADDR_ERROR] = "address error",
+ [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved",
+ [RDMA_CM_EVENT_ROUTE_ERROR] = "route error",
+ [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request",
+ [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response",
+ [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error",
+ [RDMA_CM_EVENT_UNREACHABLE] = "unreachable",
+ [RDMA_CM_EVENT_REJECTED] = "rejected",
+ [RDMA_CM_EVENT_ESTABLISHED] = "established",
+ [RDMA_CM_EVENT_DISCONNECTED] = "disconnected",
+ [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal",
+ [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join",
+ [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error",
+ [RDMA_CM_EVENT_ADDR_CHANGE] = "address change",
+ [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit",
+};
-#define cma_debug_gid(priv, g) \
- cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g)
+const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
+{
+ size_t index = event;
-module_param_named(debug_level, debug_level, int, 0644);
-MODULE_PARM_DESC(debug_level, "debug level default=0");
+ return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ?
+ cma_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(rdma_event_msg);
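rdma_event_msg() gives ULPs a stable way to print event names instead of raw enum values. A minimal usage sketch inside an rdma_cm event handler (handler name and registration are assumptions, not part of this patch):

	static int my_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
	{
		/* e.g. "cma event established, status 0" */
		pr_info("cma event %s, status %d\n",
			rdma_event_msg(event->event), event->status);
		return 0;
	}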
static void cma_add_one(struct ib_device *device);
-static void cma_remove_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cma_client = {
.name = "cma",
@@ -123,12 +118,44 @@ static LIST_HEAD(dev_list);
static LIST_HEAD(listen_any_list);
static DEFINE_MUTEX(lock);
static struct workqueue_struct *cma_wq;
-static struct workqueue_struct *cma_free_wq;
-static DEFINE_IDR(sdp_ps);
-static DEFINE_IDR(tcp_ps);
-static DEFINE_IDR(udp_ps);
-static DEFINE_IDR(ipoib_ps);
-static DEFINE_IDR(ib_ps);
+
+struct cma_pernet {
+ struct idr tcp_ps;
+ struct idr udp_ps;
+ struct idr ipoib_ps;
+ struct idr ib_ps;
+};
+
+VNET_DEFINE(struct cma_pernet, cma_pernet);
+
+static struct cma_pernet *cma_pernet_ptr(struct vnet *vnet)
+{
+ struct cma_pernet *retval;
+
+ CURVNET_SET_QUIET(vnet);
+ retval = &VNET(cma_pernet);
+ CURVNET_RESTORE();
+
+ return (retval);
+}
+
+static struct idr *cma_pernet_idr(struct vnet *net, enum rdma_port_space ps)
+{
+ struct cma_pernet *pernet = cma_pernet_ptr(net);
+
+ switch (ps) {
+ case RDMA_PS_TCP:
+ return &pernet->tcp_ps;
+ case RDMA_PS_UDP:
+ return &pernet->udp_ps;
+ case RDMA_PS_IPOIB:
+ return &pernet->ipoib_ps;
+ case RDMA_PS_IB:
+ return &pernet->ib_ps;
+ default:
+ return NULL;
+ }
+}
struct cma_device {
struct list_head list;
@@ -136,18 +163,112 @@ struct cma_device {
struct completion comp;
atomic_t refcount;
struct list_head id_list;
+ struct sysctl_ctx_list sysctl_ctx;
+ enum ib_gid_type *default_gid_type;
};
struct rdma_bind_list {
- struct idr *ps;
+ enum rdma_port_space ps;
struct hlist_head owners;
unsigned short port;
};
+struct class_port_info_context {
+ struct ib_class_port_info *class_port_info;
+ struct ib_device *device;
+ struct completion done;
+ struct ib_sa_query *sa_query;
+ u8 port_num;
+};
+
+static int cma_ps_alloc(struct vnet *vnet, enum rdma_port_space ps,
+ struct rdma_bind_list *bind_list, int snum)
+{
+ struct idr *idr = cma_pernet_idr(vnet, ps);
+
+ return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(struct vnet *net,
+ enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(struct vnet *net, enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ idr_remove(idr, snum);
+}
+
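The per-VNET idrs replace the former global port-space idrs, and the three helpers above are their only accessors. A sketch of how a bind path would reserve a port in the caller's vnet and later look it up again (error handling trimmed, bind_list allocation assumed):

	/* Sketch: reserve TCP port 'snum' in 'net', then find the owner again. */
	static int example_bind(struct vnet *net, struct rdma_bind_list *bind_list, int snum)
	{
		int ret;

		ret = cma_ps_alloc(net, RDMA_PS_TCP, bind_list, snum);
		if (ret < 0)
			return ret;		/* port already taken or out of memory */

		WARN_ON(cma_ps_find(net, RDMA_PS_TCP, snum) != bind_list);
		return 0;
	}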
enum {
CMA_OPTION_AFONLY,
};
+void cma_ref_dev(struct cma_device *cma_dev)
+{
+ atomic_inc(&cma_dev->refcount);
+}
+
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie)
+{
+ struct cma_device *cma_dev;
+ struct cma_device *found_cma_dev = NULL;
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(cma_dev, &dev_list, list)
+ if (filter(cma_dev->device, cookie)) {
+ found_cma_dev = cma_dev;
+ break;
+ }
+
+ if (found_cma_dev)
+ cma_ref_dev(found_cma_dev);
+ mutex_unlock(&lock);
+ return found_cma_dev;
+}
+
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port)
+{
+ if (port < rdma_start_port(cma_dev->device) ||
+ port > rdma_end_port(cma_dev->device))
+ return -EINVAL;
+
+ return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)];
+}
+
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port,
+ enum ib_gid_type default_gid_type)
+{
+ unsigned long supported_gids;
+
+ if (port < rdma_start_port(cma_dev->device) ||
+ port > rdma_end_port(cma_dev->device))
+ return -EINVAL;
+
+ supported_gids = roce_gid_type_mask_support(cma_dev->device, port);
+
+ if (!(supported_gids & 1 << default_gid_type))
+ return -EINVAL;
+
+ cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] =
+ default_gid_type;
+
+ return 0;
+}
+
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev)
+{
+ return cma_dev->device;
+}
+
/*
* Device removal can occur at anytime, so we need extra handling to
* serialize notifying the user of device removal with other callbacks.
@@ -158,7 +279,6 @@ struct rdma_id_private {
struct rdma_cm_id id;
struct rdma_bind_list *bind_list;
- struct socket *sock;
struct hlist_node node;
struct list_head list; /* listen_any_list or cma_device.list */
struct list_head listen_list; /* per device listens */
@@ -168,13 +288,11 @@ struct rdma_id_private {
int internal_id;
enum rdma_cm_state state;
spinlock_t lock;
- spinlock_t cm_lock;
struct mutex qp_mutex;
struct completion comp;
atomic_t refcount;
struct mutex handler_mutex;
- struct work_struct work; /* garbage coll */
int backlog;
int timeout_ms;
@@ -194,10 +312,7 @@ struct rdma_id_private {
u8 tos;
u8 reuseaddr;
u8 afonly;
- int qp_timeout;
- /* cache for mc record params */
- struct ib_sa_mcmember_rec rec;
- int is_valid_rec;
+ enum ib_gid_type gid_type;
};
struct cma_multicast {
@@ -209,6 +324,8 @@ struct cma_multicast {
void *context;
struct sockaddr_storage addr;
struct kref mcref;
+ bool igmp_joined;
+ u8 join_state;
};
struct cma_work {
@@ -247,25 +364,17 @@ struct cma_hdr {
union cma_ip_addr dst_addr;
};
-struct sdp_hh {
- u8 bsdh[16];
- u8 sdp_version; /* Major version: 7:4 */
- u8 ip_version; /* IP version: 7:4 */
- u8 sdp_specific1[10];
- __be16 port;
- __be16 sdp_specific2;
- union cma_ip_addr src_addr;
- union cma_ip_addr dst_addr;
-};
+#define CMA_VERSION 0x00
-struct sdp_hah {
- u8 bsdh[16];
- u8 sdp_version;
+struct cma_req_info {
+ struct ib_device *device;
+ int port;
+ union ib_gid local_gid;
+ __be64 service_id;
+ u16 pkey;
+ bool has_gid:1;
};
-#define CMA_VERSION 0x00
-#define SDP_MAJ_VERSION 0x2
-
static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
{
unsigned long flags;
@@ -303,7 +412,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
return old;
}
-static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
{
return hdr->ip_version >> 4;
}
@@ -313,33 +422,28 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
}
-static inline u8 sdp_get_majv(u8 sdp_version)
-{
- return sdp_version >> 4;
-}
-
-static inline u8 sdp_get_ip_ver(struct sdp_hh *hh)
-{
- return hh->ip_version >> 4;
-}
-
-static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
+static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
{
- hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
-}
-
-static void cma_attach_to_dev(struct rdma_id_private *id_priv,
- struct cma_device *cma_dev)
-{
- atomic_inc(&cma_dev->refcount);
+ cma_ref_dev(cma_dev);
id_priv->cma_dev = cma_dev;
+ id_priv->gid_type = 0;
id_priv->id.device = cma_dev->device;
id_priv->id.route.addr.dev_addr.transport =
rdma_node_get_transport(cma_dev->device->node_type);
list_add_tail(&id_priv->list, &cma_dev->id_list);
}
-static inline void cma_deref_dev(struct cma_device *cma_dev)
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ _cma_attach_to_dev(id_priv, cma_dev);
+ id_priv->gid_type =
+ cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(cma_dev->device)];
+}
+
+void cma_deref_dev(struct cma_device *cma_dev)
{
if (atomic_dec_and_test(&cma_dev->refcount))
complete(&cma_dev->comp);
@@ -362,16 +466,40 @@ static void cma_release_dev(struct rdma_id_private *id_priv)
mutex_unlock(&lock);
}
-static int cma_set_qkey(struct rdma_id_private *id_priv)
+static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
+}
+
+static inline unsigned short cma_family(struct rdma_id_private *id_priv)
+{
+ return id_priv->id.route.addr.src_addr.ss_family;
+}
+
+static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
{
struct ib_sa_mcmember_rec rec;
int ret = 0;
- if (id_priv->qkey)
+ if (id_priv->qkey) {
+ if (qkey && id_priv->qkey != qkey)
+ return -EINVAL;
+ return 0;
+ }
+
+ if (qkey) {
+ id_priv->qkey = qkey;
return 0;
+ }
switch (id_priv->id.ps) {
case RDMA_PS_UDP:
+ case RDMA_PS_IB:
id_priv->qkey = RDMA_UDP_QKEY;
break;
case RDMA_PS_IPOIB:
@@ -388,109 +516,78 @@ static int cma_set_qkey(struct rdma_id_private *id_priv)
return ret;
}
-static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num)
+static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
{
- int i;
- int err;
- struct ib_port_attr props;
- union ib_gid tmp;
+ dev_addr->dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr);
+ ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey));
+}
- err = ib_query_port(device, port_num, &props);
- if (err)
- return 1;
+static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+ int ret;
- for (i = 0; i < props.gid_tbl_len; ++i) {
- err = ib_query_gid(device, port_num, i, &tmp);
- if (err)
- return 1;
- if (!memcmp(&tmp, gid, sizeof tmp))
- return 0;
+ if (addr->sa_family != AF_IB) {
+ ret = rdma_translate_ip(addr, dev_addr, NULL);
+ } else {
+ cma_translate_ib((struct sockaddr_ib *) addr, dev_addr);
+ ret = 0;
}
- return -EAGAIN;
+ return ret;
}
-int
-rdma_find_cmid_laddr(struct sockaddr_in *local_addr, unsigned short dev_type,
- void **cm_id)
+static inline int cma_validate_port(struct ib_device *device, u8 port,
+ enum ib_gid_type gid_type,
+ union ib_gid *gid, int dev_type,
+ struct vnet *net,
+ int bound_if_index)
{
- int ret;
- u8 port;
- int found_dev = 0, found_cmid = 0;
- struct rdma_id_private *id_priv;
- struct rdma_id_private *dev_id_priv;
- struct cma_device *cma_dev;
- struct rdma_dev_addr dev_addr;
- union ib_gid gid;
- enum rdma_link_layer dev_ll = dev_type == ARPHRD_INFINIBAND ?
- IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+ int ret = -ENODEV;
+ struct net_device *ndev = NULL;
- memset(&dev_addr, 0, sizeof(dev_addr));
+ if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
+ return ret;
- ret = rdma_translate_ip((struct sockaddr *)local_addr,
- &dev_addr, NULL);
- if (ret)
- goto err;
+ if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
+ return ret;
- /* find rdma device based on MAC address/gid */
- mutex_lock(&lock);
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(net, bound_if_index);
+ if (ndev && ndev->if_flags & IFF_LOOPBACK) {
+ pr_info("detected loopback device\n");
+ dev_put(ndev);
- memcpy(&gid, dev_addr.src_dev_addr +
- rdma_addr_gid_offset(&dev_addr), sizeof(gid));
+ if (!device->get_netdev)
+ return -EOPNOTSUPP;
- list_for_each_entry(cma_dev, &dev_list, list)
- for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port)
- if ((rdma_port_get_link_layer(cma_dev->device, port) ==
- dev_ll) &&
- (rdma_node_get_transport(cma_dev->device->node_type) ==
- RDMA_TRANSPORT_IWARP)) {
- ret = find_gid_port(cma_dev->device,
- &gid, port);
- if (!ret) {
- found_dev = 1;
- goto out;
- } else if (ret == 1) {
- mutex_unlock(&lock);
- goto err;
- }
- }
-out:
- mutex_unlock(&lock);
+ ndev = device->get_netdev(device, port);
+ if (!ndev)
+ return -ENODEV;
+ }
+ } else {
+ gid_type = IB_GID_TYPE_IB;
+ }
- if (!found_dev)
- goto err;
+ ret = ib_find_cached_gid_by_port(device, gid, gid_type, port,
+ ndev, NULL);
- /* Traverse through the list of listening cm_id's to find the
- * desired cm_id based on rdma device & port number.
- */
- list_for_each_entry(id_priv, &listen_any_list, list)
- list_for_each_entry(dev_id_priv, &id_priv->listen_list,
- listen_list)
- if (dev_id_priv->cma_dev == cma_dev)
- if (dev_id_priv->cm_id.iw->local_addr.sin_port
- == local_addr->sin_port) {
- *cm_id = (void *)dev_id_priv->cm_id.iw;
- found_cmid = 1;
- }
- return found_cmid ? 0 : -ENODEV;
+ if (ndev)
+ dev_put(ndev);
-err:
- return -ENODEV;
+ return ret;
}
-EXPORT_SYMBOL(rdma_find_cmid_laddr);
static int cma_acquire_dev(struct rdma_id_private *id_priv,
struct rdma_id_private *listen_id_priv)
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
struct cma_device *cma_dev;
- union ib_gid gid, iboe_gid;
+ union ib_gid gid, iboe_gid, *gidp;
int ret = -ENODEV;
- u8 port, found_port;
- enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
- IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+ u8 port;
- if (dev_ll != IB_LINK_LAYER_INFINIBAND &&
+ if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
id_priv->id.ps == RDMA_PS_IPOIB)
return -EINVAL;
@@ -500,42 +597,46 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
memcpy(&gid, dev_addr->src_dev_addr +
rdma_addr_gid_offset(dev_addr), sizeof gid);
- if (listen_id_priv &&
- rdma_port_get_link_layer(listen_id_priv->id.device,
- listen_id_priv->id.port_num) == dev_ll) {
+
+ if (listen_id_priv) {
cma_dev = listen_id_priv->cma_dev;
port = listen_id_priv->id.port_num;
- if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
- rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
- ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
- &found_port, NULL);
- else
- ret = ib_find_cached_gid(cma_dev->device, &gid,
- &found_port, NULL);
-
- if (!ret && (port == found_port)) {
- id_priv->id.port_num = found_port;
+ gidp = rdma_protocol_roce(cma_dev->device, port) ?
+ &iboe_gid : &gid;
+
+ ret = cma_validate_port(cma_dev->device, port,
+ rdma_protocol_ib(cma_dev->device, port) ?
+ IB_GID_TYPE_IB :
+ listen_id_priv->gid_type, gidp,
+ dev_addr->dev_type,
+ dev_addr->net,
+ dev_addr->bound_dev_if);
+ if (!ret) {
+ id_priv->id.port_num = port;
goto out;
}
}
+
list_for_each_entry(cma_dev, &dev_list, list) {
for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
if (listen_id_priv &&
listen_id_priv->cma_dev == cma_dev &&
listen_id_priv->id.port_num == port)
continue;
- if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
- if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
- rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
- ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
- else
- ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
-
- if (!ret && (port == found_port)) {
- id_priv->id.port_num = port;
- goto out;
- } else if (ret == 1)
- break;
+
+ gidp = rdma_protocol_roce(cma_dev->device, port) ?
+ &iboe_gid : &gid;
+
+ ret = cma_validate_port(cma_dev->device, port,
+ rdma_protocol_ib(cma_dev->device, port) ?
+ IB_GID_TYPE_IB :
+ cma_dev->default_gid_type[port - 1],
+ gidp, dev_addr->dev_type,
+ dev_addr->net,
+ dev_addr->bound_dev_if);
+ if (!ret) {
+ id_priv->id.port_num = port;
+ goto out;
}
}
}
@@ -548,24 +649,70 @@ out:
return ret;
}
-static void cma_deref_id(struct rdma_id_private *id_priv)
+/*
+ * Select the source IB device and address to reach the destination IB address.
+ */
+static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
{
- if (atomic_dec_and_test(&id_priv->refcount))
- complete(&id_priv->comp);
-}
+ struct cma_device *cma_dev, *cur_dev;
+ struct sockaddr_ib *addr;
+ union ib_gid gid, sgid, *dgid;
+ u16 pkey, index;
+ u8 p;
+ int i;
-static int cma_disable_callback(struct rdma_id_private *id_priv,
- enum rdma_cm_state state)
-{
- mutex_lock(&id_priv->handler_mutex);
- if (id_priv->state != state) {
- mutex_unlock(&id_priv->handler_mutex);
- return -EINVAL;
+ cma_dev = NULL;
+ addr = (struct sockaddr_ib *) cma_dst_addr(id_priv);
+ dgid = (union ib_gid *) &addr->sib_addr;
+ pkey = ntohs(addr->sib_pkey);
+
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ if (!rdma_cap_af_ib(cur_dev->device, p))
+ continue;
+
+ if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
+ continue;
+
+ for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i,
+ &gid, NULL);
+ i++) {
+ if (!memcmp(&gid, dgid, sizeof(gid))) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ goto found;
+ }
+
+ if (!cma_dev && (gid.global.subnet_prefix ==
+ dgid->global.subnet_prefix)) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ }
+ }
+ }
}
+
+ if (!cma_dev)
+ return -ENODEV;
+
+found:
+ cma_attach_to_dev(id_priv, cma_dev);
+ addr = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ memcpy(&addr->sib_addr, &sgid, sizeof sgid);
+ cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
return 0;
}
-struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+ if (atomic_dec_and_test(&id_priv->refcount))
+ complete(&id_priv->comp);
+}
+
+struct rdma_cm_id *rdma_create_id(struct vnet *net,
+ rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps,
enum ib_qp_type qp_type)
{
@@ -575,14 +722,13 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
if (!id_priv)
return ERR_PTR(-ENOMEM);
- id_priv->owner = curthread->td_proc->p_pid;
+ id_priv->owner = task_pid_nr(current);
id_priv->state = RDMA_CM_IDLE;
id_priv->id.context = context;
id_priv->id.event_handler = event_handler;
id_priv->id.ps = ps;
id_priv->id.qp_type = qp_type;
spin_lock_init(&id_priv->lock);
- spin_lock_init(&id_priv->cm_lock);
mutex_init(&id_priv->qp_mutex);
init_completion(&id_priv->comp);
atomic_set(&id_priv->refcount, 1);
@@ -590,6 +736,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
INIT_LIST_HEAD(&id_priv->listen_list);
INIT_LIST_HEAD(&id_priv->mc_list);
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+ id_priv->id.route.addr.dev_addr.net = TD_TO_VNET(curthread);
return &id_priv->id;
}
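rdma_create_id() now takes the owning vnet as its first argument and records it in route.addr.dev_addr.net. An in-kernel FreeBSD caller would pass the current thread's vnet, roughly as below (my_cma_handler and my_ctx are illustrative consumer-side names):

	struct rdma_cm_id *id;

	id = rdma_create_id(TD_TO_VNET(curthread), my_cma_handler, my_ctx,
			    RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id))
		return PTR_ERR(id);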
@@ -645,6 +792,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
if (id->device != pd->device)
return -EINVAL;
+ qp_init_attr->port_num = id->port_num;
qp = ib_create_qp(pd, qp_init_attr);
if (IS_ERR(qp))
return PTR_ERR(qp);
@@ -705,23 +853,13 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
if (ret)
goto out;
+
ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num,
- qp_attr.ah_attr.grh.sgid_index, &sgid);
+ qp_attr.ah_attr.grh.sgid_index, &sgid, NULL);
if (ret)
goto out;
- if (rdma_node_get_transport(id_priv->cma_dev->device->node_type)
- == RDMA_TRANSPORT_IB &&
- rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)
- == IB_LINK_LAYER_ETHERNET) {
- u32 scope_id = rdma_get_ipv6_scope_id(id_priv->id.device,
- id_priv->id.port_num);
-
- ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL,
- scope_id);
- if (ret)
- goto out;
- }
+ BUG_ON(id_priv->cma_dev->device != id_priv->id.device);
if (conn_param)
qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
@@ -750,12 +888,6 @@ static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
if (conn_param)
qp_attr.max_rd_atomic = conn_param->initiator_depth;
-
- if (id_priv->qp_timeout && id_priv->id.qp->qp_type == IB_QPT_RC) {
- qp_attr.timeout = id_priv->qp_timeout;
- qp_attr_mask |= IB_QP_TIMEOUT;
- }
-
ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
out:
mutex_unlock(&id_priv->qp_mutex);
@@ -787,11 +919,10 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
int ret;
u16 pkey;
- if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) ==
- IB_LINK_LAYER_INFINIBAND)
- pkey = ib_addr_get_pkey(dev_addr);
- else
+ if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num))
pkey = 0xffff;
+ else
+ pkey = ib_addr_get_pkey(dev_addr);
ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
pkey, &qp_attr->pkey_index);
@@ -802,7 +933,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
if (id_priv->id.qp_type == IB_QPT_UD) {
- ret = cma_set_qkey(id_priv);
+ ret = cma_set_qkey(id_priv, 0);
if (ret)
return ret;
@@ -822,29 +953,24 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
int ret = 0;
id_priv = container_of(id, struct rdma_id_private, id);
- switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
else
ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
qp_attr_mask);
+
if (qp_attr->qp_state == IB_QPS_RTR)
qp_attr->rq_psn = id_priv->seq_num;
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
if (!id_priv->cm_id.iw) {
qp_attr->qp_access_flags = 0;
*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
} else
ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
qp_attr_mask);
- break;
- default:
+ } else
ret = -ENOSYS;
- break;
- }
return ret;
}
@@ -852,38 +978,36 @@ EXPORT_SYMBOL(rdma_init_qp_attr);
static inline int cma_zero_addr(struct sockaddr *addr)
{
- struct in6_addr *ip6;
-
- if (addr->sa_family == AF_INET)
- return ipv4_is_zeronet(
- ((struct sockaddr_in *)addr)->sin_addr.s_addr);
- else {
- ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
- return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
- ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0;
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr);
+ default:
+ return 0;
}
}
static inline int cma_loopback_addr(struct sockaddr *addr)
{
- if (addr->sa_family == AF_INET)
- return ipv4_is_loopback(
- ((struct sockaddr_in *) addr)->sin_addr.s_addr);
- else
- return ipv6_addr_loopback(
- &((struct sockaddr_in6 *) addr)->sin6_addr);
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr);
+ default:
+ return 0;
+ }
}
static inline int cma_any_addr(struct sockaddr *addr)
{
return cma_zero_addr(addr) || cma_loopback_addr(addr);
}
-int
-rdma_cma_any_addr(struct sockaddr *addr)
-{
- return cma_any_addr(addr);
-}
-EXPORT_SYMBOL(rdma_cma_any_addr);
static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
{
@@ -894,18 +1018,31 @@ static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
case AF_INET:
return ((struct sockaddr_in *) src)->sin_addr.s_addr !=
((struct sockaddr_in *) dst)->sin_addr.s_addr;
- default:
+ case AF_INET6:
return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr,
&((struct sockaddr_in6 *) dst)->sin6_addr);
+ default:
+ return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
+ &((struct sockaddr_ib *) dst)->sib_addr);
}
}
-static inline __be16 cma_port(struct sockaddr *addr)
+static __be16 cma_port(struct sockaddr *addr)
{
- if (addr->sa_family == AF_INET)
+ struct sockaddr_ib *sib;
+
+ switch (addr->sa_family) {
+ case AF_INET:
return ((struct sockaddr_in *) addr)->sin_port;
- else
+ case AF_INET6:
return ((struct sockaddr_in6 *) addr)->sin6_port;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ return htons((u16) (be64_to_cpu(sib->sib_sid) &
+ be64_to_cpu(sib->sib_sid_mask)));
+ default:
+ return 0;
+ }
}
static inline int cma_any_port(struct sockaddr *addr)
@@ -913,100 +1050,467 @@ static inline int cma_any_port(struct sockaddr *addr)
return !cma_port(addr);
}
-static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
- u8 *ip_ver, __be16 *port,
- union cma_ip_addr **src, union cma_ip_addr **dst)
+static void cma_save_ib_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
+ struct ib_sa_path_rec *path)
{
- switch (ps) {
- case RDMA_PS_SDP:
- if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) !=
- SDP_MAJ_VERSION)
- return -EINVAL;
+ struct sockaddr_ib *listen_ib, *ib;
+
+ listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
+ if (src_addr) {
+ ib = (struct sockaddr_ib *)src_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ ib->sib_sid = path->service_id;
+ ib->sib_scope_id = 0;
+ } else {
+ ib->sib_pkey = listen_ib->sib_pkey;
+ ib->sib_flowinfo = listen_ib->sib_flowinfo;
+ ib->sib_addr = listen_ib->sib_addr;
+ ib->sib_sid = listen_ib->sib_sid;
+ ib->sib_scope_id = listen_ib->sib_scope_id;
+ }
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ }
+ if (dst_addr) {
+ ib = (struct sockaddr_ib *)dst_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
+ }
+}
- *ip_ver = sdp_get_ip_ver(hdr);
- *port = ((struct sdp_hh *) hdr)->port;
- *src = &((struct sdp_hh *) hdr)->src_addr;
- *dst = &((struct sdp_hh *) hdr)->dst_addr;
- break;
- default:
- if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION)
- return -EINVAL;
+static void cma_save_ip4_info(struct sockaddr_in *src_addr,
+ struct sockaddr_in *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
+{
+ if (src_addr) {
+ *src_addr = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = hdr->dst_addr.ip4.addr,
+ .sin_port = local_port,
+ };
+ }
- *ip_ver = cma_get_ip_ver(hdr);
- *port = ((struct cma_hdr *) hdr)->port;
- *src = &((struct cma_hdr *) hdr)->src_addr;
- *dst = &((struct cma_hdr *) hdr)->dst_addr;
- break;
+ if (dst_addr) {
+ *dst_addr = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = hdr->src_addr.ip4.addr,
+ .sin_port = hdr->port,
+ };
}
+}
- if (*ip_ver != 4 && *ip_ver != 6)
- return -EINVAL;
- return 0;
+static void cma_save_ip6_info(struct sockaddr_in6 *src_addr,
+ struct sockaddr_in6 *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
+{
+ if (src_addr) {
+ *src_addr = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = hdr->dst_addr.ip6,
+ .sin6_port = local_port,
+ };
+ }
+
+ if (dst_addr) {
+ *dst_addr = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = hdr->src_addr.ip6,
+ .sin6_port = hdr->port,
+ };
+ }
}
-static void cma_save_net_info(struct rdma_addr *addr,
- struct rdma_addr *listen_addr,
- u8 ip_ver, __be16 port,
- union cma_ip_addr *src, union cma_ip_addr *dst)
+static u16 cma_port_from_service_id(__be64 service_id)
{
- struct sockaddr_in *listen4, *ip4;
- struct sockaddr_in6 *listen6, *ip6;
+ return (u16)be64_to_cpu(service_id);
+}
+
+static int cma_save_ip_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct ib_cm_event *ib_event,
+ __be64 service_id)
+{
+ struct cma_hdr *hdr;
+ __be16 port;
+
+ hdr = ib_event->private_data;
+ if (hdr->cma_version != CMA_VERSION)
+ return -EINVAL;
+
+ port = htons(cma_port_from_service_id(service_id));
- switch (ip_ver) {
+ switch (cma_get_ip_ver(hdr)) {
case 4:
- listen4 = (struct sockaddr_in *) &listen_addr->src_addr;
- ip4 = (struct sockaddr_in *) &addr->src_addr;
- ip4->sin_family = listen4->sin_family;
- ip4->sin_addr.s_addr = dst->ip4.addr;
- ip4->sin_port = listen4->sin_port;
- ip4->sin_len = sizeof(struct sockaddr_in);
-
- ip4 = (struct sockaddr_in *) &addr->dst_addr;
- ip4->sin_family = listen4->sin_family;
- ip4->sin_addr.s_addr = src->ip4.addr;
- ip4->sin_port = port;
- ip4->sin_len = sizeof(struct sockaddr_in);
+ cma_save_ip4_info((struct sockaddr_in *)src_addr,
+ (struct sockaddr_in *)dst_addr, hdr, port);
break;
case 6:
- listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr;
- ip6 = (struct sockaddr_in6 *) &addr->src_addr;
- ip6->sin6_family = listen6->sin6_family;
- ip6->sin6_addr = dst->ip6;
- ip6->sin6_port = listen6->sin6_port;
- ip6->sin6_len = sizeof(struct sockaddr_in6);
- ip6->sin6_scope_id = listen6->sin6_scope_id;
-
- ip6 = (struct sockaddr_in6 *) &addr->dst_addr;
- ip6->sin6_family = listen6->sin6_family;
- ip6->sin6_addr = src->ip6;
- ip6->sin6_port = port;
- ip6->sin6_len = sizeof(struct sockaddr_in6);
- ip6->sin6_scope_id = listen6->sin6_scope_id;
+ cma_save_ip6_info((struct sockaddr_in6 *)src_addr,
+ (struct sockaddr_in6 *)dst_addr, hdr, port);
break;
default:
- break;
+ return -EAFNOSUPPORT;
}
+
+ return 0;
}
-static inline int cma_user_data_offset(enum rdma_port_space ps)
+static int cma_save_net_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event,
+ sa_family_t sa_family, __be64 service_id)
{
- switch (ps) {
- case RDMA_PS_SDP:
+ if (sa_family == AF_IB) {
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id,
+ ib_event->param.req_rcvd.primary_path);
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
return 0;
+ }
+
+ return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ const struct ib_cm_req_event_param *req_param =
+ &ib_event->param.req_rcvd;
+ const struct ib_cm_sidr_req_event_param *sidr_param =
+ &ib_event->param.sidr_req_rcvd;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_RECEIVED:
+ req->device = req_param->listen_id->device;
+ req->port = req_param->port;
+ memcpy(&req->local_gid, &req_param->primary_path->sgid,
+ sizeof(req->local_gid));
+ req->has_gid = true;
+ req->service_id = req_param->primary_path->service_id;
+ req->pkey = be16_to_cpu(req_param->primary_path->pkey);
+ if (req->pkey != req_param->bth_pkey)
+ pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n"
+ "RDMA CMA: in the future this may cause the request to be dropped\n",
+ req_param->bth_pkey, req->pkey);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ req->device = sidr_param->listen_id->device;
+ req->port = sidr_param->port;
+ req->has_gid = false;
+ req->service_id = sidr_param->service_id;
+ req->pkey = sidr_param->pkey;
+ if (req->pkey != sidr_param->bth_pkey)
+ pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n"
+ "RDMA CMA: in the future this may cause the request to be dropped\n",
+ sidr_param->bth_pkey, req->pkey);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in *dst_addr,
+ const struct sockaddr_in *src_addr)
+{
+#ifdef INET
+ struct sockaddr_in dst_tmp = *dst_addr;
+ __be32 daddr = dst_addr->sin_addr.s_addr,
+ saddr = src_addr->sin_addr.s_addr;
+ struct net_device *src_dev;
+ struct rtentry *rte;
+ bool ret;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+ ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+ ipv4_is_loopback(saddr))
+ return false;
+
+ src_dev = ip_dev_find(net_dev->if_vnet, saddr);
+ if (src_dev != net_dev)
+ return false;
+
+ /*
+ * Make sure the socket address length field
+ * is set, else rtalloc1() will fail.
+ */
+ dst_tmp.sin_len = sizeof(dst_tmp);
+
+ CURVNET_SET(net_dev->if_vnet);
+ rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0);
+ CURVNET_RESTORE();
+ if (rte != NULL) {
+ ret = (rte->rt_ifp == net_dev);
+ RTFREE_LOCKED(rte);
+ } else {
+ ret = false;
+ }
+ return ret;
+#else
+ return false;
+#endif
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in6 *dst_addr,
+ const struct sockaddr_in6 *src_addr)
+{
+#ifdef INET6
+ struct sockaddr_in6 dst_tmp = *dst_addr;
+ struct in6_addr in6_addr = src_addr->sin6_addr;
+ struct net_device *src_dev;
+ struct rtentry *rte;
+ bool ret;
+
+ /* embed scope ID */
+ in6_addr.s6_addr[3] = src_addr->sin6_scope_id;
+
+ src_dev = ip6_dev_find(net_dev->if_vnet, in6_addr);
+ if (src_dev != net_dev)
+ return false;
+
+ /*
+ * Make sure the socket address length field
+ * is set, else rtalloc1() will fail.
+ */
+ dst_tmp.sin6_len = sizeof(dst_tmp);
+
+ CURVNET_SET(net_dev->if_vnet);
+ rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0);
+ CURVNET_RESTORE();
+ if (rte != NULL) {
+ ret = (rte->rt_ifp == net_dev);
+ RTFREE_LOCKED(rte);
+ } else {
+ ret = false;
+ }
+ return ret;
+#else
+ return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+ const struct sockaddr *daddr,
+ const struct sockaddr *saddr)
+{
+ const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+ const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+ const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+ const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+ switch (daddr->sa_family) {
+ case AF_INET:
+ return saddr->sa_family == AF_INET &&
+ validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+ case AF_INET6:
+ return saddr->sa_family == AF_INET6 &&
+ validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+ default:
+ return false;
+ }
+}
+
+static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
+ const struct cma_req_info *req)
+{
+ struct sockaddr_storage listen_addr_storage, src_addr_storage;
+ struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
+ *src_addr = (struct sockaddr *)&src_addr_storage;
+ struct net_device *net_dev;
+ const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+ int err;
+
+ err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+ req->service_id);
+ if (err)
+ return ERR_PTR(err);
+
+ net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
+ gid, listen_addr);
+ if (!net_dev)
+ return ERR_PTR(-ENODEV);
+
+ if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+ dev_put(net_dev);
+ return ERR_PTR(-EHOSTUNREACH);
+ }
+
+ return net_dev;
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+ return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
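The two decoders above imply the layout of an IP-based service ID: the port space occupies bits 16-31 and the TCP/UDP port the low 16 bits. For illustration only, a matching encoder would look like this (the patch's own encoder lives elsewhere in this file):

	/* Sketch: build the service ID that the decoders above take apart. */
	static __be64 example_service_id(enum rdma_port_space ps, __be16 port)
	{
		return cpu_to_be64(((u64) ps << 16) + be16_to_cpu(port));
	}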
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+ const struct cma_hdr *hdr)
+{
+ struct sockaddr *addr = cma_src_addr(id_priv);
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ if (cma_any_addr(addr) && !id_priv->afonly)
+ return true;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+ if (cma_get_ip_ver(hdr) != 4)
+ return false;
+ if (!cma_any_addr(addr) &&
+ hdr->dst_addr.ip4.addr != ip4_addr)
+ return false;
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+ if (cma_get_ip_ver(hdr) != 6)
+ return false;
+ if (!cma_any_addr(addr) &&
+ memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+ return false;
+ break;
+ case AF_IB:
+ return true;
default:
- return sizeof(struct cma_hdr);
+ return false;
+ }
+
+ return true;
+}
+
+static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num)
+{
+ enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num);
+ enum rdma_transport_type transport =
+ rdma_node_get_transport(device->node_type);
+
+ return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB;
+}
+
+static bool cma_protocol_roce(const struct rdma_cm_id *id)
+{
+ struct ib_device *device = id->device;
+ const int port_num = id->port_num ?: rdma_start_port(device);
+
+ return cma_protocol_roce_dev_port(device, port_num);
+}
+
+static bool cma_match_net_dev(const struct rdma_cm_id *id,
+ const struct net_device *net_dev,
+ u8 port_num)
+{
+ const struct rdma_addr *addr = &id->route.addr;
+
+ if (!net_dev)
+ /* This request is an AF_IB request or a RoCE request */
+ return (!id->port_num || id->port_num == port_num) &&
+ (addr->src_addr.ss_family == AF_IB ||
+ cma_protocol_roce_dev_port(id->device, port_num));
+
+ return !addr->dev_addr.bound_dev_if ||
+ (net_eq(dev_net(net_dev), addr->dev_addr.net) &&
+ addr->dev_addr.bound_dev_if == net_dev->if_index);
+}
+
+static struct rdma_id_private *cma_find_listener(
+ const struct rdma_bind_list *bind_list,
+ const struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ const struct cma_req_info *req,
+ const struct net_device *net_dev)
+{
+ struct rdma_id_private *id_priv, *id_priv_dev;
+
+ if (!bind_list)
+ return ERR_PTR(-EINVAL);
+
+ hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+ if (cma_match_private_data(id_priv, ib_event->private_data)) {
+ if (id_priv->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv->id, net_dev, req->port))
+ return id_priv;
+ list_for_each_entry(id_priv_dev,
+ &id_priv->listen_list,
+ listen_list) {
+ if (id_priv_dev->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv_dev->id, net_dev, req->port))
+ return id_priv_dev;
+ }
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event,
+ struct net_device **net_dev)
+{
+ struct cma_req_info req;
+ struct rdma_bind_list *bind_list;
+ struct rdma_id_private *id_priv;
+ int err;
+
+ err = cma_save_req_info(ib_event, &req);
+ if (err)
+ return ERR_PTR(err);
+
+ *net_dev = cma_get_net_dev(ib_event, &req);
+ if (IS_ERR(*net_dev)) {
+ if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+ /* Assuming the protocol is AF_IB */
+ *net_dev = NULL;
+ } else if (cma_protocol_roce_dev_port(req.device, req.port)) {
+ /* TODO find the net dev matching the request parameters
+ * through the RoCE GID table */
+ *net_dev = NULL;
+ } else {
+ return ERR_CAST(*net_dev);
+ }
+ }
+
+ bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net,
+ rdma_ps_from_service_id(req.service_id),
+ cma_port_from_service_id(req.service_id));
+ id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+ if (IS_ERR(id_priv) && *net_dev) {
+ dev_put(*net_dev);
+ *net_dev = NULL;
}
+
+ return id_priv;
+}
+
+static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
+{
+ return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
}
static void cma_cancel_route(struct rdma_id_private *id_priv)
{
- switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
+ if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) {
if (id_priv->query)
ib_sa_cancel_query(id_priv->query_id, id_priv->query);
- break;
- default:
- break;
}
}
@@ -1046,8 +1550,7 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv,
cma_cancel_route(id_priv);
break;
case RDMA_CM_LISTEN:
- if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)
- && !id_priv->cma_dev)
+ if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
cma_cancel_listens(id_priv);
break;
default:
@@ -1057,18 +1560,16 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv,
static void cma_release_port(struct rdma_id_private *id_priv)
{
- struct rdma_bind_list *bind_list;
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+ struct vnet *net = id_priv->id.route.addr.dev_addr.net;
- mutex_lock(&lock);
- bind_list = id_priv->bind_list;
- if (!bind_list) {
- mutex_unlock(&lock);
+ if (!bind_list)
return;
- }
+
+ mutex_lock(&lock);
hlist_del(&id_priv->node);
- id_priv->bind_list = NULL;
if (hlist_empty(&bind_list->owners)) {
- idr_remove(bind_list->ps, bind_list->port);
+ cma_ps_remove(net, bind_list->ps, bind_list->port);
kfree(bind_list);
}
mutex_unlock(&lock);
@@ -1082,39 +1583,32 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
mc = container_of(id_priv->mc_list.next,
struct cma_multicast, list);
list_del(&mc->list);
- switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
+ if (rdma_cap_ib_mcast(id_priv->cma_dev->device,
+ id_priv->id.port_num)) {
ib_sa_free_multicast(mc->multicast.ib);
kfree(mc);
- break;
- case IB_LINK_LAYER_ETHERNET:
+ } else {
+ if (mc->igmp_joined) {
+ struct rdma_dev_addr *dev_addr =
+ &id_priv->id.route.addr.dev_addr;
+ struct net_device *ndev = NULL;
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(dev_addr->net,
+ dev_addr->bound_dev_if);
+ if (ndev) {
+ dev_put(ndev);
+ }
+ }
kref_put(&mc->mcref, release_mc);
- break;
- default:
- break;
}
}
}
-static void __rdma_free(struct work_struct *work)
-{
- struct rdma_id_private *id_priv;
- id_priv = container_of(work, struct rdma_id_private, work);
-
- wait_for_completion(&id_priv->comp);
-
- if (id_priv->internal_id)
- cma_deref_id(id_priv->id.context);
-
- kfree(id_priv->id.route.path_rec);
- kfree(id_priv);
-}
void rdma_destroy_id(struct rdma_cm_id *id)
{
struct rdma_id_private *id_priv;
enum rdma_cm_state state;
- unsigned long flags;
- struct ib_cm_id *ib;
id_priv = container_of(id, struct rdma_id_private, id);
state = cma_exch(id_priv, RDMA_CM_DESTROYING);
@@ -1128,24 +1622,12 @@ void rdma_destroy_id(struct rdma_cm_id *id)
mutex_unlock(&id_priv->handler_mutex);
if (id_priv->cma_dev) {
- switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
- case RDMA_TRANSPORT_IB:
- spin_lock_irqsave(&id_priv->cm_lock, flags);
- if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) {
- ib = id_priv->cm_id.ib;
- id_priv->cm_id.ib = NULL;
- spin_unlock_irqrestore(&id_priv->cm_lock, flags);
- ib_destroy_cm_id(ib);
- } else
- spin_unlock_irqrestore(&id_priv->cm_lock, flags);
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
+ if (id_priv->cm_id.ib)
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) {
if (id_priv->cm_id.iw)
iw_destroy_cm_id(id_priv->cm_id.iw);
- break;
- default:
- break;
}
cma_leave_mc_groups(id_priv);
cma_release_dev(id_priv);
@@ -1153,8 +1635,13 @@ void rdma_destroy_id(struct rdma_cm_id *id)
cma_release_port(id_priv);
cma_deref_id(id_priv);
- INIT_WORK(&id_priv->work, __rdma_free);
- queue_work(cma_free_wq, &id_priv->work);
+ wait_for_completion(&id_priv->comp);
+
+ if (id_priv->internal_id)
+ cma_deref_id(id_priv->id.context);
+
+ kfree(id_priv->id.route.path_rec);
+ kfree(id_priv);
}
EXPORT_SYMBOL(rdma_destroy_id);
@@ -1170,7 +1657,6 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
if (ret)
goto reject;
- cma_dbg(id_priv, "sending RTU\n");
ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
if (ret)
goto reject;
@@ -1178,22 +1664,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
return 0;
reject:
cma_modify_qp_err(id_priv);
- cma_dbg(id_priv, "sending REJ\n");
ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
NULL, 0, NULL, 0);
return ret;
}
-static int cma_verify_rep(struct rdma_id_private *id_priv, void *data)
-{
- if (id_priv->id.ps == RDMA_PS_SDP &&
- sdp_get_majv(((struct sdp_hah *) data)->sdp_version) !=
- SDP_MAJ_VERSION)
- return -EINVAL;
-
- return 0;
-}
-
static void cma_set_rep_event_data(struct rdma_cm_event *event,
struct ib_cm_rep_event_param *rep_data,
void *private_data)
@@ -1214,11 +1689,13 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
struct rdma_cm_event event;
int ret = 0;
+ mutex_lock(&id_priv->handler_mutex);
if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
- cma_disable_callback(id_priv, RDMA_CM_CONNECT)) ||
+ id_priv->state != RDMA_CM_CONNECT) ||
(ib_event->event == IB_CM_TIMEWAIT_EXIT &&
- cma_disable_callback(id_priv, RDMA_CM_DISCONNECT)))
- return 0;
+ id_priv->state != RDMA_CM_DISCONNECT))
+ goto out;
+
memset(&event, 0, sizeof event);
switch (ib_event->event) {
case IB_CM_REQ_ERROR:
@@ -1227,15 +1704,13 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
event.status = -ETIMEDOUT;
break;
case IB_CM_REP_RECEIVED:
- event.status = cma_verify_rep(id_priv, ib_event->private_data);
- if (event.status)
- event.event = RDMA_CM_EVENT_CONNECT_ERROR;
- else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) {
+ if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
RDMA_CM_EVENT_ESTABLISHED;
- } else
+ } else {
event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+ }
cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
ib_event->private_data);
break;
@@ -1266,7 +1741,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
break;
default:
- printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+ pr_err("RDMA CMA: unexpected IB CM event: %d\n",
ib_event->event);
goto out;
}
@@ -1286,27 +1761,28 @@ out:
}
static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
struct rdma_route *rt;
- union cma_ip_addr *src, *dst;
- __be16 port;
- u8 ip_ver;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ const __be64 service_id =
+ ib_event->param.req_rcvd.primary_path->service_id;
int ret;
- if (cma_get_net_info(ib_event->private_data, listen_id->ps,
- &ip_ver, &port, &src, &dst))
- return NULL;
-
- id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ id = rdma_create_id(listen_id->route.addr.dev_addr.net,
+ listen_id->event_handler, listen_id->context,
listen_id->ps, ib_event->param.req_rcvd.qp_type);
if (IS_ERR(id))
return NULL;
- cma_save_net_info(&id->route.addr, &listen_id->route.addr,
- ip_ver, port, src, dst);
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family, service_id))
+ goto err;
rt = &id->route;
rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
@@ -1319,19 +1795,24 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
if (rt->num_paths == 2)
rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
- if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) {
- rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
- rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
- ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
- } else {
- ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr,
- &rt->addr.dev_addr, NULL);
+ if (net_dev) {
+ ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ if (!cma_protocol_roce(listen_id) &&
+ cma_any_addr(cma_src_addr(id_priv))) {
+ rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+ ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+ } else if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (ret)
+ goto err;
+ }
}
rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
- id_priv = container_of(id, struct rdma_id_private, id);
id_priv->state = RDMA_CM_CONNECT;
return id_priv;
@@ -1341,36 +1822,40 @@ err:
}
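
Both cma_new_conn_id() above and cma_new_udp_id() below now thread the listener's vnet into rdma_create_id(), whose signature gained a leading network-namespace argument. A hedged consumer-side sketch follows; TD_TO_VNET(), my_event_handler and my_context are illustrative assumptions, not part of this patch:

/* Hypothetical kernel consumer creating a CM ID under the new API. */
static int
example_create_cm_id(rdma_cm_event_handler my_event_handler, void *my_context)
{
	struct rdma_cm_id *id;

	/* The creating thread's vnet is now an explicit first argument. */
	id = rdma_create_id(TD_TO_VNET(curthread), my_event_handler,
	    my_context, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id))
		return (PTR_ERR(id));

	rdma_destroy_id(id);
	return (0);
}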
static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
- union cma_ip_addr *src, *dst;
- __be16 port;
- u8 ip_ver;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ struct vnet *net = listen_id->route.addr.dev_addr.net;
int ret;
- id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ id = rdma_create_id(net, listen_id->event_handler, listen_id->context,
listen_id->ps, IB_QPT_UD);
if (IS_ERR(id))
return NULL;
-
- if (cma_get_net_info(ib_event->private_data, listen_id->ps,
- &ip_ver, &port, &src, &dst))
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family,
+ ib_event->param.sidr_req_rcvd.service_id))
goto err;
- cma_save_net_info(&id->route.addr, &listen_id->route.addr,
- ip_ver, port, src, dst);
-
- if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
- ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr,
- &id->route.addr.dev_addr, NULL);
+ if (net_dev) {
+ ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv),
+ &id->route.addr.dev_addr);
+ if (ret)
+ goto err;
+ }
}
- id_priv = container_of(id, struct rdma_id_private, id);
id_priv->state = RDMA_CM_CONNECT;
return id_priv;
err:
@@ -1382,7 +1867,7 @@ static void cma_set_req_event_data(struct rdma_cm_event *event,
struct ib_cm_req_event_param *req_data,
void *private_data, int offset)
{
- event->param.conn.private_data = private_data + offset;
+ event->param.conn.private_data = (char *)private_data + offset;
event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
event->param.conn.responder_resources = req_data->responder_resources;
event->param.conn.initiator_depth = req_data->initiator_depth;
@@ -1404,38 +1889,36 @@ static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_e
static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
- struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_id_private *listen_id, *conn_id = NULL;
struct rdma_cm_event event;
+ struct net_device *net_dev;
int offset, ret;
- u8 smac[ETH_ALEN];
- u8 alt_smac[ETH_ALEN];
- u8 *psmac = smac;
- u8 *palt_smac = alt_smac;
- int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) ==
- RDMA_TRANSPORT_IB) &&
- (rdma_port_get_link_layer(cm_id->device,
- ib_event->param.req_rcvd.port) ==
- IB_LINK_LAYER_ETHERNET));
- int is_sidr = 0;
- listen_id = cm_id->context;
- if (!cma_check_req_qp_type(&listen_id->id, ib_event))
- return -EINVAL;
+ listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
+ if (IS_ERR(listen_id))
+ return PTR_ERR(listen_id);
- if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
- return -ECONNABORTED;
+ if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
+ ret = -EINVAL;
+ goto net_dev_put;
+ }
+
+ mutex_lock(&listen_id->handler_mutex);
+ if (listen_id->state != RDMA_CM_LISTEN) {
+ ret = -ECONNABORTED;
+ goto err1;
+ }
memset(&event, 0, sizeof event);
- offset = cma_user_data_offset(listen_id->id.ps);
+ offset = cma_user_data_offset(listen_id);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
- is_sidr = 1;
- conn_id = cma_new_udp_id(&listen_id->id, ib_event);
- event.param.ud.private_data = ib_event->private_data + offset;
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
+ event.param.ud.private_data = (char *)ib_event->private_data + offset;
event.param.ud.private_data_len =
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
} else {
- conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
ib_event->private_data, offset);
}
@@ -1461,39 +1944,20 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
ret = conn_id->id.event_handler(&conn_id->id, &event);
if (ret)
goto err3;
-
- if (is_iboe && !is_sidr) {
- u32 scope_id = rdma_get_ipv6_scope_id(cm_id->device,
- ib_event->param.req_rcvd.port);
-
- if (ib_event->param.req_rcvd.primary_path != NULL)
- rdma_addr_find_smac_by_sgid(
- &ib_event->param.req_rcvd.primary_path->sgid,
- psmac, NULL, scope_id);
- else
- psmac = NULL;
- if (ib_event->param.req_rcvd.alternate_path != NULL)
- rdma_addr_find_smac_by_sgid(
- &ib_event->param.req_rcvd.alternate_path->sgid,
- palt_smac, NULL, scope_id);
- else
- palt_smac = NULL;
- }
- /*
- * Acquire mutex to prevent user executing rdma_destroy_id()
- * while we're accessing the cm_id.
- */
- mutex_lock(&lock);
- if (is_iboe && !is_sidr)
- ib_update_cm_av(cm_id, psmac, palt_smac);
- if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) {
- cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n");
- ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
- }
- mutex_unlock(&lock);
- mutex_unlock(&conn_id->handler_mutex);
+ /*
+ * Acquire mutex to prevent user executing rdma_destroy_id()
+ * while we're accessing the cm_id.
+ */
+ mutex_lock(&lock);
+ if (cma_comp(conn_id, RDMA_CM_CONNECT) &&
+ (conn_id->id.qp_type != IB_QPT_UD))
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ mutex_unlock(&lock);
+ mutex_unlock(&conn_id->handler_mutex);
mutex_unlock(&listen_id->handler_mutex);
cma_deref_id(conn_id);
+ if (net_dev)
+ dev_put(net_dev);
return 0;
err3:
@@ -1507,81 +1971,34 @@ err1:
mutex_unlock(&listen_id->handler_mutex);
if (conn_id)
rdma_destroy_id(&conn_id->id);
- return ret;
-}
-static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr)
-{
- return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr)));
+net_dev_put:
+ if (net_dev)
+ dev_put(net_dev);
+
+ return ret;
}
-static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
- struct ib_cm_compare_data *compare)
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
{
- struct cma_hdr *cma_data, *cma_mask;
- struct sdp_hh *sdp_data, *sdp_mask;
- __be32 ip4_addr;
- struct in6_addr ip6_addr;
+ if (addr->sa_family == AF_IB)
+ return ((struct sockaddr_ib *) addr)->sib_sid;
- memset(compare, 0, sizeof *compare);
- cma_data = (void *) compare->data;
- cma_mask = (void *) compare->mask;
- sdp_data = (void *) compare->data;
- sdp_mask = (void *) compare->mask;
-
- switch (addr->sa_family) {
- case AF_INET:
- ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
- if (ps == RDMA_PS_SDP) {
- sdp_set_ip_ver(sdp_data, 4);
- sdp_set_ip_ver(sdp_mask, 0xF);
- if (!cma_any_addr(addr)) {
- sdp_data->dst_addr.ip4.addr = ip4_addr;
- sdp_mask->dst_addr.ip4.addr = htonl(~0);
- }
- } else {
- cma_set_ip_ver(cma_data, 4);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip4.addr = ip4_addr;
- cma_mask->dst_addr.ip4.addr = htonl(~0);
- }
- }
- break;
- case AF_INET6:
- ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
- if (ps == RDMA_PS_SDP) {
- sdp_set_ip_ver(sdp_data, 6);
- sdp_set_ip_ver(sdp_mask, 0xF);
- if (!cma_any_addr(addr)) {
- sdp_data->dst_addr.ip6 = ip6_addr;
- memset(&sdp_mask->dst_addr.ip6, 0xFF,
- sizeof(sdp_mask->dst_addr.ip6));
- }
- } else {
- cma_set_ip_ver(cma_data, 6);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip6 = ip6_addr;
- memset(&cma_mask->dst_addr.ip6, 0xFF,
- sizeof(cma_mask->dst_addr.ip6));
- }
- }
- break;
- default:
- break;
- }
+ return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr)));
}
+EXPORT_SYMBOL(rdma_get_service_id);
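
rdma_get_service_id() supersedes the private cma_get_service_id() and additionally understands AF_IB, where the service ID is carried verbatim in sib_sid. A small worked example of the non-AF_IB branch; the RDMA_PS_TCP value (0x0106) is the standard rdma_cm constant and should be checked against the installed header:

static __be64
example_tcp_service_id(void)
{
	/*
	 * sid = (ps << 16) + port.  With RDMA_PS_TCP == 0x0106 and
	 * TCP port 18515 (0x4853) this yields 0x0000000001064853,
	 * stored big-endian by cpu_to_be64().
	 */
	return (cpu_to_be64(((u64)RDMA_PS_TCP << 16) + 18515));
}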
static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
{
struct rdma_id_private *id_priv = iw_id->context;
struct rdma_cm_event event;
- struct sockaddr_in *sin;
int ret = 0;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
- if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
- return 0;
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_CONNECT)
+ goto out;
memset(&event, 0, sizeof event);
switch (iw_event->event) {
@@ -1589,11 +2006,11 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
event.event = RDMA_CM_EVENT_DISCONNECTED;
break;
case IW_CM_EVENT_CONNECT_REPLY:
- sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
- *sin = iw_event->local_addr;
- sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
- *sin = iw_event->remote_addr;
- switch ((int)iw_event->status) {
+ memcpy(cma_src_addr(id_priv), laddr,
+ rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(id_priv), raddr,
+ rdma_addr_size(raddr));
+ switch (iw_event->status) {
case 0:
event.event = RDMA_CM_EVENT_ESTABLISHED;
event.param.conn.initiator_depth = iw_event->ird;
@@ -1633,6 +2050,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
return ret;
}
+out:
mutex_unlock(&id_priv->handler_mutex);
return ret;
}
@@ -1642,18 +2060,20 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
{
struct rdma_cm_id *new_cm_id;
struct rdma_id_private *listen_id, *conn_id;
- struct sockaddr_in *sin;
- struct net_device *dev = NULL;
struct rdma_cm_event event;
- int ret;
- struct ib_device_attr attr;
+ int ret = -ECONNABORTED;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
listen_id = cm_id->context;
- if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
- return -ECONNABORTED;
+
+ mutex_lock(&listen_id->handler_mutex);
+ if (listen_id->state != RDMA_CM_LISTEN)
+ goto out;
/* Create a new RDMA id for the new IW CM ID */
- new_cm_id = rdma_create_id(listen_id->id.event_handler,
+ new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net,
+ listen_id->id.event_handler,
listen_id->id.context,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(new_cm_id)) {
@@ -1664,14 +2084,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
conn_id->state = RDMA_CM_CONNECT;
- dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr);
- if (!dev) {
- ret = -EADDRNOTAVAIL;
- mutex_unlock(&conn_id->handler_mutex);
- rdma_destroy_id(new_cm_id);
- goto out;
- }
- ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
+ ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL);
if (ret) {
mutex_unlock(&conn_id->handler_mutex);
rdma_destroy_id(new_cm_id);
@@ -1689,17 +2102,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
cm_id->context = conn_id;
cm_id->cm_handler = cma_iw_handler;
- sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr;
- *sin = iw_event->local_addr;
- sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
- *sin = iw_event->remote_addr;
-
- ret = ib_query_device(conn_id->id.device, &attr);
- if (ret) {
- mutex_unlock(&conn_id->handler_mutex);
- rdma_destroy_id(new_cm_id);
- goto out;
- }
+ memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
memset(&event, 0, sizeof event);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
@@ -1728,60 +2132,42 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
cma_deref_id(conn_id);
out:
- if (dev)
- dev_put(dev);
mutex_unlock(&listen_id->handler_mutex);
return ret;
}
static int cma_ib_listen(struct rdma_id_private *id_priv)
{
- struct ib_cm_compare_data compare_data;
struct sockaddr *addr;
struct ib_cm_id *id;
__be64 svc_id;
- int ret;
- id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+ addr = cma_src_addr(id_priv);
+ svc_id = rdma_get_service_id(&id_priv->id, addr);
+ id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
if (IS_ERR(id))
return PTR_ERR(id);
-
id_priv->cm_id.ib = id;
- addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
- svc_id = cma_get_service_id(id_priv->id.ps, addr);
- if (cma_any_addr(addr) && !id_priv->afonly)
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
- else {
- cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
- }
-
- if (ret) {
- ib_destroy_cm_id(id_priv->cm_id.ib);
- id_priv->cm_id.ib = NULL;
- }
-
- return ret;
+ return 0;
}
static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
{
int ret;
- struct sockaddr_in *sin;
struct iw_cm_id *id;
id = iw_create_cm_id(id_priv->id.device,
- id_priv->sock,
- iw_conn_req_handler,
- id_priv);
+ iw_conn_req_handler,
+ id_priv);
if (IS_ERR(id))
return PTR_ERR(id);
+ id->tos = id_priv->tos;
id_priv->cm_id.iw = id;
- sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
- id_priv->cm_id.iw->local_addr = *sin;
+ memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
@@ -1808,9 +2194,13 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
{
struct rdma_id_private *dev_id_priv;
struct rdma_cm_id *id;
+ struct vnet *net = id_priv->id.route.addr.dev_addr.net;
int ret;
- id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+ if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1))
+ return;
+
+ id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps,
id_priv->id.qp_type);
if (IS_ERR(id))
return;
@@ -1818,11 +2208,10 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
dev_id_priv = container_of(id, struct rdma_id_private, id);
dev_id_priv->state = RDMA_CM_ADDR_BOUND;
- dev_id_priv->sock = id_priv->sock;
- memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
- ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+ memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
- cma_attach_to_dev(dev_id_priv, cma_dev);
+ _cma_attach_to_dev(dev_id_priv, cma_dev);
list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
atomic_inc(&id_priv->refcount);
dev_id_priv->internal_id = 1;
@@ -1830,7 +2219,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
ret = rdma_listen(id, id_priv->backlog);
if (ret)
- cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name);
+ pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n",
+ ret, cma_dev->device->name);
}
static void cma_listen_on_all(struct rdma_id_private *id_priv)
@@ -1853,15 +2243,6 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos)
}
EXPORT_SYMBOL(rdma_set_service_type);
-void rdma_set_timeout(struct rdma_cm_id *id, int timeout)
-{
- struct rdma_id_private *id_priv;
-
- id_priv = container_of(id, struct rdma_id_private, id);
- id_priv->qp_timeout = (u8) timeout;
-}
-EXPORT_SYMBOL(rdma_set_timeout);
-
static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
void *context)
{
@@ -1886,31 +2267,39 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
struct cma_work *work)
{
- struct rdma_addr *addr = &id_priv->id.route.addr;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
struct ib_sa_path_rec path_rec;
ib_sa_comp_mask comp_mask;
struct sockaddr_in6 *sin6;
+ struct sockaddr_ib *sib;
memset(&path_rec, 0, sizeof path_rec);
- rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid);
- rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid);
- path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr));
+ rdma_addr_get_sgid(dev_addr, &path_rec.sgid);
+ rdma_addr_get_dgid(dev_addr, &path_rec.dgid);
+ path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
path_rec.numb_path = 1;
path_rec.reversible = 1;
- path_rec.service_id = cma_get_service_id(id_priv->id.ps,
- (struct sockaddr *) &addr->dst_addr);
+ path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
- if (addr->src_addr.ss_family == AF_INET) {
+ switch (cma_family(id_priv)) {
+ case AF_INET:
path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
- } else {
- sin6 = (struct sockaddr_in6 *) &addr->src_addr;
+ break;
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
}
id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
@@ -1944,30 +2333,6 @@ out:
kfree(work);
}
-static void cma_ndev_work_handler(struct work_struct *_work)
-{
- struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
- struct rdma_id_private *id_priv = work->id;
- int destroy = 0;
-
- mutex_lock(&id_priv->handler_mutex);
- if (id_priv->state == RDMA_CM_DESTROYING ||
- id_priv->state == RDMA_CM_DEVICE_REMOVAL)
- goto out;
-
- if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
- cma_exch(id_priv, RDMA_CM_DESTROYING);
- destroy = 1;
- }
-
-out:
- mutex_unlock(&id_priv->handler_mutex);
- cma_deref_id(id_priv);
- if (destroy)
- rdma_destroy_id(&id_priv->id);
- kfree(work);
-}
-
static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
{
struct rdma_route *route = &id_priv->id.route;
@@ -2046,9 +2411,22 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
return 0;
}
-static u8 tos_to_sl(u8 tos)
+static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+{
+ /* TODO: Implement this function */
+ return 0;
+}
+
+static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type,
+ unsigned long supported_gids,
+ enum ib_gid_type default_gid)
{
- return def_prec2sl & 7;
+ if ((network_type == RDMA_NETWORK_IPV4 ||
+ network_type == RDMA_NETWORK_IPV6) &&
+ test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids))
+ return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+ return default_gid;
}
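
Two helpers are introduced here. iboe_tos_to_sl() is intentionally left as a stub on FreeBSD for now (upstream Linux derives the SL from the traffic-class/VLAN egress priority mapping of the egress interface), so RoCE traffic maps to SL 0 regardless of TOS. cma_route_gid_type() prefers a RoCEv2 (UDP-encapsulated) GID whenever the address-resolution hint says the destination was reached over routed IPv4/IPv6 and the port advertises that GID type. A small illustration under those assumptions:

static void
example_gid_type_selection(void)
{
	unsigned long supported = BIT(IB_GID_TYPE_ROCE) |
	    BIT(IB_GID_TYPE_ROCE_UDP_ENCAP);

	/* Selects IB_GID_TYPE_ROCE_UDP_ENCAP (RoCEv2) for a routed IPv4 peer. */
	(void)cma_route_gid_type(RDMA_NETWORK_IPV4, supported,
	    IB_GID_TYPE_ROCE);
}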
static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
@@ -2057,14 +2435,9 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
struct rdma_addr *addr = &route->addr;
struct cma_work *work;
int ret;
- struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr;
- struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr;
struct net_device *ndev = NULL;
- if (src_addr->sin_family != dst_addr->sin_family)
- return -EINVAL;
-
work = kzalloc(sizeof *work, GFP_KERNEL);
if (!work)
return -ENOMEM;
@@ -2080,29 +2453,64 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
route->num_paths = 1;
- if (addr->dev_addr.bound_dev_if)
- ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+ if (addr->dev_addr.bound_dev_if) {
+ unsigned long supported_gids;
+
+ ndev = dev_get_by_index(addr->dev_addr.net,
+ addr->dev_addr.bound_dev_if);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
+
+ if (ndev->if_flags & IFF_LOOPBACK) {
+ dev_put(ndev);
+ if (!id_priv->id.device->get_netdev) {
+ ret = -EOPNOTSUPP;
+ goto err2;
+ }
+
+ ndev = id_priv->id.device->get_netdev(id_priv->id.device,
+ id_priv->id.port_num);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
+ }
+
+ route->path_rec->net = ndev->if_vnet;
+ route->path_rec->ifindex = ndev->if_index;
+ supported_gids = roce_gid_type_mask_support(id_priv->id.device,
+ id_priv->id.port_num);
+ route->path_rec->gid_type =
+ cma_route_gid_type(addr->dev_addr.network,
+ supported_gids,
+ id_priv->gid_type);
+ }
if (!ndev) {
ret = -ENODEV;
goto err2;
}
- route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev);
memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN);
- memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen);
-
rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
&route->path_rec->sgid);
rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
&route->path_rec->dgid);
- route->path_rec->hop_limit = 1;
+ /* Use the hint from IP Stack to select GID Type */
+ if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network))
+ route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+ if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB)
+ /* TODO: get the hoplimit from the inet/inet6 device */
+ route->path_rec->hop_limit = addr->dev_addr.hoplimit;
+ else
+ route->path_rec->hop_limit = 1;
route->path_rec->reversible = 1;
route->path_rec->pkey = cpu_to_be16(0xffff);
route->path_rec->mtu_selector = IB_SA_EQ;
- route->path_rec->sl = tos_to_sl(id_priv->tos);
-
+ route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos);
route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu);
route->path_rec->rate_selector = IB_SA_EQ;
route->path_rec->rate = iboe_get_rate(ndev);
@@ -2141,27 +2549,15 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
return -EINVAL;
atomic_inc(&id_priv->refcount);
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- switch (rdma_port_get_link_layer(id->device, id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ret = cma_resolve_ib_route(id_priv, timeout_ms);
- break;
- case IB_LINK_LAYER_ETHERNET:
- ret = cma_resolve_iboe_route(id_priv);
- break;
- default:
- ret = -ENOSYS;
- }
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ if (rdma_cap_ib_sa(id->device, id->port_num))
+ ret = cma_resolve_ib_route(id_priv, timeout_ms);
+ else if (rdma_protocol_roce(id->device, id->port_num))
+ ret = cma_resolve_iboe_route(id_priv);
+ else if (rdma_protocol_iwarp(id->device, id->port_num))
ret = cma_resolve_iw_route(id_priv, timeout_ms);
- break;
- default:
+ else
ret = -ENOSYS;
- break;
- }
+
if (ret)
goto err;
@@ -2173,38 +2569,60 @@ err:
}
EXPORT_SYMBOL(rdma_resolve_route);
-int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type)
+static void cma_set_loopback(struct sockaddr *addr)
{
- /* APM is not supported yet */
- return -EINVAL;
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ break;
+ case AF_INET6:
+ ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ default:
+ ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ }
}
-EXPORT_SYMBOL(rdma_enable_apm);
static int cma_bind_loopback(struct rdma_id_private *id_priv)
{
- struct cma_device *cma_dev;
+ struct cma_device *cma_dev, *cur_dev;
struct ib_port_attr port_attr;
union ib_gid gid;
u16 pkey;
int ret;
u8 p;
+ cma_dev = NULL;
mutex_lock(&lock);
- if (list_empty(&dev_list)) {
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ if (cma_family(id_priv) == AF_IB &&
+ !rdma_cap_ib_cm(cur_dev->device, 1))
+ continue;
+
+ if (!cma_dev)
+ cma_dev = cur_dev;
+
+ for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ if (!ib_query_port(cur_dev->device, p, &port_attr) &&
+ port_attr.state == IB_PORT_ACTIVE) {
+ cma_dev = cur_dev;
+ goto port_found;
+ }
+ }
+ }
+
+ if (!cma_dev) {
ret = -ENODEV;
goto out;
}
- list_for_each_entry(cma_dev, &dev_list, list)
- for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p)
- if (!ib_query_port(cma_dev->device, p, &port_attr) &&
- port_attr.state == IB_PORT_ACTIVE)
- goto port_found;
p = 1;
- cma_dev = list_entry(dev_list.next, struct cma_device, list);
port_found:
- ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+ ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL);
if (ret)
goto out;
@@ -2213,13 +2631,14 @@ port_found:
goto out;
id_priv->id.route.addr.dev_addr.dev_type =
- (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ?
+ (rdma_protocol_ib(cma_dev->device, p)) ?
ARPHRD_INFINIBAND : ARPHRD_ETHER;
rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
id_priv->id.port_num = p;
cma_attach_to_dev(id_priv, cma_dev);
+ cma_set_loopback(cma_src_addr(id_priv));
out:
mutex_unlock(&lock);
return ret;
@@ -2237,8 +2656,7 @@ static void addr_handler(int status, struct sockaddr *src_addr,
RDMA_CM_ADDR_RESOLVED))
goto out;
- memcpy(&id_priv->id.route.addr.src_addr, src_addr,
- ip_addr_size(src_addr));
+ memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
if (!status && !id_priv->cma_dev)
status = cma_acquire_dev(id_priv, NULL);
@@ -2266,7 +2684,6 @@ out:
static int cma_resolve_loopback(struct rdma_id_private *id_priv)
{
struct cma_work *work;
- struct sockaddr *src, *dst;
union ib_gid gid;
int ret;
@@ -2283,18 +2700,6 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv)
rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
- src = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
- if (cma_zero_addr(src)) {
- dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
- if ((src->sa_family = dst->sa_family) == AF_INET) {
- ((struct sockaddr_in *)src)->sin_addr =
- ((struct sockaddr_in *)dst)->sin_addr;
- } else {
- ((struct sockaddr_in6 *)src)->sin6_addr =
- ((struct sockaddr_in6 *)dst)->sin6_addr;
- }
- }
-
work->id = id_priv;
INIT_WORK(&work->work, cma_work_handler);
work->old_state = RDMA_CM_ADDR_QUERY;
@@ -2307,15 +2712,23 @@ err:
return ret;
}
-static int cma_resolve_scif(struct rdma_id_private *id_priv)
+static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
{
struct cma_work *work;
+ int ret;
work = kzalloc(sizeof *work, GFP_KERNEL);
if (!work)
return -ENOMEM;
- /* we probably can leave it empty here */
+ if (!id_priv->cma_dev) {
+ ret = cma_resolve_ib_dev(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
+ &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));
work->id = id_priv;
INIT_WORK(&work->work, cma_work_handler);
@@ -2324,6 +2737,9 @@ static int cma_resolve_scif(struct rdma_id_private *id_priv)
work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
queue_work(cma_wq, &work->work);
return 0;
+err:
+ kfree(work);
+ return ret;
}
static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
@@ -2332,48 +2748,18 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
if (!src_addr || !src_addr->sa_family) {
src_addr = (struct sockaddr *) &id->route.addr.src_addr;
src_addr->sa_family = dst_addr->sa_family;
-#ifdef INET6
if (dst_addr->sa_family == AF_INET6) {
- ((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
- ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
- }
-#endif
- }
- if (!cma_any_addr(src_addr))
- return rdma_bind_addr(id, src_addr);
- else {
-#if defined(INET6) || defined(INET)
- union {
-#ifdef INET
- struct sockaddr_in in;
-#endif
-#ifdef INET6
- struct sockaddr_in6 in6;
-#endif
- } addr;
-#endif
-
- switch(dst_addr->sa_family) {
-#ifdef INET
- case AF_INET:
- memset(&addr.in, 0, sizeof(addr.in));
- addr.in.sin_family = dst_addr->sa_family;
- addr.in.sin_len = sizeof(addr.in);
- return rdma_bind_addr(id, (struct sockaddr *)&addr.in);
-#endif
-#ifdef INET6
- case AF_INET6:
- memset(&addr.in6, 0, sizeof(addr.in6));
- addr.in6.sin6_family = dst_addr->sa_family;
- addr.in6.sin6_len = sizeof(addr.in6);
- addr.in6.sin6_scope_id =
- ((struct sockaddr_in6 *)dst_addr)->sin6_scope_id;
- return rdma_bind_addr(id, (struct sockaddr *)&addr.in6);
-#endif
- default:
- return -EINVAL;
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (IN6_IS_SCOPE_LINKLOCAL(&dst_addr6->sin6_addr))
+ id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
+ } else if (dst_addr->sa_family == AF_IB) {
+ ((struct sockaddr_ib *) src_addr)->sib_pkey =
+ ((struct sockaddr_ib *) dst_addr)->sib_pkey;
}
}
+ return rdma_bind_addr(id, src_addr);
}
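
cma_bind_addr() no longer builds its own wildcard sockaddr union; it fills in the family (plus the IPv6 scope and bound interface for link-local destinations, or the P_Key for AF_IB) and always falls through to rdma_bind_addr(). Callers keep passing a NULL source, as in this hypothetical sketch (the destination address and timeout are example values):

static int
example_resolve(struct rdma_cm_id *id)
{
	struct sockaddr_in dst;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_len = sizeof(dst);
	dst.sin_port = htons(18515);
	dst.sin_addr.s_addr = htonl(0xc0000201);	/* 192.0.2.1, TEST-NET-1 */

	/* NULL source: cma_bind_addr() binds a wildcard of the same family. */
	return (rdma_resolve_addr(id, NULL, (struct sockaddr *)&dst, 2000));
}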
int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
@@ -2389,20 +2775,25 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
return ret;
}
+ if (cma_family(id_priv) != dst_addr->sa_family)
+ return -EINVAL;
+
if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
return -EINVAL;
atomic_inc(&id_priv->refcount);
- memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
- if (cma_any_addr(dst_addr))
+ memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
+ if (cma_any_addr(dst_addr)) {
ret = cma_resolve_loopback(id_priv);
- else if (id_priv->id.device &&
- rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF)
- ret = cma_resolve_scif(id_priv);
- else
- ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr,
- dst_addr, &id->route.addr.dev_addr,
- timeout_ms, addr_handler, id_priv);
+ } else {
+ if (dst_addr->sa_family == AF_IB) {
+ ret = cma_resolve_ib_addr(id_priv);
+ } else {
+ ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv),
+ dst_addr, &id->route.addr.dev_addr,
+ timeout_ms, addr_handler, id_priv);
+ }
+ }
if (ret)
goto err;
@@ -2422,7 +2813,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
id_priv = container_of(id, struct rdma_id_private, id);
spin_lock_irqsave(&id_priv->lock, flags);
- if (id_priv->state == RDMA_CM_IDLE) {
+ if (reuse || id_priv->state == RDMA_CM_IDLE) {
id_priv->reuseaddr = reuse;
ret = 0;
} else {
@@ -2456,59 +2847,73 @@ EXPORT_SYMBOL(rdma_set_afonly);
static void cma_bind_port(struct rdma_bind_list *bind_list,
struct rdma_id_private *id_priv)
{
- struct sockaddr_in *sin;
+ struct sockaddr *addr;
+ struct sockaddr_ib *sib;
+ u64 sid, mask;
+ __be16 port;
+
+ addr = cma_src_addr(id_priv);
+ port = htons(bind_list->port);
- sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
- sin->sin_port = htons(bind_list->port);
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_port = port;
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *) addr)->sin6_port = port;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ sid = be64_to_cpu(sib->sib_sid);
+ mask = be64_to_cpu(sib->sib_sid_mask);
+ sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port));
+ sib->sib_sid_mask = cpu_to_be64(~0ULL);
+ break;
+ }
id_priv->bind_list = bind_list;
hlist_add_head(&id_priv->node, &bind_list->owners);
}
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
- unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv, unsigned short snum)
{
struct rdma_bind_list *bind_list;
- int port, ret;
+ int ret;
bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
if (!bind_list)
return -ENOMEM;
- do {
- ret = idr_get_new_above(ps, bind_list, snum, &port);
- } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
-
- if (ret)
- goto err1;
-
- if (port != snum) {
- ret = -EADDRNOTAVAIL;
- goto err2;
- }
+ ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list,
+ snum);
+ if (ret < 0)
+ goto err;
bind_list->ps = ps;
- bind_list->port = (unsigned short) port;
+ bind_list->port = (unsigned short)ret;
cma_bind_port(bind_list, id_priv);
return 0;
-err2:
- idr_remove(ps, port);
-err1:
+err:
kfree(bind_list);
- return ret;
+ return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
}
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
static unsigned int last_used_port;
int low, high, remaining;
unsigned int rover;
+ struct vnet *net = id_priv->id.route.addr.dev_addr.net;
+ u32 rand;
- inet_get_local_port_range(&init_net, &low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
- rover = random() % remaining + low;
+ get_random_bytes(&rand, sizeof(rand));
+ rover = rand % remaining + low;
retry:
if (last_used_port != rover &&
- !idr_find(ps, (unsigned short) rover)) {
+ !cma_ps_find(net, ps, (unsigned short)rover)) {
int ret = cma_alloc_port(ps, id_priv, rover);
/*
* Remember previously used port number in order to avoid
@@ -2518,7 +2923,7 @@ retry:
last_used_port = rover;
if (ret != -EADDRNOTAVAIL)
return ret;
- }
+ }
if (--remaining) {
rover++;
if ((rover < low) || (rover > high))
@@ -2540,7 +2945,7 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
struct rdma_id_private *cur_id;
struct sockaddr *addr, *cur_addr;
- addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+ addr = cma_src_addr(id_priv);
hlist_for_each_entry(cur_id, &bind_list->owners, node) {
if (id_priv == cur_id)
continue;
@@ -2549,7 +2954,7 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
cur_id->reuseaddr)
continue;
- cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr;
+ cur_addr = cma_src_addr(cur_id);
if (id_priv->afonly && cur_id->afonly &&
(addr->sa_family != cur_addr->sa_family))
continue;
@@ -2563,15 +2968,19 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
return 0;
}
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list;
unsigned short snum;
int ret;
- snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+ snum = ntohs(cma_port(cma_src_addr(id_priv)));
+ if (snum < IPPORT_RESERVED &&
+ priv_check(curthread, PRIV_NETINET_BINDANY) != 0)
+ return -EACCES;
- bind_list = idr_find(ps, snum);
+ bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum);
if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, snum);
} else {
@@ -2594,97 +3003,92 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
return ret;
}
-static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+ struct rdma_id_private *id_priv)
{
- int ret;
- int size;
- struct socket *sock;
+ switch (id_priv->id.ps) {
+ case RDMA_PS_TCP:
+ case RDMA_PS_UDP:
+ case RDMA_PS_IPOIB:
+ case RDMA_PS_IB:
+ return id_priv->id.ps;
+ default:
- ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret)
- return ret;
-#ifdef __linux__
- ret = sock->ops->bind(sock,
- (struct sockaddr *) &id_priv->id.route.addr.src_addr,
- ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
-#else
- ret = -sobind(sock,
- (struct sockaddr *)&id_priv->id.route.addr.src_addr,
- curthread);
-#endif
- if (ret) {
- sock_release(sock);
- return ret;
+ return 0;
}
+}
- size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
- ret = sock_getname(sock,
- (struct sockaddr *) &id_priv->id.route.addr.src_addr,
- &size, 0);
- if (ret) {
- sock_release(sock);
- return ret;
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
+{
+ enum rdma_port_space ps = 0;
+ struct sockaddr_ib *sib;
+ u64 sid_ps, mask, sid;
+
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK;
+ sid = be64_to_cpu(sib->sib_sid) & mask;
+
+ if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
+ sid_ps = RDMA_IB_IP_PS_IB;
+ ps = RDMA_PS_IB;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
+ (sid == (RDMA_IB_IP_PS_TCP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_TCP;
+ ps = RDMA_PS_TCP;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
+ (sid == (RDMA_IB_IP_PS_UDP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_UDP;
+ ps = RDMA_PS_UDP;
}
- id_priv->sock = sock;
- return 0;
+ if (ps) {
+ sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib)));
+ sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK |
+ be64_to_cpu(sib->sib_sid_mask));
+ }
+ return ps;
}
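
For AF_IB the port space is recovered from the high bits of sib_sid under sib_sid_mask rather than from an IP port. A worked example, assuming the usual header values RDMA_IB_IP_PS_MASK == 0xFFFFFFFFFFFF0000ULL and RDMA_IB_IP_PS_TCP == 0x0000000001060000ULL:

/*
 * A sockaddr_ib bound with
 *     sib_sid      = cpu_to_be64(RDMA_IB_IP_PS_TCP | 18515)
 *     sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | 0xFFFF)
 * matches the RDMA_PS_TCP branch above, so cma_select_ib_ps() returns
 * RDMA_PS_TCP and rewrites sib_sid to 0x0000000001064853 (port-space
 * prefix plus the 16-bit port), ORing RDMA_IB_IP_PS_MASK back into
 * sib_sid_mask.
 */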
static int cma_get_port(struct rdma_id_private *id_priv)
{
- struct idr *ps;
+ enum rdma_port_space ps;
int ret;
- switch (id_priv->id.ps) {
- case RDMA_PS_SDP:
- ps = &sdp_ps;
- break;
- case RDMA_PS_TCP:
- ps = &tcp_ps;
- if (unify_tcp_port_space) {
- ret = cma_get_tcp_port(id_priv);
- if (ret)
- goto out;
- }
- break;
- case RDMA_PS_UDP:
- ps = &udp_ps;
- break;
- case RDMA_PS_IPOIB:
- ps = &ipoib_ps;
- break;
- case RDMA_PS_IB:
- ps = &ib_ps;
- break;
- default:
+ if (cma_family(id_priv) != AF_IB)
+ ps = cma_select_inet_ps(id_priv);
+ else
+ ps = cma_select_ib_ps(id_priv);
+ if (!ps)
return -EPROTONOSUPPORT;
- }
mutex_lock(&lock);
- if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr))
+ if (cma_any_port(cma_src_addr(id_priv)))
ret = cma_alloc_any_port(ps, id_priv);
else
ret = cma_use_port(ps, id_priv);
mutex_unlock(&lock);
-out:
+
return ret;
}
static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
struct sockaddr *addr)
{
-#if defined(INET6)
- struct sockaddr_in6 *sin6;
+#ifdef INET6
+ struct sockaddr_in6 sin6;
if (addr->sa_family != AF_INET6)
return 0;
- sin6 = (struct sockaddr_in6 *) addr;
- if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
- !sin6->sin6_scope_id)
- return -EINVAL;
+ sin6 = *(struct sockaddr_in6 *)addr;
+
+ if (!(IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr)))
+ return 0;
+
+ if (sa6_recoverscope(&sin6) || sin6.sin6_scope_id == 0)
+ return -EINVAL;
- dev_addr->bound_dev_if = sin6->sin6_scope_id;
+ dev_addr->bound_dev_if = sin6.sin6_scope_id;
#endif
return 0;
}
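
cma_check_linklocal() now works on a stack copy and relies on sa6_recoverscope(), the FreeBSD routine that recovers sin6_scope_id from a KAME-style embedded zone, so a link-local bind without a recoverable zone is rejected instead of merely requiring an explicit scope ID. A minimal sketch of the same pattern (example helper, not part of the patch):

static int
example_recover_scope(const struct sockaddr_in6 *addr, uint32_t *bound_if)
{
	struct sockaddr_in6 tmp = *addr;	/* never scribble on the caller's copy */

	if (!IN6_IS_SCOPE_LINKLOCAL(&tmp.sin6_addr))
		return (0);
	/* sa6_recoverscope() fails, or leaves 0, when no zone is recoverable. */
	if (sa6_recoverscope(&tmp) != 0 || tmp.sin6_scope_id == 0)
		return (EINVAL);
	*bound_if = tmp.sin6_scope_id;
	return (0);
}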
@@ -2696,8 +3100,8 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
id_priv = container_of(id, struct rdma_id_private, id);
if (id_priv->state == RDMA_CM_IDLE) {
- ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
- ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
+ id->route.addr.src_addr.ss_family = AF_INET;
+ ret = rdma_bind_addr(id, cma_src_addr(id_priv));
if (ret)
return ret;
}
@@ -2713,19 +3117,15 @@ int rdma_listen(struct rdma_cm_id *id, int backlog)
id_priv->backlog = backlog;
if (id->device) {
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, 1)) {
ret = cma_ib_listen(id_priv);
if (ret)
goto err;
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ } else if (rdma_cap_iw_cm(id->device, 1)) {
ret = cma_iw_listen(id_priv, backlog);
if (ret)
goto err;
- break;
- default:
+ } else {
ret = -ENOSYS;
goto err;
}
@@ -2744,12 +3144,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
struct rdma_id_private *id_priv;
int ret;
-#if defined(INET6)
- int ipv6only;
- size_t var_size = sizeof(int);
-#endif
- if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
+ if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 &&
+ addr->sa_family != AF_IB)
return -EAFNOSUPPORT;
id_priv = container_of(id, struct rdma_id_private, id);
@@ -2760,9 +3157,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
if (ret)
goto err1;
- memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
+ memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
if (!cma_any_addr(addr)) {
- ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL);
+ ret = cma_translate_addr(addr, &id->route.addr.dev_addr);
if (ret)
goto err1;
@@ -2774,10 +3171,12 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
if (addr->sa_family == AF_INET)
id_priv->afonly = 1;
-#if defined(INET6)
- else if (addr->sa_family == AF_INET6)
- id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only",
- &ipv6only, &var_size, NULL, 0, NULL, 0);
+#ifdef INET6
+ else if (addr->sa_family == AF_INET6) {
+ CURVNET_SET_QUIET(id_priv->id.route.addr.dev_addr.net);
+ id_priv->afonly = V_ip6_v6only;
+ CURVNET_RESTORE();
+ }
#endif
}
ret = cma_get_port(id_priv);
@@ -2794,62 +3193,32 @@ err1:
}
EXPORT_SYMBOL(rdma_bind_addr);
-static int cma_format_hdr(void *hdr, enum rdma_port_space ps,
- struct rdma_route *route)
+static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
{
struct cma_hdr *cma_hdr;
- struct sdp_hh *sdp_hdr;
- if (route->addr.src_addr.ss_family == AF_INET) {
+ cma_hdr = hdr;
+ cma_hdr->cma_version = CMA_VERSION;
+ if (cma_family(id_priv) == AF_INET) {
struct sockaddr_in *src4, *dst4;
- src4 = (struct sockaddr_in *) &route->addr.src_addr;
- dst4 = (struct sockaddr_in *) &route->addr.dst_addr;
-
- switch (ps) {
- case RDMA_PS_SDP:
- sdp_hdr = hdr;
- if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
- return -EINVAL;
- sdp_set_ip_ver(sdp_hdr, 4);
- sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
- sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
- sdp_hdr->port = src4->sin_port;
- break;
- default:
- cma_hdr = hdr;
- cma_hdr->cma_version = CMA_VERSION;
- cma_set_ip_ver(cma_hdr, 4);
- cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
- cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
- cma_hdr->port = src4->sin_port;
- break;
- }
- } else {
+ src4 = (struct sockaddr_in *) cma_src_addr(id_priv);
+ dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 4);
+ cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ cma_hdr->port = src4->sin_port;
+ } else if (cma_family(id_priv) == AF_INET6) {
struct sockaddr_in6 *src6, *dst6;
- src6 = (struct sockaddr_in6 *) &route->addr.src_addr;
- dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr;
-
- switch (ps) {
- case RDMA_PS_SDP:
- sdp_hdr = hdr;
- if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
- return -EINVAL;
- sdp_set_ip_ver(sdp_hdr, 6);
- sdp_hdr->src_addr.ip6 = src6->sin6_addr;
- sdp_hdr->dst_addr.ip6 = dst6->sin6_addr;
- sdp_hdr->port = src6->sin6_port;
- break;
- default:
- cma_hdr = hdr;
- cma_hdr->cma_version = CMA_VERSION;
- cma_set_ip_ver(cma_hdr, 6);
- cma_hdr->src_addr.ip6 = src6->sin6_addr;
- cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
- cma_hdr->port = src6->sin6_port;
- break;
- }
+ src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+ dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 6);
+ cma_hdr->src_addr.ip6 = src6->sin6_addr;
+ cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+ cma_hdr->port = src6->sin6_port;
}
return 0;
}
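
With the SDP port space gone, cma_format_hdr() always emits the plain CMA private-data header for IPv4/IPv6 (AF_IB requests carry no header, hence the private_data_len handling in the connect paths below). The layout it fills is roughly the struct cma_hdr defined earlier in this file; the field widths shown here are an assumption for illustration:

/*
 * Private data prepended to IP-based connection requests.
 */
struct cma_hdr_sketch {
	u8	cma_version;		/* CMA_VERSION */
	u8	ip_version;		/* IP version in bits 7:4, see cma_set_ip_ver() */
	__be16	port;			/* source port */
	union cma_ip_addr src_addr;
	union cma_ip_addr dst_addr;
};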
@@ -2862,8 +3231,9 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
int ret = 0;
- if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
- return 0;
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_CONNECT)
+ goto out;
memset(&event, 0, sizeof event);
switch (ib_event->event) {
@@ -2879,27 +3249,28 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
event.status = ib_event->param.sidr_rep_rcvd.status;
break;
}
- ret = cma_set_qkey(id_priv);
+ ret = cma_set_qkey(id_priv, rep->qkey);
if (ret) {
event.event = RDMA_CM_EVENT_ADDR_ERROR;
- event.status = -EINVAL;
+ event.status = ret;
break;
}
- if (id_priv->qkey != rep->qkey) {
- event.event = RDMA_CM_EVENT_UNREACHABLE;
- event.status = -EINVAL;
+ ret = ib_init_ah_from_path(id_priv->id.device,
+ id_priv->id.port_num,
+ id_priv->id.route.path_rec,
+ &event.param.ud.ah_attr);
+ if (ret) {
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = ret;
break;
}
- ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
- id_priv->id.route.path_rec,
- &event.param.ud.ah_attr);
event.param.ud.qp_num = rep->qpn;
event.param.ud.qkey = rep->qkey;
event.event = RDMA_CM_EVENT_ESTABLISHED;
event.status = 0;
break;
default:
- printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+ pr_err("RDMA CMA: unexpected IB CM event: %d\n",
ib_event->event);
goto out;
}
@@ -2922,27 +3293,34 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
struct rdma_conn_param *conn_param)
{
struct ib_cm_sidr_req_param req;
- struct rdma_route *route;
struct ib_cm_id *id;
- int ret;
+ void *private_data;
+ int offset, ret;
- req.private_data_len = sizeof(struct cma_hdr) +
- conn_param->private_data_len;
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv);
+ req.private_data_len = offset + conn_param->private_data_len;
if (req.private_data_len < conn_param->private_data_len)
return -EINVAL;
- req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
- if (!req.private_data)
- return -ENOMEM;
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
if (conn_param->private_data && conn_param->private_data_len)
- memcpy((void *) req.private_data + sizeof(struct cma_hdr),
- conn_param->private_data, conn_param->private_data_len);
+ memcpy((char *)private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
- route = &id_priv->id.route;
- ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route);
- if (ret)
- goto out;
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
id_priv);
@@ -2952,20 +3330,18 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
}
id_priv->cm_id.ib = id;
- req.path = route->path_rec;
- req.service_id = cma_get_service_id(id_priv->id.ps,
- (struct sockaddr *) &route->addr.dst_addr);
- req.timeout_ms = 1 << (cma_response_timeout - 8);
+ req.path = id_priv->id.route.path_rec;
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+ req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
req.max_cm_retries = CMA_MAX_CM_RETRIES;
- cma_dbg(id_priv, "sending SIDR\n");
ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
if (ret) {
ib_destroy_cm_id(id_priv->cm_id.ib);
id_priv->cm_id.ib = NULL;
}
out:
- kfree(req.private_data);
+ kfree(private_data);
return ret;
}
@@ -2979,17 +3355,21 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
int offset, ret;
memset(&req, 0, sizeof req);
- offset = cma_user_data_offset(id_priv->id.ps);
+ offset = cma_user_data_offset(id_priv);
req.private_data_len = offset + conn_param->private_data_len;
if (req.private_data_len < conn_param->private_data_len)
return -EINVAL;
- private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
- if (!private_data)
- return -ENOMEM;
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
if (conn_param->private_data && conn_param->private_data_len)
- memcpy(private_data + offset, conn_param->private_data,
+ memcpy((char *)private_data + offset, conn_param->private_data,
conn_param->private_data_len);
id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
@@ -3000,17 +3380,18 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
id_priv->cm_id.ib = id;
route = &id_priv->id.route;
- ret = cma_format_hdr(private_data, id_priv->id.ps, route);
- if (ret)
- goto out;
- req.private_data = private_data;
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
req.primary_path = &route->path_rec[0];
if (route->num_paths == 2)
req.alternate_path = &route->path_rec[1];
- req.service_id = cma_get_service_id(id_priv->id.ps,
- (struct sockaddr *) &route->addr.dst_addr);
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
req.qp_num = id_priv->qp_num;
req.qp_type = id_priv->id.qp_type;
req.starting_psn = id_priv->seq_num;
@@ -3019,12 +3400,11 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
req.flow_control = conn_param->flow_control;
req.retry_count = min_t(u8, 7, conn_param->retry_count);
req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
- req.remote_cm_response_timeout = cma_response_timeout;
- req.local_cm_response_timeout = cma_response_timeout;
+ req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
req.max_cm_retries = CMA_MAX_CM_RETRIES;
req.srq = id_priv->srq ? 1 : 0;
- cma_dbg(id_priv, "sending REQ\n");
ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
out:
if (ret && !IS_ERR(id)) {
@@ -3040,32 +3420,30 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
struct rdma_conn_param *conn_param)
{
struct iw_cm_id *cm_id;
- struct sockaddr_in* sin;
int ret;
struct iw_cm_conn_param iw_param;
- cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock,
- cma_iw_handler, id_priv);
+ cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
+ cm_id->tos = id_priv->tos;
id_priv->cm_id.iw = cm_id;
- sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr;
- cm_id->local_addr = *sin;
-
- sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
- cm_id->remote_addr = *sin;
+ memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+ memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv),
+ rdma_addr_size(cma_dst_addr(id_priv)));
ret = cma_modify_qp_rtr(id_priv, conn_param);
if (ret)
goto out;
if (conn_param) {
- iw_param.ord = conn_param->initiator_depth;
- iw_param.ird = conn_param->responder_resources;
- iw_param.private_data = conn_param->private_data;
- iw_param.private_data_len = conn_param->private_data_len;
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
} else {
memset(&iw_param, 0, sizeof iw_param);
@@ -3094,21 +3472,15 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
id_priv->srq = conn_param->srq;
}
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (id->qp_type == IB_QPT_UD)
ret = cma_resolve_ib_udp(id_priv, conn_param);
else
ret = cma_connect_ib(id_priv, conn_param);
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num))
ret = cma_connect_iw(id_priv, conn_param);
- break;
- default:
+ else
ret = -ENOSYS;
- break;
- }
if (ret)
goto err;
@@ -3144,7 +3516,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
rep.flow_control = conn_param->flow_control;
rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
rep.srq = id_priv->srq ? 1 : 0;
- cma_dbg(id_priv, "sending REP\n");
+
ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
out:
return ret;
@@ -3156,9 +3528,6 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
struct iw_cm_conn_param iw_param;
int ret;
- if (!conn_param)
- return -EINVAL;
-
ret = cma_modify_qp_rtr(id_priv, conn_param);
if (ret)
return ret;
@@ -3176,7 +3545,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
}
static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
- enum ib_cm_sidr_status status,
+ enum ib_cm_sidr_status status, u32 qkey,
const void *private_data, int private_data_len)
{
struct ib_cm_sidr_rep_param rep;
@@ -3185,7 +3554,7 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
memset(&rep, 0, sizeof rep);
rep.status = status;
if (status == IB_SIDR_SUCCESS) {
- ret = cma_set_qkey(id_priv);
+ ret = cma_set_qkey(id_priv, qkey);
if (ret)
return ret;
rep.qp_num = id_priv->qp_num;
@@ -3194,7 +3563,6 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
rep.private_data = private_data;
rep.private_data_len = private_data_len;
- cma_dbg(id_priv, "sending SIDR\n");
return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
}
@@ -3205,7 +3573,8 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
id_priv = container_of(id, struct rdma_id_private, id);
- id_priv->owner = curthread->td_proc->p_pid;
+ id_priv->owner = task_pid_nr(current);
+
if (!cma_comp(id_priv, RDMA_CM_CONNECT))
return -EINVAL;
@@ -3214,31 +3583,26 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
id_priv->srq = conn_param->srq;
}
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (id->qp_type == IB_QPT_UD) {
if (conn_param)
- ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
- conn_param->private_data,
- conn_param->private_data_len);
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ conn_param->qkey,
+ conn_param->private_data,
+ conn_param->private_data_len);
else
ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
- NULL, 0);
+ 0, NULL, 0);
} else {
if (conn_param)
- ret = cma_accept_ib(id_priv, conn_param);
- else
- ret = cma_rep_recv(id_priv);
+ ret = cma_accept_ib(id_priv, conn_param);
+ else
+ ret = cma_rep_recv(id_priv);
}
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num))
ret = cma_accept_iw(id_priv, conn_param);
- break;
- default:
+ else
ret = -ENOSYS;
- break;
- }
if (ret)
goto reject;
@@ -3282,27 +3646,20 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
if (!id_priv->cm_id.ib)
return -EINVAL;
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
if (id->qp_type == IB_QPT_UD)
- ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
private_data, private_data_len);
- else {
- cma_dbg(id_priv, "sending REJ\n");
+ else
ret = ib_send_cm_rej(id_priv->cm_id.ib,
IB_CM_REJ_CONSUMER_DEFINED, NULL,
0, private_data, private_data_len);
- }
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
ret = iw_cm_reject(id_priv->cm_id.iw,
private_data, private_data_len);
- break;
- default:
+ } else
ret = -ENOSYS;
- break;
- }
+
return ret;
}
EXPORT_SYMBOL(rdma_reject);
@@ -3316,26 +3673,18 @@ int rdma_disconnect(struct rdma_cm_id *id)
if (!id_priv->cm_id.ib)
return -EINVAL;
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
ret = cma_modify_qp_err(id_priv);
if (ret)
goto out;
/* Initiate or respond to a disconnect. */
- cma_dbg(id_priv, "sending DREQ\n");
- if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) {
- cma_dbg(id_priv, "sending DREP\n");
+ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
- }
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
- break;
- default:
+ } else
ret = -EINVAL;
- break;
- }
+
out:
return ret;
}
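
Editor's note: the connect/accept/reject/disconnect hunks above all replace the old
rdma_node_get_transport() switch with the per-port rdma_cap_ib_cm()/rdma_cap_iw_cm()
helpers. A minimal sketch of the resulting dispatch pattern follows; example_cm_dispatch()
is a hypothetical consumer, not part of the patch.

static int example_cm_dispatch(struct rdma_cm_id *id)
{
        if (rdma_cap_ib_cm(id->device, id->port_num))
                return 0;               /* InfiniBand/RoCE CM path */
        else if (rdma_cap_iw_cm(id->device, id->port_num))
                return 0;               /* iWARP CM path */
        else
                return -ENOSYS;         /* no connection manager on this port */
}
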
@@ -3346,17 +3695,16 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
struct rdma_id_private *id_priv;
struct cma_multicast *mc = multicast->context;
struct rdma_cm_event event;
- struct rdma_dev_addr *dev_addr;
- int ret;
- struct net_device *ndev = NULL;
- u16 vlan;
+ int ret = 0;
id_priv = mc->id_priv;
- dev_addr = &id_priv->id.route.addr.dev_addr;
- if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) &&
- cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED))
- return 0;
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_ADDR_BOUND &&
+ id_priv->state != RDMA_CM_ADDR_RESOLVED)
+ goto out;
+ if (!status)
+ status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
mutex_lock(&id_priv->qp_mutex);
if (!status && id_priv->id.qp)
status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
@@ -3366,32 +3714,27 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
memset(&event, 0, sizeof event);
event.status = status;
event.param.ud.private_data = mc->context;
- ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
- if (!ndev) {
- status = -ENODEV;
- } else {
- vlan = rdma_vlan_dev_vlan_id(ndev);
- dev_put(ndev);
- }
if (!status) {
+ struct rdma_dev_addr *dev_addr =
+ &id_priv->id.route.addr.dev_addr;
+ struct net_device *ndev =
+ dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ enum ib_gid_type gid_type =
+ id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+
event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
ib_init_ah_from_mcmember(id_priv->id.device,
id_priv->id.port_num, &multicast->rec,
+ ndev, gid_type,
&event.param.ud.ah_attr);
- event.param.ud.ah_attr.vlan_id = vlan;
event.param.ud.qp_num = 0xFFFFFF;
event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
- } else {
+ if (ndev)
+ dev_put(ndev);
+ } else
event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
- /* mark that the cached record is no longer valid */
- if (status != -ENETRESET && status != -EAGAIN) {
- spin_lock(&id_priv->lock);
- id_priv->is_valid_rec = 0;
- spin_unlock(&id_priv->lock);
- }
- }
-
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
cma_exch(id_priv, RDMA_CM_DESTROYING);
@@ -3400,6 +3743,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
return 0;
}
+out:
mutex_unlock(&id_priv->handler_mutex);
return 0;
}
@@ -3410,24 +3754,22 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
unsigned char mc_map[MAX_ADDR_LEN];
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
struct sockaddr_in *sin = (struct sockaddr_in *) addr;
-#if defined(INET6)
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
-#endif
if (cma_any_addr(addr)) {
memset(mgid, 0, sizeof *mgid);
-#if defined(INET6)
} else if ((addr->sa_family == AF_INET6) &&
((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
0xFF10A01B)) {
/* IPv6 address is an SA assigned MGID. */
memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else if (addr->sa_family == AF_IB) {
+ memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid);
} else if (addr->sa_family == AF_INET6) {
ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
if (id_priv->id.ps == RDMA_PS_UDP)
mc_map[7] = 0x01; /* Use RDMA CM signature */
*mgid = *(union ib_gid *) (mc_map + 4);
-#endif
} else {
ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
if (id_priv->id.ps == RDMA_PS_UDP)
@@ -3436,39 +3778,99 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
}
}
+static void cma_query_sa_classport_info_cb(int status,
+ struct ib_class_port_info *rec,
+ void *context)
+{
+ struct class_port_info_context *cb_ctx = context;
+
+ WARN_ON(!context);
+
+ if (status || !rec) {
+ pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n",
+ cb_ctx->device->name, cb_ctx->port_num, status);
+ goto out;
+ }
+
+ memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info));
+
+out:
+ complete(&cb_ctx->done);
+}
+
+static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num,
+ struct ib_class_port_info *class_port_info)
+{
+ struct class_port_info_context *cb_ctx;
+ int ret;
+
+ cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL);
+ if (!cb_ctx)
+ return -ENOMEM;
+
+ cb_ctx->device = device;
+ cb_ctx->class_port_info = class_port_info;
+ cb_ctx->port_num = port_num;
+ init_completion(&cb_ctx->done);
+
+ ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num,
+ CMA_QUERY_CLASSPORT_INFO_TIMEOUT,
+ GFP_KERNEL, cma_query_sa_classport_info_cb,
+ cb_ctx, &cb_ctx->sa_query);
+ if (ret < 0) {
+ pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n",
+ device->name, port_num, ret);
+ goto out;
+ }
+
+ wait_for_completion(&cb_ctx->done);
+
+out:
+ kfree(cb_ctx);
+ return ret;
+}
+
static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
struct cma_multicast *mc)
{
struct ib_sa_mcmember_rec rec;
+ struct ib_class_port_info class_port_info;
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
ib_sa_comp_mask comp_mask;
- int ret = 0;
+ int ret;
- ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid);
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+ &rec.mgid, &rec);
+ if (ret)
+ return ret;
- /* cache ipoib bc record */
- spin_lock(&id_priv->lock);
- if (!id_priv->is_valid_rec)
- ret = ib_sa_get_mcmember_rec(id_priv->id.device,
- id_priv->id.port_num,
- &id_priv->rec.mgid,
- &id_priv->rec);
- if (ret) {
- id_priv->is_valid_rec = 0;
- spin_unlock(&id_priv->lock);
+ ret = cma_set_qkey(id_priv, 0);
+ if (ret)
return ret;
- } else {
- rec = id_priv->rec;
- id_priv->is_valid_rec = 1;
- }
- spin_unlock(&id_priv->lock);
cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
- if (id_priv->id.ps == RDMA_PS_UDP)
- rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+ rec.qkey = cpu_to_be32(id_priv->qkey);
rdma_addr_get_sgid(dev_addr, &rec.port_gid);
rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
- rec.join_state = 1;
+ rec.join_state = mc->join_state;
+
+ if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) {
+ ret = cma_query_sa_classport_info(id_priv->id.device,
+ id_priv->id.port_num,
+ &class_port_info);
+
+ if (ret)
+ return ret;
+
+ if (!(ib_get_cpi_capmask2(&class_port_info) &
+ IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) {
+ pr_warn("RDMA CM: %s port %u Unable to multicast join\n"
+ "RDMA CM: SM doesn't support Send Only Full Member option\n",
+ id_priv->id.device->name, id_priv->id.port_num);
+ return -EOPNOTSUPP;
+ }
+ }
comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
@@ -3487,7 +3889,7 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
id_priv->id.port_num, &rec,
comp_mask, GFP_KERNEL,
cma_ib_mc_handler, mc);
- return PTR_RET(mc->multicast.ib);
+ return PTR_ERR_OR_ZERO(mc->multicast.ib);
}
static void iboe_mcast_work_handler(struct work_struct *work)
@@ -3533,9 +3935,13 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
{
struct iboe_mcast_work *work;
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
- int err;
+ int err = 0;
struct sockaddr *addr = (struct sockaddr *)&mc->addr;
struct net_device *ndev = NULL;
+ enum ib_gid_type gid_type;
+ bool send_only;
+
+ send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
if (cma_zero_addr((struct sockaddr *)&mc->addr))
return -EINVAL;
@@ -3557,7 +3963,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
if (dev_addr->bound_dev_if)
- ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
if (!ndev) {
err = -ENODEV;
goto out2;
@@ -3565,9 +3971,24 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
mc->multicast.ib->rec.hop_limit = 1;
mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu);
+
+ gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+ if (addr->sa_family == AF_INET) {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
+ mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+ if (!send_only) {
+ mc->igmp_joined = true;
+ }
+ }
+ } else {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+ err = -ENOTSUPP;
+ }
dev_put(ndev);
- if (!mc->multicast.ib->rec.mtu) {
- err = -EINVAL;
+ if (err || !mc->multicast.ib->rec.mtu) {
+ if (!err)
+ err = -EINVAL;
goto out2;
}
rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
@@ -3588,7 +4009,7 @@ out1:
}
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
- void *context)
+ u8 join_state, void *context)
{
struct rdma_id_private *id_priv;
struct cma_multicast *mc;
@@ -3603,32 +4024,22 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
if (!mc)
return -ENOMEM;
- memcpy(&mc->addr, addr, ip_addr_size(addr));
+ memcpy(&mc->addr, addr, rdma_addr_size(addr));
mc->context = context;
mc->id_priv = id_priv;
-
+ mc->igmp_joined = false;
+ mc->join_state = join_state;
spin_lock(&id_priv->lock);
list_add(&mc->list, &id_priv->mc_list);
spin_unlock(&id_priv->lock);
- switch (rdma_node_get_transport(id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- switch (rdma_port_get_link_layer(id->device, id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ret = cma_join_ib_multicast(id_priv, mc);
- break;
- case IB_LINK_LAYER_ETHERNET:
- kref_init(&mc->mcref);
- ret = cma_iboe_join_multicast(id_priv, mc);
- break;
- default:
- ret = -EINVAL;
- }
- break;
- default:
+ if (rdma_protocol_roce(id->device, id->port_num)) {
+ kref_init(&mc->mcref);
+ ret = cma_iboe_join_multicast(id_priv, mc);
+ } else if (rdma_cap_ib_mcast(id->device, id->port_num))
+ ret = cma_join_ib_multicast(id_priv, mc);
+ else
ret = -ENOSYS;
- break;
- }
if (ret) {
spin_lock_irq(&id_priv->lock);
@@ -3648,7 +4059,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
id_priv = container_of(id, struct rdma_id_private, id);
spin_lock_irq(&id_priv->lock);
list_for_each_entry(mc, &id_priv->mc_list, list) {
- if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) {
+ if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) {
list_del(&mc->list);
spin_unlock_irq(&id_priv->lock);
@@ -3656,18 +4067,27 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
ib_detach_mcast(id->qp,
&mc->multicast.ib->rec.mgid,
be16_to_cpu(mc->multicast.ib->rec.mlid));
- if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
- switch (rdma_port_get_link_layer(id->device, id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ib_sa_free_multicast(mc->multicast.ib);
- kfree(mc);
- break;
- case IB_LINK_LAYER_ETHERNET:
- kref_put(&mc->mcref, release_mc);
- break;
- default:
- break;
+
+ BUG_ON(id_priv->cma_dev->device != id->device);
+
+ if (rdma_cap_ib_mcast(id->device, id->port_num)) {
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ } else if (rdma_protocol_roce(id->device, id->port_num)) {
+ if (mc->igmp_joined) {
+ struct rdma_dev_addr *dev_addr =
+ &id->route.addr.dev_addr;
+ struct net_device *ndev = NULL;
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(dev_addr->net,
+ dev_addr->bound_dev_if);
+ if (ndev) {
+ dev_put(ndev);
+ }
+ mc->igmp_joined = false;
}
+ kref_put(&mc->mcref, release_mc);
}
return;
}
@@ -3676,80 +4096,60 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
}
EXPORT_SYMBOL(rdma_leave_multicast);
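
Editor's note: rdma_join_multicast() now takes an explicit join_state argument, and the
join paths above treat BIT(SENDONLY_FULLMEMBER_JOIN) specially. A hedged caller sketch;
example_join() is invented for illustration, and the plain full-member value 1 mirrors
what the old code hard-coded into rec.join_state.

static int example_join(struct rdma_cm_id *id, struct sockaddr *mcaddr,
    void *context, bool send_only)
{
        /* send-only full member vs. ordinary full member (old fixed value 1) */
        u8 join_state = send_only ? BIT(SENDONLY_FULLMEMBER_JOIN) : 1;

        return rdma_join_multicast(id, mcaddr, join_state, context);
}
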
-static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+static int
+sysctl_cma_default_roce_mode(SYSCTL_HANDLER_ARGS)
{
- struct rdma_dev_addr *dev_addr;
- struct cma_ndev_work *work;
-
- dev_addr = &id_priv->id.route.addr.dev_addr;
-
- if ((dev_addr->bound_dev_if == ndev->if_index) &&
- memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) {
- printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
- ndev->if_xname, &id_priv->id);
- work = kzalloc(sizeof *work, GFP_KERNEL);
- if (!work)
- return -ENOMEM;
-
- INIT_WORK(&work->work, cma_ndev_work_handler);
- work->id = id_priv;
- work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
- atomic_inc(&id_priv->refcount);
- queue_work(cma_wq, &work->work);
+ struct cma_device *cma_dev = arg1;
+ const int port = arg2;
+ char buf[64];
+ int error;
+
+ strlcpy(buf, ib_cache_gid_type_str(
+ cma_get_default_gid_type(cma_dev, port)), sizeof(buf));
+
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ goto done;
+
+ error = ib_cache_gid_parse_type_str(buf);
+ if (error < 0) {
+ error = EINVAL;
+ goto done;
}
- return 0;
-}
-
-static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
- void *ctx)
-{
- struct net_device *ndev = (struct net_device *)ctx;
- struct cma_device *cma_dev;
- struct rdma_id_private *id_priv;
- int ret = NOTIFY_DONE;
-
-/* BONDING related, commented out until the bonding is resolved */
-#if 0
- if (dev_net(ndev) != &init_net)
- return NOTIFY_DONE;
-
- if (event != NETDEV_BONDING_FAILOVER)
- return NOTIFY_DONE;
-
- if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
- return NOTIFY_DONE;
-#endif
- if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER)
- return NOTIFY_DONE;
-
- mutex_lock(&lock);
- list_for_each_entry(cma_dev, &dev_list, list)
- list_for_each_entry(id_priv, &cma_dev->id_list, list) {
- ret = cma_netdev_change(ndev, id_priv);
- if (ret)
- goto out;
- }
-
-out:
- mutex_unlock(&lock);
- return ret;
+ cma_set_default_gid_type(cma_dev, port, error);
+ error = 0;
+done:
+ return (error);
}
-static struct notifier_block cma_nb = {
- .notifier_call = cma_netdev_callback
-};
-
static void cma_add_one(struct ib_device *device)
{
struct cma_device *cma_dev;
struct rdma_id_private *id_priv;
+ unsigned int i;
+ unsigned long supported_gids = 0;
cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
if (!cma_dev)
return;
+ sysctl_ctx_init(&cma_dev->sysctl_ctx);
+
cma_dev->device = device;
+ cma_dev->default_gid_type = kcalloc(device->phys_port_cnt,
+ sizeof(*cma_dev->default_gid_type),
+ GFP_KERNEL);
+ if (!cma_dev->default_gid_type) {
+ kfree(cma_dev);
+ return;
+ }
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ supported_gids = roce_gid_type_mask_support(device, i);
+ WARN_ON(!supported_gids);
+ cma_dev->default_gid_type[i - rdma_start_port(device)] =
+ find_first_bit(&supported_gids, BITS_PER_LONG);
+ }
init_completion(&cma_dev->comp);
atomic_set(&cma_dev->refcount, 1);
@@ -3761,6 +4161,18 @@ static void cma_add_one(struct ib_device *device)
list_for_each_entry(id_priv, &listen_any_list, list)
cma_listen_on_dev(id_priv, cma_dev);
mutex_unlock(&lock);
+
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ char buf[64];
+
+ snprintf(buf, sizeof(buf), "default_roce_mode_port%d", i);
+
+ (void) SYSCTL_ADD_PROC(&cma_dev->sysctl_ctx,
+ SYSCTL_CHILDREN(device->ports_parent->parent->oidp),
+ OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ cma_dev, i, &sysctl_cma_default_roce_mode, "A",
+ "Default RoCE mode. Valid values: IB/RoCE v1 and RoCE v2");
+ }
}
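
Editor's note: cma_add_one() now publishes one default_roce_mode_port<N> string sysctl per
port, backed by sysctl_cma_default_roce_mode() above. A userspace sketch of reading and
switching the mode; the OID path and device name below are purely illustrative (the real
parent OID comes from the device's sysctl tree), only sysctlbyname() and the value strings
"IB/RoCE v1" / "RoCE v2" are taken from the patch.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        char buf[64];
        size_t len = sizeof(buf);
        /* hypothetical OID; the real prefix depends on the device sysctl tree */
        const char *oid = "sys.class.infiniband.mlx5_0.default_roce_mode_port1";

        if (sysctlbyname(oid, buf, &len, NULL, 0) == 0)
                printf("current default RoCE mode: %s\n", buf);

        /* switch the port default to RoCE v2 */
        if (sysctlbyname(oid, NULL, NULL, "RoCE v2", strlen("RoCE v2") + 1) != 0)
                perror("sysctlbyname");
        return (0);
}
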
static int cma_remove_id_dev(struct rdma_id_private *id_priv)
@@ -3817,11 +4229,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
wait_for_completion(&cma_dev->comp);
}
-static void cma_remove_one(struct ib_device *device)
+static void cma_remove_one(struct ib_device *device, void *client_data)
{
- struct cma_device *cma_dev;
+ struct cma_device *cma_dev = client_data;
- cma_dev = ib_get_client_data(device, &cma_client);
if (!cma_dev)
return;
@@ -3830,56 +4241,66 @@ static void cma_remove_one(struct ib_device *device)
mutex_unlock(&lock);
cma_process_remove(cma_dev);
+ sysctl_ctx_free(&cma_dev->sysctl_ctx);
+ kfree(cma_dev->default_gid_type);
kfree(cma_dev);
}
+static void cma_init_vnet(void *arg)
+{
+ struct cma_pernet *pernet = &VNET(cma_pernet);
+
+ idr_init(&pernet->tcp_ps);
+ idr_init(&pernet->udp_ps);
+ idr_init(&pernet->ipoib_ps);
+ idr_init(&pernet->ib_ps);
+}
+VNET_SYSINIT(cma_init_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_FIRST, cma_init_vnet, NULL);
+
+static void cma_destroy_vnet(void *arg)
+{
+ struct cma_pernet *pernet = &VNET(cma_pernet);
+
+ idr_destroy(&pernet->tcp_ps);
+ idr_destroy(&pernet->udp_ps);
+ idr_destroy(&pernet->ipoib_ps);
+ idr_destroy(&pernet->ib_ps);
+}
+VNET_SYSUNINIT(cma_destroy_vnet, SI_SUB_OFED_MODINIT - 1, SI_ORDER_SECOND, cma_destroy_vnet, NULL);
+
static int __init cma_init(void)
{
- int ret = -ENOMEM;
+ int ret;
- cma_wq = create_singlethread_workqueue("rdma_cm");
+ cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM);
if (!cma_wq)
return -ENOMEM;
- cma_free_wq = create_singlethread_workqueue("rdma_cm_fr");
- if (!cma_free_wq)
- goto err1;
-
ib_sa_register_client(&sa_client);
rdma_addr_register_client(&addr_client);
- register_netdevice_notifier(&cma_nb);
ret = ib_register_client(&cma_client);
if (ret)
goto err;
+ cma_configfs_init();
+
return 0;
err:
- unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
-
- destroy_workqueue(cma_free_wq);
-err1:
destroy_workqueue(cma_wq);
return ret;
}
static void __exit cma_cleanup(void)
{
+ cma_configfs_exit();
ib_unregister_client(&cma_client);
- unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
- flush_workqueue(cma_free_wq);
- destroy_workqueue(cma_free_wq);
destroy_workqueue(cma_wq);
- idr_destroy(&sdp_ps);
- idr_destroy(&tcp_ps);
- idr_destroy(&udp_ps);
- idr_destroy(&ipoib_ps);
- idr_destroy(&ib_ps);
}
module_init(cma_init);
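
Editor's note: the idr port spaces (tcp_ps, udp_ps, ipoib_ps, ib_ps) are now per-VNET and
initialized by cma_init_vnet() above. A sketch of how a lookup would select the right idr;
cma_pernet_ps() is a name invented for this example, under the assumption that cma.c keeps
a similar accessor elsewhere (not shown in this hunk).

static struct idr *cma_pernet_ps(enum rdma_port_space ps)
{
        struct cma_pernet *pernet = &VNET(cma_pernet);

        switch (ps) {
        case RDMA_PS_TCP:
                return &pernet->tcp_ps;
        case RDMA_PS_UDP:
                return &pernet->udp_ps;
        case RDMA_PS_IPOIB:
                return &pernet->ipoib_ps;
        case RDMA_PS_IB:
                return &pernet->ib_ps;
        default:
                return NULL;
        }
}
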
diff --git a/sys/ofed/drivers/infiniband/core/core_priv.h b/sys/ofed/drivers/infiniband/core/core_priv.h
index 001bbbe357a71..54c54f196b643 100644
--- a/sys/ofed/drivers/infiniband/core/core_priv.h
+++ b/sys/ofed/drivers/infiniband/core/core_priv.h
@@ -38,15 +38,111 @@
#include <rdma/ib_verbs.h>
+#include <net/if_vlan_var.h>
+
+#ifdef CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#else
+static inline int cma_configfs_init(void)
+{
+ return 0;
+}
+
+static inline void cma_configfs_exit(void)
+{
+}
+#endif
+struct cma_device;
+void cma_ref_dev(struct cma_device *cma_dev);
+void cma_deref_dev(struct cma_device *cma_dev);
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie);
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port);
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+ unsigned int port,
+ enum ib_gid_type default_gid_type);
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev);
+
int ib_device_register_sysfs(struct ib_device *device,
int (*port_callback)(struct ib_device *,
- u8, struct kobject *));
+ u8, struct kobject *));
void ib_device_unregister_sysfs(struct ib_device *device);
-int ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
-int ib_cache_setup(void);
+void ib_cache_setup(void);
void ib_cache_cleanup(void);
+int ib_resolve_eth_dmac(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+
+enum ib_cache_gid_default_mode {
+ IB_CACHE_GID_DEFAULT_MODE_SET,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+int ib_cache_gid_parse_type_str(const char *buf);
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type);
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ unsigned long gid_type_mask,
+ enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
+static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+
+ /* TODO: add support for LAGG */
+ upper = VLAN_TRUNKDEV(upper);
+
+ return (dev == upper);
+}
+
+int addr_init(void);
+void addr_cleanup(void);
+
+int ib_mad_init(void);
+void ib_mad_cleanup(void);
+
+int ib_sa_init(void);
+void ib_sa_cleanup(void);
+
#endif /* _CORE_PRIV_H */
diff --git a/sys/ofed/drivers/infiniband/core/device.c b/sys/ofed/drivers/infiniband/core/device.c
index a3aee411e39d8..d0aea0fe425eb 100644
--- a/sys/ofed/drivers/infiniband/core/device.c
+++ b/sys/ofed/drivers/infiniband/core/device.c
@@ -37,6 +37,9 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include "core_priv.h"
@@ -48,22 +51,35 @@ struct ib_client_data {
struct list_head list;
struct ib_client *client;
void * data;
+ /* The device or client is going down. Do not call client or device
+ * callbacks other than remove(). */
+ bool going_down;
};
+struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
+/* The device_list and client_list contain devices and clients after their
+ * registration has completed, and the devices and clients are removed
+ * during unregistration. */
static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
/*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list. In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list. device_mutex protects writer access by device and client
+ * registration / de-registration. lists_rwsem protects reader access to
+ * these lists. Iterators of these lists must lock it for read, while updates
+ * to the lists must be done with a write lock. A special case is when the
+ * device_mutex is locked. In this case locking the lists for read access is
+ * not necessary as the device_mutex implies it.
+ *
+ * lists_rwsem also protects access to the client data list.
*/
static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
+
static int ib_device_check_mandatory(struct ib_device *device)
{
@@ -90,14 +106,15 @@ static int ib_device_check_mandatory(struct ib_device *device)
IB_MANDATORY_FUNC(poll_cq),
IB_MANDATORY_FUNC(req_notify_cq),
IB_MANDATORY_FUNC(get_dma_mr),
- IB_MANDATORY_FUNC(dereg_mr)
+ IB_MANDATORY_FUNC(dereg_mr),
+ IB_MANDATORY_FUNC(get_port_immutable)
};
int i;
for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
- if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
- printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
- device->name, mandatory_table[i].name);
+ if (!*(void **) ((char *) device + mandatory_table[i].offset)) {
+ pr_warn("Device %s is missing mandatory function %s\n",
+ device->name, mandatory_table[i].name);
return -EINVAL;
}
}
@@ -149,18 +166,20 @@ static int alloc_name(char *name)
return 0;
}
-static int start_port(struct ib_device *device)
+static void ib_device_release(struct device *device)
{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
-}
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-static int end_port(struct ib_device *device)
-{
- return (device->node_type == RDMA_NODE_IB_SWITCH) ?
- 0 : device->phys_port_cnt;
+ ib_cache_release_one(dev);
+ kfree(dev->port_immutable);
+ kfree(dev);
}
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+};
+
/**
* ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
@@ -173,14 +192,28 @@ static int end_port(struct ib_device *device)
*/
struct ib_device *ib_alloc_device(size_t size)
{
- struct ib_device *dev;
+ struct ib_device *device;
+
+ if (WARN_ON(size < sizeof(struct ib_device)))
+ return NULL;
- BUG_ON(size < sizeof (struct ib_device));
+ device = kzalloc(size, GFP_KERNEL);
+ if (!device)
+ return NULL;
- dev = kzalloc(size, GFP_KERNEL);
- spin_lock_init(&dev->cmd_perf_lock);
+ device->dev.parent = &linux_root_device;
+ device->dev.class = &ib_class;
+ device_initialize(&device->dev);
- return dev;
+ dev_set_drvdata(&device->dev, device);
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+ INIT_LIST_HEAD(&device->client_data_list);
+ INIT_LIST_HEAD(&device->port_list);
+
+ return device;
}
EXPORT_SYMBOL(ib_alloc_device);
@@ -192,13 +225,8 @@ EXPORT_SYMBOL(ib_alloc_device);
*/
void ib_dealloc_device(struct ib_device *device)
{
- if (device->reg_state == IB_DEV_UNINITIALIZED) {
- kfree(device);
- return;
- }
-
- BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
-
+ WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+ device->reg_state != IB_DEV_UNINITIALIZED);
kobject_put(&device->dev.kobj);
}
EXPORT_SYMBOL(ib_dealloc_device);
@@ -210,59 +238,70 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
context = kmalloc(sizeof *context, GFP_KERNEL);
if (!context) {
- printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
- device->name, client->name);
+ pr_warn("Couldn't allocate client context for %s/%s\n",
+ device->name, client->name);
return -ENOMEM;
}
context->client = client;
context->data = NULL;
+ context->going_down = false;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_add(&context->list, &device->client_data_list);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
return 0;
}
-static int read_port_table_lengths(struct ib_device *device)
+static int verify_immutable(const struct ib_device *dev, u8 port)
{
- struct ib_port_attr *tprops = NULL;
- int num_ports, ret = -ENOMEM;
- u8 port_index;
-
- tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
- if (!tprops)
- goto out;
-
- num_ports = end_port(device) - start_port(device) + 1;
+ return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
+ rdma_max_mad_size(dev, port) != 0);
+}
- device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
- GFP_KERNEL);
- device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
- GFP_KERNEL);
- if (!device->pkey_tbl_len || !device->gid_tbl_len)
- goto err;
+static int read_port_immutable(struct ib_device *device)
+{
+ int ret;
+ u8 start_port = rdma_start_port(device);
+ u8 end_port = rdma_end_port(device);
+ u8 port;
+
+ /**
+ * device->port_immutable is indexed directly by the port number to make
+ * access to this data as efficient as possible.
+ *
+ * Therefore port_immutable is declared as a 1 based array with
+ * potential empty slots at the beginning.
+ */
+ device->port_immutable = kzalloc(sizeof(*device->port_immutable)
+ * (end_port + 1),
+ GFP_KERNEL);
+ if (!device->port_immutable)
+ return -ENOMEM;
- for (port_index = 0; port_index < num_ports; ++port_index) {
- ret = ib_query_port(device, port_index + start_port(device),
- tprops);
+ for (port = start_port; port <= end_port; ++port) {
+ ret = device->get_port_immutable(device, port,
+ &device->port_immutable[port]);
if (ret)
- goto err;
- device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
- device->gid_tbl_len[port_index] = tprops->gid_tbl_len;
- }
+ return ret;
- ret = 0;
- goto out;
+ if (verify_immutable(device, port))
+ return -EINVAL;
+ }
+ return 0;
+}
-err:
- kfree(device->gid_tbl_len);
- kfree(device->pkey_tbl_len);
-out:
- kfree(tprops);
- return ret;
+void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len)
+{
+ if (dev->get_dev_fw_str)
+ dev->get_dev_fw_str(dev, str, str_len);
+ else
+ str[0] = '\0';
}
+EXPORT_SYMBOL(ib_get_device_fw_str);
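
Editor's note: get_port_immutable() is now a mandatory device callback and
read_port_immutable() above caches its output in the 1-based port_immutable array. A
hedged sketch of what a driver-side callback conventionally looks like; the names beyond
gid_tbl_len/pkey_tbl_len (core_cap_flags, max_mad_size, RDMA_CORE_PORT_IBA_IB,
IB_MGMT_MAD_SIZE) are assumptions taken from the upstream ib_verbs/ib_mad headers, not
from this hunk.

static int example_get_port_immutable(struct ib_device *ibdev, u8 port_num,
    struct ib_port_immutable *immutable)
{
        struct ib_port_attr attr;
        int err;

        err = ib_query_port(ibdev, port_num, &attr);
        if (err)
                return err;

        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
        immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;      /* plain IB port */
        immutable->max_mad_size = IB_MGMT_MAD_SIZE;

        return 0;
}
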
/**
* ib_register_device - Register an IB device with IB core
@@ -278,6 +317,8 @@ int ib_register_device(struct ib_device *device,
u8, struct kobject *))
{
int ret;
+ struct ib_client *client;
+ struct ib_udata uhw = {.outlen = 0, .inlen = 0};
mutex_lock(&device_mutex);
@@ -292,40 +333,45 @@ int ib_register_device(struct ib_device *device,
goto out;
}
- INIT_LIST_HEAD(&device->event_handler_list);
- INIT_LIST_HEAD(&device->client_data_list);
- spin_lock_init(&device->event_handler_lock);
- spin_lock_init(&device->client_data_lock);
+ ret = read_port_immutable(device);
+ if (ret) {
+ pr_warn("Couldn't create per port immutable data %s\n",
+ device->name);
+ goto out;
+ }
- ret = read_port_table_lengths(device);
+ ret = ib_cache_setup_one(device);
if (ret) {
- printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
- device->name);
+ pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n");
goto out;
}
- ret = ib_device_register_sysfs(device, port_callback);
+ memset(&device->attrs, 0, sizeof(device->attrs));
+ ret = device->query_device(device, &device->attrs, &uhw);
if (ret) {
- printk(KERN_WARNING "Couldn't register device %s with driver model\n",
- device->name);
- kfree(device->gid_tbl_len);
- kfree(device->pkey_tbl_len);
+ pr_warn("Couldn't query the device attributes\n");
+ ib_cache_cleanup_one(device);
goto out;
}
- list_add_tail(&device->core_list, &device_list);
+ ret = ib_device_register_sysfs(device, port_callback);
+ if (ret) {
+ pr_warn("Couldn't register device %s with driver model\n",
+ device->name);
+ ib_cache_cleanup_one(device);
+ goto out;
+ }
device->reg_state = IB_DEV_REGISTERED;
- {
- struct ib_client *client;
-
- list_for_each_entry(client, &client_list, list)
- if (client->add && !add_client_context(device, client))
- client->add(device);
- }
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
- out:
+ down_write(&lists_rwsem);
+ list_add_tail(&device->core_list, &device_list);
+ up_write(&lists_rwsem);
+out:
mutex_unlock(&device_mutex);
return ret;
}
@@ -339,29 +385,37 @@ EXPORT_SYMBOL(ib_register_device);
*/
void ib_unregister_device(struct ib_device *device)
{
- struct ib_client *client;
struct ib_client_data *context, *tmp;
unsigned long flags;
mutex_lock(&device_mutex);
- list_for_each_entry_reverse(client, &client_list, list)
- if (client->remove)
- client->remove(device);
-
+ down_write(&lists_rwsem);
list_del(&device->core_list);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ context->going_down = true;
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ downgrade_write(&lists_rwsem);
- kfree(device->gid_tbl_len);
- kfree(device->pkey_tbl_len);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list,
+ list) {
+ if (context->client->remove)
+ context->client->remove(device, context->data);
+ }
+ up_read(&lists_rwsem);
mutex_unlock(&device_mutex);
ib_device_unregister_sysfs(device);
+ ib_cache_cleanup_one(device);
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
kfree(context);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
device->reg_state = IB_DEV_UNREGISTERED;
}
@@ -386,11 +440,14 @@ int ib_register_client(struct ib_client *client)
mutex_lock(&device_mutex);
- list_add_tail(&client->list, &client_list);
list_for_each_entry(device, &device_list, core_list)
if (client->add && !add_client_context(device, client))
client->add(device);
+ down_write(&lists_rwsem);
+ list_add_tail(&client->list, &client_list);
+ up_write(&lists_rwsem);
+
mutex_unlock(&device_mutex);
return 0;
@@ -413,19 +470,41 @@ void ib_unregister_client(struct ib_client *client)
mutex_lock(&device_mutex);
+ down_write(&lists_rwsem);
+ list_del(&client->list);
+ up_write(&lists_rwsem);
+
list_for_each_entry(device, &device_list, core_list) {
- if (client->remove)
- client->remove(device);
+ struct ib_client_data *found_context = NULL;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
if (context->client == client) {
- list_del(&context->list);
- kfree(context);
+ context->going_down = true;
+ found_context = context;
+ break;
}
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+
+ if (client->remove)
+ client->remove(device, found_context ?
+ found_context->data : NULL);
+
+ if (!found_context) {
+ pr_warn("No client context found for %s/%s\n",
+ device->name, client->name);
+ continue;
+ }
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_del(&found_context->list);
+ kfree(found_context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
}
- list_del(&client->list);
mutex_unlock(&device_mutex);
}
@@ -479,8 +558,8 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client,
goto out;
}
- printk(KERN_WARNING "No client context found for %s/%s\n",
- device->name, client->name);
+ pr_warn("No client context found for %s/%s\n",
+ device->name, client->name);
out:
spin_unlock_irqrestore(&device->client_data_lock, flags);
@@ -551,21 +630,6 @@ void ib_dispatch_event(struct ib_event *event)
EXPORT_SYMBOL(ib_dispatch_event);
/**
- * ib_query_device - Query IB device attributes
- * @device:Device to query
- * @device_attr:Device attributes
- *
- * ib_query_device() returns the attributes of a device through the
- * @device_attr pointer.
- */
-int ib_query_device(struct ib_device *device,
- struct ib_device_attr *device_attr)
-{
- return device->query_device(device, device_attr);
-}
-EXPORT_SYMBOL(ib_query_device);
-
-/**
* ib_query_port - Query IB port attributes
* @device:Device to query
* @port_num:Port number to query
@@ -578,10 +642,26 @@ int ib_query_port(struct ib_device *device,
u8 port_num,
struct ib_port_attr *port_attr)
{
- if (port_num < start_port(device) || port_num > end_port(device))
+ union ib_gid gid;
+ int err;
+
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
- return device->query_port(device, port_num, port_attr);
+ memset(port_attr, 0, sizeof(*port_attr));
+ err = device->query_port(device, port_num, port_attr);
+ if (err || port_attr->subnet_prefix)
+ return err;
+
+ if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
+ return 0;
+
+ err = ib_query_gid(device, port_num, 0, &gid, NULL);
+ if (err)
+ return err;
+
+ port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
+ return 0;
}
EXPORT_SYMBOL(ib_query_port);
@@ -591,17 +671,91 @@ EXPORT_SYMBOL(ib_query_port);
* @port_num:Port number to query
* @index:GID table index to query
* @gid:Returned GID
+ * @attr: Returned GID attributes related to this GID index (only in RoCE).
+ * NULL means ignore.
*
* ib_query_gid() fetches the specified GID table entry.
*/
int ib_query_gid(struct ib_device *device,
- u8 port_num, int index, union ib_gid *gid)
+ u8 port_num, int index, union ib_gid *gid,
+ struct ib_gid_attr *attr)
{
+ if (rdma_cap_roce_gid_table(device, port_num))
+ return ib_get_cached_gid(device, port_num, index, gid, attr);
+
+ if (attr)
+ return -EINVAL;
+
return device->query_gid(device, port_num, index, gid);
}
EXPORT_SYMBOL(ib_query_gid);
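
Editor's note: ib_query_gid() gained an attr out-parameter that is only meaningful on
ports backed by a RoCE GID table; passing a non-NULL attr on other ports now returns
-EINVAL. A small caller sketch; example_query_gid0() is invented for illustration.

static int example_query_gid0(struct ib_device *device, u8 port,
    union ib_gid *gid, struct ib_gid_attr *attr)
{
        /* only ask for GID attributes where a RoCE GID table exists */
        if (!rdma_cap_roce_gid_table(device, port))
                attr = NULL;

        return ib_query_gid(device, port, 0, gid, attr);
}
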
/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each matching RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates the physical RoCE ports of ib_dev that are associated
+ * with a netdevice and calls cb() on each port for which filter()
+ * returns non-zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ u8 port;
+
+ for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+ port++)
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct net_device *idev = NULL;
+
+ if (ib_dev->get_netdev)
+ idev = ib_dev->get_netdev(ib_dev, port);
+
+ if (idev && (idev->if_flags & IFF_DYING)) {
+ dev_put(idev);
+ idev = NULL;
+ }
+
+ if (filter(ib_dev, port, idev, filter_cookie))
+ cb(ib_dev, port, idev, cookie);
+
+ if (idev)
+ dev_put(idev);
+ }
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each matching RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates the physical RoCE ports of all registered devices that
+ * are associated with a netdevice and calls cb() on each port for
+ * which filter() returns non-zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ struct ib_device *dev;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list)
+ ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+ up_read(&lists_rwsem);
+}
+
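
Editor's note: both enumerators take a filter/callback pair matching the
roce_netdev_filter and roce_netdev_callback typedefs added to core_priv.h. A sketch of a
matching pair and how it would be driven; all names below are invented for the example.

static int example_filter(struct ib_device *device, u8 port,
    struct net_device *idev, void *cookie)
{
        /* visit only ports currently bound to the netdev given as cookie */
        return (idev != NULL && idev == cookie);
}

static void example_cb(struct ib_device *device, u8 port,
    struct net_device *idev, void *cookie)
{
        pr_info("%s: port %u is backed by the requested netdev\n",
            device->name, port);
}

/* e.g. ib_enum_all_roce_netdevs(example_filter, ndev, example_cb, NULL); */
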
+/**
* ib_query_pkey - Get P_Key table entry
* @device:Device to query
* @port_num:Port number to query
@@ -656,7 +810,7 @@ int ib_modify_port(struct ib_device *device,
if (!device->modify_port)
return -ENOSYS;
- if (port_num < start_port(device) || port_num > end_port(device))
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
return -EINVAL;
return device->modify_port(device, port_num, port_modify_mask,
@@ -669,19 +823,33 @@ EXPORT_SYMBOL(ib_modify_port);
* a specified GID value occurs.
* @device: The device to query.
* @gid: The GID value to search for.
+ * @gid_type: Type of GID.
+ * @ndev: The ndev related to the GID to search for.
* @port_num: The port number of the device where the GID value was found.
* @index: The index into the GID table where the GID was found. This
* parameter may be NULL.
*/
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ enum ib_gid_type gid_type, struct net_device *ndev,
u8 *port_num, u16 *index)
{
union ib_gid tmp_gid;
int ret, port, i;
- for (port = start_port(device); port <= end_port(device); ++port) {
- for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
- ret = ib_query_gid(device, port, i, &tmp_gid);
+ for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+ if (rdma_cap_roce_gid_table(device, port)) {
+ if (!ib_find_cached_gid_by_port(device, gid, gid_type, port,
+ ndev, index)) {
+ *port_num = port;
+ return 0;
+ }
+ }
+
+ if (gid_type != IB_GID_TYPE_IB)
+ continue;
+
+ for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
+ ret = ib_query_gid(device, port, i, &tmp_gid, NULL);
if (ret)
return ret;
if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
@@ -712,7 +880,7 @@ int ib_find_pkey(struct ib_device *device,
u16 tmp_pkey;
int partial_ix = -1;
- for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+ for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
@@ -736,31 +904,103 @@ int ib_find_pkey(struct ib_device *device,
}
EXPORT_SYMBOL(ib_find_pkey);
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev: An RDMA device on which the request has been received.
+ * @port: Port number on the RDMA device.
+ * @pkey: The Pkey the request came on.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: Contains the IP address that the request specified as its
+ * destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr)
+{
+ struct net_device *net_dev = NULL;
+ struct ib_client_data *context;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ down_read(&lists_rwsem);
+
+ list_for_each_entry(context, &dev->client_data_list, list) {
+ struct ib_client *client = context->client;
+
+ if (context->going_down)
+ continue;
+
+ if (client->get_net_dev_by_params) {
+ net_dev = client->get_net_dev_by_params(dev, port, pkey,
+ gid, addr,
+ context->data);
+ if (net_dev)
+ break;
+ }
+ }
+
+ up_read(&lists_rwsem);
+
+ return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
static int __init ib_core_init(void)
{
int ret;
- ib_wq = create_workqueue("infiniband");
+ ib_wq = alloc_workqueue("infiniband", 0, 0);
if (!ib_wq)
return -ENOMEM;
- ret = ib_sysfs_setup();
- if (ret) {
- printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+ ib_comp_wq = alloc_workqueue("ib-comp-wq",
+ WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+ mp_ncpus * 4 /* WQ_UNBOUND_MAX_ACTIVE */);
+ if (!ib_comp_wq) {
+ ret = -ENOMEM;
goto err;
}
- ret = ib_cache_setup();
+ ret = class_register(&ib_class);
+ if (ret) {
+ pr_warn("Couldn't create InfiniBand device class\n");
+ goto err_comp;
+ }
+
+ ret = addr_init();
if (ret) {
- printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ pr_warn("Could't init IB address resolution\n");
goto err_sysfs;
}
+ ret = ib_mad_init();
+ if (ret) {
+ pr_warn("Couldn't init IB MAD\n");
+ goto err_addr;
+ }
+
+ ret = ib_sa_init();
+ if (ret) {
+ pr_warn("Couldn't init SA\n");
+ goto err_mad;
+ }
+
+ ib_cache_setup();
+
return 0;
+err_mad:
+ ib_mad_cleanup();
+err_addr:
+ addr_cleanup();
err_sysfs:
- ib_sysfs_cleanup();
-
+ class_unregister(&ib_class);
+err_comp:
+ destroy_workqueue(ib_comp_wq);
err:
destroy_workqueue(ib_wq);
return ret;
@@ -769,7 +1009,11 @@ err:
static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
- ib_sysfs_cleanup();
+ ib_sa_cleanup();
+ ib_mad_cleanup();
+ addr_cleanup();
+ class_unregister(&ib_class);
+ destroy_workqueue(ib_comp_wq);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
}
@@ -777,17 +1021,5 @@ static void __exit ib_core_cleanup(void)
module_init(ib_core_init);
module_exit(ib_core_cleanup);
-static int
-ibcore_evhand(module_t mod, int event, void *arg)
-{
- return (0);
-}
-
-static moduledata_t ibcore_mod = {
- .name = "ibcore",
- .evhand = ibcore_evhand,
-};
-
MODULE_VERSION(ibcore, 1);
MODULE_DEPEND(ibcore, linuxkpi, 1, 1, 1);
-DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_LAST, SI_ORDER_ANY);
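
Editor's note: ib_client remove() callbacks now receive the per-device client data
directly (compare cma_remove_one() earlier in this commit), and client contexts carry a
going_down flag protected by lists_rwsem. A hedged sketch of a client using the new
signature; the client and its data structure are fictional.

struct example_data {
        int dummy;
};

static struct ib_client example_client;

static void example_add_one(struct ib_device *device)
{
        struct example_data *priv;

        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return;
        ib_set_client_data(device, &example_client, priv);
}

static void example_remove_one(struct ib_device *device, void *client_data)
{
        /* client_data is the pointer example_add_one() stored */
        kfree(client_data);
}

static struct ib_client example_client = {
        .name   = "example",
        .add    = example_add_one,
        .remove = example_remove_one,
};

/* registered and unregistered with ib_register_client()/ib_unregister_client() */
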
diff --git a/sys/ofed/drivers/infiniband/core/fmr_pool.c b/sys/ofed/drivers/infiniband/core/fmr_pool.c
index c73196aa876a9..eda5112147f4b 100644
--- a/sys/ofed/drivers/infiniband/core/fmr_pool.c
+++ b/sys/ofed/drivers/infiniband/core/fmr_pool.c
@@ -33,7 +33,6 @@
#include <linux/errno.h>
#include <linux/spinlock.h>
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <linux/kthread.h>
@@ -150,8 +149,8 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
#ifdef DEBUG
if (fmr->ref_count !=0) {
- printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
- fmr, fmr->ref_count);
+ pr_warn(PFX "Unmapping FMR 0x%08x with ref count %d\n",
+ fmr, fmr->ref_count);
}
#endif
}
@@ -167,7 +166,7 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
ret = ib_unmap_fmr(&fmr_list);
if (ret)
- printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+ pr_warn(PFX "ib_unmap_fmr returned %d\n", ret);
spin_lock_irq(&pool->pool_lock);
list_splice(&unmap_list, &pool->free_list);
@@ -212,7 +211,6 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
{
struct ib_device *device;
struct ib_fmr_pool *pool;
- struct ib_device_attr *attr;
int i;
int ret;
int max_remaps;
@@ -223,39 +221,20 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
device = pd->device;
if (!device->alloc_fmr || !device->dealloc_fmr ||
!device->map_phys_fmr || !device->unmap_fmr) {
- printk(KERN_INFO PFX "Device %s does not support FMRs\n",
- device->name);
+ pr_info(PFX "Device %s does not support FMRs\n", device->name);
return ERR_PTR(-ENOSYS);
}
- attr = kmalloc(sizeof *attr, GFP_KERNEL);
- if (!attr) {
- printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
- return ERR_PTR(-ENOMEM);
- }
-
- ret = ib_query_device(device, attr);
- if (ret) {
- printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
- kfree(attr);
- return ERR_PTR(ret);
- }
-
- if (!attr->max_map_per_fmr)
+ if (!device->attrs.max_map_per_fmr)
max_remaps = IB_FMR_MAX_REMAPS;
else
- max_remaps = attr->max_map_per_fmr;
-
- kfree(attr);
+ max_remaps = device->attrs.max_map_per_fmr;
pool = kmalloc(sizeof *pool, GFP_KERNEL);
- if (!pool) {
- printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+ if (!pool)
return ERR_PTR(-ENOMEM);
- }
pool->cache_bucket = NULL;
-
pool->flush_function = params->flush_function;
pool->flush_arg = params->flush_arg;
@@ -267,7 +246,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
GFP_KERNEL);
if (!pool->cache_bucket) {
- printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+ pr_warn(PFX "Failed to allocate cache in pool\n");
ret = -ENOMEM;
goto out_free_pool;
}
@@ -291,7 +270,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
"ib_fmr(%s)",
device->name);
if (IS_ERR(pool->thread)) {
- printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+ pr_warn(PFX "couldn't start cleanup thread\n");
ret = PTR_ERR(pool->thread);
goto out_free_pool;
}
@@ -310,11 +289,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
for (i = 0; i < params->pool_size; ++i) {
fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
- if (!fmr) {
- printk(KERN_WARNING PFX "failed to allocate fmr "
- "struct for FMR %d\n", i);
+ if (!fmr)
goto out_fail;
- }
fmr->pool = pool;
fmr->remap_count = 0;
@@ -323,8 +299,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
if (IS_ERR(fmr->fmr)) {
- printk(KERN_WARNING PFX "fmr_create failed "
- "for FMR %d\n", i);
+ pr_warn(PFX "fmr_create failed for FMR %d\n",
+ i);
kfree(fmr);
goto out_fail;
}
@@ -379,8 +355,8 @@ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
}
if (i < pool->pool_size)
- printk(KERN_WARNING PFX "pool still has %d regions registered\n",
- pool->pool_size - i);
+ pr_warn(PFX "pool still has %d regions registered\n",
+ pool->pool_size - i);
kfree(pool->cache_bucket);
kfree(pool);
@@ -479,7 +455,7 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
list_add(&fmr->list, &pool->free_list);
spin_unlock_irqrestore(&pool->pool_lock, flags);
- printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+ pr_warn(PFX "fmr_map returns %d\n", result);
return ERR_PTR(result);
}
@@ -533,8 +509,8 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
#ifdef DEBUG
if (fmr->ref_count < 0)
- printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
- fmr, fmr->ref_count);
+ pr_warn(PFX "FMR %p has ref count %d < 0\n",
+ fmr, fmr->ref_count);
#endif
spin_unlock_irqrestore(&pool->pool_lock, flags);
diff --git a/sys/ofed/drivers/infiniband/core/ib_addr.c b/sys/ofed/drivers/infiniband/core/ib_addr.c
new file mode 100644
index 0000000000000..18b0712f5d11c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_addr.c
@@ -0,0 +1,760 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+#include <netinet/if_ether.h>
+#include <netinet/ip_var.h>
+#include <netinet6/scope6_var.h>
+#include <netinet6/in6_pcb.h>
+
+#include "core_priv.h"
+
+struct addr_req {
+ struct list_head list;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr *addr;
+ struct rdma_addr_client *client;
+ void *context;
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context);
+ unsigned long timeout;
+ int status;
+};
+
+static void process_req(struct work_struct *work);
+
+static DEFINE_MUTEX(lock);
+static LIST_HEAD(req_list);
+static DECLARE_DELAYED_WORK(work, process_req);
+static struct workqueue_struct *addr_wq;
+
+int rdma_addr_size(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return sizeof(struct sockaddr_in);
+ case AF_INET6:
+ return sizeof(struct sockaddr_in6);
+ case AF_IB:
+ return sizeof(struct sockaddr_ib);
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(rdma_addr_size);
+
+static struct rdma_addr_client self;
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+ atomic_set(&client->refcount, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_register_client);
+
+static inline void put_client(struct rdma_addr_client *client)
+{
+ if (atomic_dec_and_test(&client->refcount))
+ complete(&client->comp);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+ put_client(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_unregister_client);
+
+static inline void
+rdma_copy_addr_sub(u8 *dst, const u8 *src, unsigned min, unsigned max)
+{
+ if (min > max)
+ min = max;
+ memcpy(dst, src, min);
+ memset(dst + min, 0, max - min);
+}
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+ const unsigned char *dst_dev_addr)
+{
+ if (dev->if_type == IFT_INFINIBAND)
+ dev_addr->dev_type = ARPHRD_INFINIBAND;
+ else if (dev->if_type == IFT_ETHER)
+ dev_addr->dev_type = ARPHRD_ETHER;
+ else
+ dev_addr->dev_type = 0;
+ rdma_copy_addr_sub(dev_addr->src_dev_addr, IF_LLADDR(dev),
+ dev->if_addrlen, MAX_ADDR_LEN);
+ rdma_copy_addr_sub(dev_addr->broadcast, dev->if_broadcastaddr,
+ dev->if_addrlen, MAX_ADDR_LEN);
+ if (dst_dev_addr != NULL) {
+ rdma_copy_addr_sub(dev_addr->dst_dev_addr, dst_dev_addr,
+ dev->if_addrlen, MAX_ADDR_LEN);
+ }
+ dev_addr->bound_dev_if = dev->if_index;
+ return 0;
+}
+EXPORT_SYMBOL(rdma_copy_addr);
+
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr,
+ u16 *vlan_id)
+{
+ struct net_device *dev = NULL;
+ int ret = -EADDRNOTAVAIL;
+
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ return ret;
+ }
+
+ switch (addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ dev = ip_dev_find(dev_addr->net,
+ ((const struct sockaddr_in *)addr)->sin_addr.s_addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6: {
+ struct in6_addr in6_addr = ((const struct sockaddr_in6 *)addr)->sin6_addr;
+
+ /* embed scope ID */
+ in6_addr.s6_addr[3] = ((const struct sockaddr_in6 *)addr)->sin6_scope_id;
+
+ dev = ip6_dev_find(dev_addr->net, in6_addr);
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+
+ if (dev != NULL) {
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ dev_put(dev);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
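
Editor's note: rdma_translate_ip() resolves the bound interface either by index or by
source address and copies its link-level addresses into the rdma_dev_addr. A caller
sketch follows; it is illustrative only, assumes dev_addr->net was already set to the
caller's vnet, and uses the loopback address purely as an example.

static int example_translate_loopback(struct rdma_dev_addr *dev_addr)
{
        struct sockaddr_in sin;
        u16 vlan_id;

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_len = sizeof(sin);
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

        return rdma_translate_ip((struct sockaddr *)&sin, dev_addr, &vlan_id);
}
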
+
+static void set_timeout(unsigned long time)
+{
+ int delay; /* under FreeBSD ticks are 32-bit */
+
+ delay = time - jiffies;
+ if (delay <= 0)
+ delay = 1;
+
+ mod_delayed_work(addr_wq, &work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+ struct addr_req *temp_req;
+
+ mutex_lock(&lock);
+ list_for_each_entry_reverse(temp_req, &req_list, list) {
+ if (time_after_eq(req->timeout, temp_req->timeout))
+ break;
+ }
+
+ list_add(&req->list, &temp_req->list);
+
+ if (req_list.next == &req->list)
+ set_timeout(req->timeout);
+ mutex_unlock(&lock);
+}
+
+#if defined(INET) || defined(INET6)
+static int addr_resolve_multi(u8 *edst, struct ifnet *ifp, struct sockaddr *dst_in)
+{
+ struct sockaddr *llsa;
+ struct sockaddr_dl sdl;
+ int error;
+
+ sdl.sdl_len = sizeof(sdl);
+ llsa = (struct sockaddr *)&sdl;
+
+ if (ifp->if_resolvemulti == NULL) {
+ error = EOPNOTSUPP;
+ } else {
+ error = ifp->if_resolvemulti(ifp, &llsa, dst_in);
+ if (error == 0) {
+ rdma_copy_addr_sub(edst, LLADDR((struct sockaddr_dl *)llsa),
+ ifp->if_addrlen, MAX_ADDR_LEN);
+ }
+ }
+ return (error);
+}
+#endif
+
+#ifdef INET
+static int addr4_resolve(struct sockaddr_in *src_in,
+ const struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr,
+ struct ifnet **ifpp)
+{
+ struct sockaddr_in dst_tmp = *dst_in;
+ u8 edst[MAX_ADDR_LEN];
+ struct rtentry *rte;
+ struct ifnet *ifp;
+ int error;
+
+ /*
+ * Make sure the socket address length field
+ * is set, else rtalloc1() will fail.
+ */
+ dst_tmp.sin_len = sizeof(dst_tmp);
+
+ CURVNET_SET(addr->net);
+ /* set default TTL limit */
+ addr->hoplimit = V_ip_defttl;
+
+ /* lookup route for destination */
+ rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0);
+ CURVNET_RESTORE();
+
+ /*
+ * Make sure the route exists and has a valid link.
+ */
+ if (rte == NULL) {
+ error = EHOSTUNREACH;
+ goto done;
+ } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) {
+ RTFREE_LOCKED(rte);
+ error = EHOSTUNREACH;
+ goto done;
+ } else if (src_in->sin_addr.s_addr != INADDR_ANY) {
+ RT_UNLOCK(rte);
+
+ ifp = ip_dev_find(addr->net, src_in->sin_addr.s_addr);
+ if (ifp == NULL) {
+ RTFREE(rte);
+ error = ENETUNREACH;
+ goto done;
+ } else if (ifp != rte->rt_ifp) {
+ error = ENETUNREACH;
+ goto failure;
+ }
+ } else {
+ struct sockaddr *saddr;
+
+ ifp = rte->rt_ifp;
+ dev_hold(ifp);
+
+ saddr = rte->rt_ifa->ifa_addr;
+ memcpy(src_in, saddr, rdma_addr_size(saddr));
+ RT_UNLOCK(rte);
+ }
+
+ /*
+ * Resolve destination MAC address
+ */
+ if (dst_tmp.sin_addr.s_addr == INADDR_BROADCAST) {
+ rdma_copy_addr_sub(edst, ifp->if_broadcastaddr,
+ ifp->if_addrlen, MAX_ADDR_LEN);
+ } else if (IN_MULTICAST(ntohl(dst_tmp.sin_addr.s_addr))) {
+ error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp);
+ if (error != 0)
+ goto failure;
+ } else {
+ bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0;
+ memset(edst, 0, sizeof(edst));
+ error = arpresolve(ifp, is_gw, NULL, is_gw ?
+ rte->rt_gateway : (const struct sockaddr *)&dst_tmp,
+ edst, NULL, NULL);
+ if (error != 0)
+ goto failure;
+ else if (is_gw != 0)
+ addr->network = RDMA_NETWORK_IPV4;
+ }
+
+ /*
+ * Copy destination and source MAC addresses
+ */
+ error = -rdma_copy_addr(addr, ifp, edst);
+ if (error != 0) {
+failure:
+ dev_put(ifp);
+
+ if (error == EWOULDBLOCK || error == EAGAIN)
+ error = ENODATA;
+ } else {
+ *ifpp = ifp;
+ }
+ RTFREE(rte);
+done:
+ return (-error);
+}
+#else
+static int addr4_resolve(struct sockaddr_in *src_in,
+ const struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr,
+ struct ifnet **ifpp)
+{
+ return -EADDRNOTAVAIL;
+}
+#endif
+
+#ifdef INET6
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ const struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr,
+ struct ifnet **ifpp)
+{
+ struct sockaddr_in6 dst_tmp = *dst_in;
+ u8 edst[MAX_ADDR_LEN];
+ struct rtentry *rte;
+ struct ifnet *ifp;
+ int error;
+
+ sa6_embedscope(&dst_tmp, 0);
+ sa6_embedscope(src_in, 0);
+
+ /*
+ * Make sure the socket address length field
+ * is set, else rtalloc1() will fail.
+ */
+ dst_tmp.sin6_len = sizeof(dst_tmp);
+
+ CURVNET_SET(addr->net);
+ /* set default TTL limit */
+ addr->hoplimit = V_ip_defttl;
+
+ /* lookup route for destination */
+ rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0);
+ CURVNET_RESTORE();
+
+ /*
+ * Make sure the route exists and has a valid link.
+ */
+ if (rte == NULL) {
+ error = EHOSTUNREACH;
+ goto done;
+ } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) {
+ RTFREE_LOCKED(rte);
+ error = EHOSTUNREACH;
+ goto done;
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&src_in->sin6_addr)) {
+ RT_UNLOCK(rte);
+
+ ifp = ip6_dev_find(addr->net, src_in->sin6_addr);
+ if (ifp == NULL) {
+ RTFREE(rte);
+ error = ENETUNREACH;
+ goto done;
+ } else if (ifp != rte->rt_ifp) {
+ error = ENETUNREACH;
+ goto failure;
+ }
+ } else {
+ struct sockaddr *saddr;
+
+ ifp = rte->rt_ifp;
+ dev_hold(ifp);
+
+ saddr = rte->rt_ifa->ifa_addr;
+ memcpy(src_in, saddr, rdma_addr_size(saddr));
+ RT_UNLOCK(rte);
+ }
+
+ /*
+ * Resolve destination MAC address
+ */
+ if (IN6_IS_ADDR_MULTICAST(&dst_tmp.sin6_addr)) {
+ error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp);
+ if (error != 0)
+ goto failure;
+ } else {
+ bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0;
+ memset(edst, 0, sizeof(edst));
+ error = nd6_resolve(ifp, is_gw, NULL, is_gw ?
+ rte->rt_gateway : (const struct sockaddr *)&dst_tmp,
+ edst, NULL, NULL);
+ if (error != 0)
+ goto failure;
+ else if (is_gw != 0)
+ addr->network = RDMA_NETWORK_IPV6;
+ }
+
+ /*
+ * Copy destination and source MAC addresses
+ */
+ error = -rdma_copy_addr(addr, ifp, edst);
+ if (error != 0) {
+failure:
+ dev_put(ifp);
+
+ if (error == EWOULDBLOCK || error == EAGAIN)
+ error = ENODATA;
+ } else {
+ *ifpp = ifp;
+ }
+ RTFREE(rte);
+done:
+ sa6_recoverscope(&dst_tmp);
+ sa6_recoverscope(src_in);
+
+ return (-error);
+}
+#else
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ const struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr,
+ struct ifnet **ifpp)
+{
+ return -EADDRNOTAVAIL;
+}
+#endif
+
+static int addr_resolve_neigh(struct ifnet *dev,
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ if (dev->if_flags & IFF_LOOPBACK) {
+ int ret;
+
+ ret = rdma_translate_ip(dst_in, addr, NULL);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr,
+ MAX_ADDR_LEN);
+
+ return ret;
+ }
+
+ /*
+ * If the device resolves link-layer addresses itself (IFF_NOARP is not
+ * set), the destination was already filled in by arpresolve() or
+ * nd6_resolve() and there is nothing more to do here.
+ */
+ if (!(dev->if_flags & IFF_NOARP))
+ return 0;
+
+ return rdma_copy_addr(addr, dev, NULL);
+}
+
+static int addr_resolve(struct sockaddr *src_in,
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr,
+ bool resolve_neigh)
+{
+ struct net_device *ndev = NULL;
+ int ret;
+
+ if (dst_in->sa_family != src_in->sa_family)
+ return -EINVAL;
+
+ if (src_in->sa_family == AF_INET) {
+ ret = addr4_resolve((struct sockaddr_in *)src_in,
+ (const struct sockaddr_in *)dst_in,
+ addr, &ndev);
+ if (ret)
+ return ret;
+
+ if (resolve_neigh)
+ ret = addr_resolve_neigh(ndev, dst_in, addr);
+ } else {
+ ret = addr6_resolve((struct sockaddr_in6 *)src_in,
+ (const struct sockaddr_in6 *)dst_in, addr,
+ &ndev);
+ if (ret)
+ return ret;
+
+ if (resolve_neigh)
+ ret = addr_resolve_neigh(ndev, dst_in, addr);
+ }
+
+ addr->bound_dev_if = ndev->if_index;
+ addr->net = dev_net(ndev);
+ dev_put(ndev);
+
+ return ret;
+}
+
+static void process_req(struct work_struct *work)
+{
+ struct addr_req *req, *temp_req;
+ struct sockaddr *src_in, *dst_in;
+ struct list_head done_list;
+
+ INIT_LIST_HEAD(&done_list);
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->status == -ENODATA) {
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+ req->status = addr_resolve(src_in, dst_in, req->addr,
+ true);
+ if (req->status && time_after_eq(jiffies, req->timeout))
+ req->status = -ETIMEDOUT;
+ else if (req->status == -ENODATA)
+ continue;
+ }
+ list_move_tail(&req->list, &done_list);
+ }
+
+ if (!list_empty(&req_list)) {
+ req = list_entry(req_list.next, struct addr_req, list);
+ set_timeout(req->timeout);
+ }
+ mutex_unlock(&lock);
+
+ list_for_each_entry_safe(req, temp_req, &done_list, list) {
+ list_del(&req->list);
+ req->callback(req->status, (struct sockaddr *) &req->src_addr,
+ req->addr, req->context);
+ put_client(req->client);
+ kfree(req);
+ }
+}
+
+int rdma_resolve_ip(struct rdma_addr_client *client,
+ struct sockaddr *src_addr, struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context)
+{
+ struct sockaddr *src_in, *dst_in;
+ struct addr_req *req;
+ int ret = 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
+ req->addr = addr;
+ req->callback = callback;
+ req->context = context;
+ req->client = client;
+ atomic_inc(&client->refcount);
+
+ req->status = addr_resolve(src_in, dst_in, addr, true);
+ switch (req->status) {
+ case 0:
+ req->timeout = jiffies;
+ queue_req(req);
+ break;
+ case -ENODATA:
+ req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+ queue_req(req);
+ break;
+ default:
+ ret = req->status;
+ atomic_dec(&client->refcount);
+ goto err;
+ }
+ return ret;
+err:
+ kfree(req);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
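+
+/*
+ * Editor's illustrative sketch (not part of the original sources): the
+ * asynchronous pattern expected by rdma_resolve_ip(). The callback and its
+ * context are hypothetical; resolve_cb() further down in this file is the
+ * in-tree user of the same pattern.
+ *
+ *   static void my_resolve_done(int status, struct sockaddr *src,
+ *       struct rdma_dev_addr *addr, void *context)
+ *   {
+ *           // invoked from the "ib_addr" workqueue once resolution
+ *           // succeeds (status == 0), fails or times out
+ *   }
+ *
+ *   ret = rdma_resolve_ip(&my_client, src_addr, dst_addr, &dev_addr,
+ *       2000, my_resolve_done, my_context);
+ */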
+
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr)
+{
+ struct sockaddr_storage ssrc_addr = {};
+ struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family)
+ return -EINVAL;
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ return addr_resolve(src_in, dst_addr, addr, false);
+}
+EXPORT_SYMBOL(rdma_resolve_ip_route);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+ struct addr_req *req, *temp_req;
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->addr == addr) {
+ req->status = -ECANCELED;
+ req->timeout = jiffies;
+ list_move(&req->list, &req_list);
+ set_timeout(req->timeout);
+ break;
+ }
+ }
+ mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+struct resolve_cb_context {
+ struct rdma_dev_addr *addr;
+ struct completion comp;
+ int status;
+};
+
+static void resolve_cb(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context)
+{
+ if (!status)
+ memcpy(((struct resolve_cb_context *)context)->addr,
+ addr, sizeof(struct rdma_dev_addr));
+ ((struct resolve_cb_context *)context)->status = status;
+ complete(&((struct resolve_cb_context *)context)->comp);
+}
+
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *dmac, u16 *vlan_id, int *if_index,
+ int *hoplimit)
+{
+ int ret = 0;
+ struct rdma_dev_addr dev_addr;
+ struct resolve_cb_context ctx;
+ struct net_device *dev;
+
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+
+
+ rdma_gid2ip(&sgid_addr._sockaddr, sgid);
+ rdma_gid2ip(&dgid_addr._sockaddr, dgid);
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+ if (if_index)
+ dev_addr.bound_dev_if = *if_index;
+ dev_addr.net = TD_TO_VNET(curthread);
+
+ ctx.addr = &dev_addr;
+ init_completion(&ctx.comp);
+ ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
+ &dev_addr, 1000, resolve_cb, &ctx);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&ctx.comp);
+
+ ret = ctx.status;
+ if (ret)
+ return ret;
+
+ memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
+ dev = dev_get_by_index(dev_addr.net, dev_addr.bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ if (if_index)
+ *if_index = dev_addr.bound_dev_if;
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ if (hoplimit)
+ *hoplimit = dev_addr.hoplimit;
+ dev_put(dev);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh);
+
+int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
+{
+ int ret = 0;
+ struct rdma_dev_addr dev_addr;
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } gid_addr;
+
+ rdma_gid2ip(&gid_addr._sockaddr, sgid);
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+ dev_addr.net = TD_TO_VNET(curthread);
+ ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
+ if (ret)
+ return ret;
+
+ memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid);
+
+int addr_init(void)
+{
+ addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0);
+ if (!addr_wq)
+ return -ENOMEM;
+
+ rdma_addr_register_client(&self);
+
+ return 0;
+}
+
+void addr_cleanup(void)
+{
+ rdma_addr_unregister_client(&self);
+ destroy_workqueue(addr_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/ib_cache.c b/sys/ofed/drivers/infiniband/core/ib_cache.c
new file mode 100644
index 0000000000000..1b685f16fc66e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_cache.c
@@ -0,0 +1,1253 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[0];
+};
+
+struct ib_update_work {
+ struct work_struct work;
+ struct ib_device *device;
+ u8 port_num;
+};
+
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+ GID_ATTR_FIND_MASK_GID = 1UL << 0,
+ GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
+ GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
+ GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3,
+};
+
+enum gid_table_entry_props {
+ GID_TABLE_ENTRY_INVALID = 1UL << 0,
+ GID_TABLE_ENTRY_DEFAULT = 1UL << 1,
+};
+
+enum gid_table_write_action {
+ GID_TABLE_WRITE_ACTION_ADD,
+ GID_TABLE_WRITE_ACTION_DEL,
+ /* MODIFY only updates the GID table. Currently only used by
+ * ib_cache_update.
+ */
+ GID_TABLE_WRITE_ACTION_MODIFY
+};
+
+struct ib_gid_table_entry {
+ unsigned long props;
+ union ib_gid gid;
+ struct ib_gid_attr attr;
+ void *context;
+};
+
+struct ib_gid_table {
+ int sz;
+ /* In RoCE, adding a GID to the table requires:
+ * (a) finding whether this GID already exists,
+ * (b) finding a free slot, and
+ * (c) writing the new GID.
+ *
+ * Deleting requires a different set of operations:
+ * (a) find the GID,
+ * (b) delete it.
+ *
+ * Add/delete must be carried out atomically; this is done
+ * by serializing writers on this mutex. The lock is not
+ * needed for IB, because the MAD layer replaces all entries
+ * at once. All data_vec entries are protected by this lock
+ * (see the locking sketch after this structure).
+ */
+ struct mutex lock;
+ /* This lock protects the table entries from being
+ * read and written simultaneously.
+ */
+ rwlock_t rwlock;
+ struct ib_gid_table_entry *data_vec;
+};
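+
+/*
+ * Editor's note (sketch, not from the original sources): writers take both
+ * table locks in the order used by ib_cache_gid_add() and ib_cache_gid_del():
+ *
+ *   mutex_lock(&table->lock);       // serializes RoCE add/del and provider calls
+ *   write_lock_irq(&table->rwlock); // excludes readers of data_vec
+ *   ... find_gid() / add_gid() / del_gid() ...
+ *   write_unlock_irq(&table->rwlock);
+ *   mutex_unlock(&table->lock);
+ */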
+
+static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
+{
+ if (rdma_cap_roce_gid_table(ib_dev, port)) {
+ struct ib_event event;
+
+ event.device = ib_dev;
+ event.element.port_num = port;
+ event.event = IB_EVENT_GID_CHANGE;
+
+ ib_dispatch_event(&event);
+ }
+}
+
+static const char * const gid_type_str[] = {
+ [IB_GID_TYPE_IB] = "IB/RoCE v1",
+ [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2",
+};
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type)
+{
+ if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type])
+ return gid_type_str[gid_type];
+
+ return "Invalid GID type";
+}
+EXPORT_SYMBOL(ib_cache_gid_type_str);
+
+int ib_cache_gid_parse_type_str(const char *buf)
+{
+ unsigned int i;
+ size_t len;
+ int err = -EINVAL;
+
+ len = strlen(buf);
+ if (len == 0)
+ return -EINVAL;
+
+ if (buf[len - 1] == '\n')
+ len--;
+
+ for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i)
+ if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) &&
+ len == strlen(gid_type_str[i])) {
+ err = i;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
+
+/* This function expects the rwlock to be write-locked in all
+ * scenarios and the mutex (lock) to be held in sleepable (RoCE)
+ * scenarios.
+ */
+static int write_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ enum gid_table_write_action action,
+ bool default_gid)
+ __releases(&table->rwlock) __acquires(&table->rwlock)
+{
+ int ret = 0;
+ struct net_device *old_net_dev;
+ enum ib_gid_type old_gid_type;
+
+ /* In the rdma_cap_roce_gid_table() case, this function should also be
+ * protected by a sleepable lock.
+ */
+
+ if (rdma_cap_roce_gid_table(ib_dev, port)) {
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+ write_unlock_irq(&table->rwlock);
+ /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
+ * RoCE providers and thus only updates the cache.
+ */
+ if (action == GID_TABLE_WRITE_ACTION_ADD)
+ ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
+ &table->data_vec[ix].context);
+ else if (action == GID_TABLE_WRITE_ACTION_DEL)
+ ret = ib_dev->del_gid(ib_dev, port, ix,
+ &table->data_vec[ix].context);
+ write_lock_irq(&table->rwlock);
+ }
+
+ old_net_dev = table->data_vec[ix].attr.ndev;
+ old_gid_type = table->data_vec[ix].attr.gid_type;
+ if (old_net_dev && old_net_dev != attr->ndev)
+ dev_put(old_net_dev);
+ /* if modify_gid failed, just delete the old gid */
+ if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
+ gid = &zgid;
+ attr = &zattr;
+ table->data_vec[ix].context = NULL;
+ }
+
+ memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
+ memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+ if (default_gid) {
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+ if (action == GID_TABLE_WRITE_ACTION_DEL)
+ table->data_vec[ix].attr.gid_type = old_gid_type;
+ }
+ if (table->data_vec[ix].attr.ndev &&
+ table->data_vec[ix].attr.ndev != old_net_dev)
+ dev_hold(table->data_vec[ix].attr.ndev);
+
+ table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+
+ return ret;
+}
+
+static int add_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_ADD, default_gid);
+}
+
+static int modify_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+}
+
+static int del_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
+ GID_TABLE_WRITE_ACTION_DEL, default_gid);
+}
+
+/* rwlock should be read locked */
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+ const struct ib_gid_attr *val, bool default_gid,
+ unsigned long mask, int *pempty)
+{
+ int i = 0;
+ int found = -1;
+ int empty = pempty ? -1 : 0;
+
+ while (i < table->sz && (found < 0 || empty < 0)) {
+ struct ib_gid_table_entry *data = &table->data_vec[i];
+ struct ib_gid_attr *attr = &data->attr;
+ int curr_index = i;
+
+ i++;
+
+ if (data->props & GID_TABLE_ENTRY_INVALID)
+ continue;
+
+ if (empty < 0)
+ if (!memcmp(&data->gid, &zgid, sizeof(*gid)) &&
+ !memcmp(attr, &zattr, sizeof(*attr)) &&
+ !data->props)
+ empty = curr_index;
+
+ if (found >= 0)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
+ attr->gid_type != val->gid_type)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_GID &&
+ memcmp(gid, &data->gid, sizeof(*gid)))
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+ attr->ndev != val->ndev)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+ !!(data->props & GID_TABLE_ENTRY_DEFAULT) !=
+ default_gid)
+ continue;
+
+ found = curr_index;
+ }
+
+ if (pempty)
+ *pempty = empty;
+
+ return found;
+}
+
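+/*
+ * addrconf_ifid_eui48() derives a modified EUI-64 interface identifier from
+ * a 48-bit MAC address: insert FF:FE in the middle and flip the
+ * universal/local bit. For example, 00:11:22:33:44:55 becomes
+ * 02:11:22:FF:FE:33:44:55.
+ */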
+static void addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+{
+ if (dev->if_addrlen != ETH_ALEN)
+ return;
+ memcpy(eui, IF_LLADDR(dev), 3);
+ memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
+
+ /* NOTE: The scope ID is added by the GID to IP conversion */
+
+ eui[3] = 0xFF;
+ eui[4] = 0xFE;
+ eui[0] ^= 2;
+}
+
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
+{
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+ int ret = 0;
+ struct net_device *idev;
+ int empty;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (!memcmp(gid, &zgid, sizeof(*gid)))
+ return -EINVAL;
+
+ if (ib_dev->get_netdev) {
+ idev = ib_dev->get_netdev(ib_dev, port);
+ if (idev && attr->ndev != idev) {
+ union ib_gid default_gid;
+
+ /* Adding default GIDs is not permitted */
+ make_default_gid(idev, &default_gid);
+ if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+ dev_put(idev);
+ return -EPERM;
+ }
+ }
+ if (idev)
+ dev_put(idev);
+ }
+
+ mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
+
+ ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_NETDEV, &empty);
+ if (ix >= 0)
+ goto out_unlock;
+
+ if (empty < 0) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+
+ ret = add_gid(ib_dev, port, table, empty, gid, attr, false);
+ if (!ret)
+ dispatch_gid_change_event(ib_dev, port);
+
+out_unlock:
+ write_unlock_irq(&table->rwlock);
+ mutex_unlock(&table->lock);
+ return ret;
+}
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
+
+ ix = find_gid(table, gid, attr, false,
+ GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_NETDEV |
+ GID_ATTR_FIND_MASK_DEFAULT,
+ NULL);
+ if (ix < 0)
+ goto out_unlock;
+
+ if (!del_gid(ib_dev, port, table, ix, false))
+ dispatch_gid_change_event(ib_dev, port);
+
+out_unlock:
+ write_unlock_irq(&table->rwlock);
+ mutex_unlock(&table->lock);
+ return 0;
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+ bool deleted = false;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
+
+ for (ix = 0; ix < table->sz; ix++)
+ if (table->data_vec[ix].attr.ndev == ndev)
+ if (!del_gid(ib_dev, port, table, ix,
+ !!(table->data_vec[ix].props &
+ GID_TABLE_ENTRY_DEFAULT)))
+ deleted = true;
+
+ write_unlock_irq(&table->rwlock);
+ mutex_unlock(&table->lock);
+
+ if (deleted)
+ dispatch_gid_change_event(ib_dev, port);
+
+ return 0;
+}
+
+static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (index < 0 || index >= table->sz)
+ return -EINVAL;
+
+ if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID)
+ return -EAGAIN;
+
+ memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+ if (attr) {
+ memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+ if (attr->ndev)
+ dev_hold(attr->ndev);
+ }
+
+ return 0;
+}
+
+static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *val,
+ unsigned long mask,
+ u8 *port, u16 *index)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ u8 p;
+ int local_index;
+ unsigned long flags;
+
+ for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+ table = ports_table[p];
+ read_lock_irqsave(&table->rwlock, flags);
+ local_index = find_gid(table, gid, val, false, mask, NULL);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ if (port)
+ *port = p + rdma_start_port(ib_dev);
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return 0;
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+ }
+
+ return -ENOENT;
+}
+
+static int ib_cache_gid_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ struct net_device *ndev, u8 *port,
+ u16 *index)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
+ mask, port, index);
+}
+
+int ib_find_cached_gid_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ u8 port, struct net_device *ndev,
+ u16 *index)
+{
+ int local_index;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type};
+ unsigned long flags;
+
+ if (port < rdma_start_port(ib_dev) ||
+ port > rdma_end_port(ib_dev))
+ return -ENOENT;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ read_lock_irqsave(&table->rwlock, flags);
+ local_index = find_gid(table, gid, &val, false, mask, NULL);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return 0;
+ }
+
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_cached_gid_by_port);
+
+/**
+ * ib_cache_gid_find_by_filter - Returns the GID table index where a specified
+ * GID value occurs
+ * @ib_dev: The device to query.
+ * @gid: The GID value to search for.
+ * @port: The port number of the device on which to search for the GID value.
+ * @filter: Filter function called for every matching GID in the table.
+ * If the filter returns true, the corresponding index is returned;
+ * otherwise the search continues. While the filter runs, the ndev field
+ * is valid and the attribute structure will not change. The filter is
+ * executed in an atomic context and must not be NULL.
+ * @context: Opaque pointer passed to the filter function.
+ * @index: The index into the cached GID table where the GID was found. This
+ * parameter may be NULL.
+ *
+ * ib_cache_gid_find_by_filter() searches the port's GID table for a GID
+ * that matches the specified value and for which the filter function
+ * returns true. This function is only supported on RoCE ports.
+ */
+static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ u8 port,
+ bool (*filter)(const union ib_gid *,
+ const struct ib_gid_attr *,
+ void *),
+ void *context,
+ u16 *index)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ unsigned int i;
+ unsigned long flags;
+ bool found = false;
+
+ if (!ports_table)
+ return -EOPNOTSUPP;
+
+ if (port < rdma_start_port(ib_dev) ||
+ port > rdma_end_port(ib_dev) ||
+ !rdma_protocol_roce(ib_dev, port))
+ return -EPROTONOSUPPORT;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ read_lock_irqsave(&table->rwlock, flags);
+ for (i = 0; i < table->sz; i++) {
+ struct ib_gid_attr attr;
+
+ if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+ goto next;
+
+ if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+ goto next;
+
+ memcpy(&attr, &table->data_vec[i].attr, sizeof(attr));
+
+ if (filter(gid, &attr, context))
+ found = true;
+
+next:
+ if (found)
+ break;
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+
+ if (!found)
+ return -ENOENT;
+
+ if (index)
+ *index = i;
+ return 0;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+ struct ib_gid_table *table =
+ kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+
+ if (!table)
+ return NULL;
+
+ table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+ if (!table->data_vec)
+ goto err_free_table;
+
+ mutex_init(&table->lock);
+
+ table->sz = sz;
+ rwlock_init(&table->rwlock);
+
+ return table;
+
+err_free_table:
+ kfree(table);
+ return NULL;
+}
+
+static void release_gid_table(struct ib_gid_table *table)
+{
+ if (table) {
+ kfree(table->data_vec);
+ kfree(table);
+ }
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ int i;
+ bool deleted = false;
+
+ if (!table)
+ return;
+
+ write_lock_irq(&table->rwlock);
+ for (i = 0; i < table->sz; ++i) {
+ if (memcmp(&table->data_vec[i].gid, &zgid,
+ sizeof(table->data_vec[i].gid)))
+ if (!del_gid(ib_dev, port, table, i,
+ table->data_vec[i].props &
+ GID_TABLE_ENTRY_DEFAULT))
+ deleted = true;
+ }
+ write_unlock_irq(&table->rwlock);
+
+ if (deleted)
+ dispatch_gid_change_event(ib_dev, port);
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ unsigned long gid_type_mask,
+ enum ib_cache_gid_default_mode mode)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ struct ib_gid_attr zattr_type = zattr;
+ struct ib_gid_table *table;
+ unsigned int gid_type;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ make_default_gid(ndev, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
+ int ix;
+ union ib_gid current_gid;
+ struct ib_gid_attr current_gid_attr = {};
+
+ if (1UL << gid_type & ~gid_type_mask)
+ continue;
+
+ gid_attr.gid_type = gid_type;
+
+ mutex_lock(&table->lock);
+ write_lock_irq(&table->rwlock);
+ ix = find_gid(table, NULL, &gid_attr, true,
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_DEFAULT,
+ NULL);
+
+ /* Couldn't find the default GID location */
+ if (WARN_ON(ix < 0))
+ goto release;
+
+ zattr_type.gid_type = gid_type;
+
+ if (!__ib_cache_gid_get(ib_dev, port, ix,
+ &current_gid, &current_gid_attr) &&
+ mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+ !memcmp(&gid, &current_gid, sizeof(gid)) &&
+ !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+ goto release;
+
+ if (memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+ memcmp(&current_gid_attr, &zattr_type,
+ sizeof(current_gid_attr))) {
+ if (del_gid(ib_dev, port, table, ix, true)) {
+ pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+ ix, gid.raw);
+ goto release;
+ } else {
+ dispatch_gid_change_event(ib_dev, port);
+ }
+ }
+
+ if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
+ if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+ pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+ gid.raw);
+ else
+ dispatch_gid_change_event(ib_dev, port);
+ }
+
+release:
+ if (current_gid_attr.ndev)
+ dev_put(current_gid_attr.ndev);
+ write_unlock_irq(&table->rwlock);
+ mutex_unlock(&table->lock);
+ }
+}
+
+static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ unsigned int i;
+ unsigned long roce_gid_type_mask;
+ unsigned int num_default_gids;
+ unsigned int current_gid = 0;
+
+ roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+ num_default_gids = hweight_long(roce_gid_type_mask);
+ for (i = 0; i < num_default_gids && i < table->sz; i++) {
+ struct ib_gid_table_entry *entry =
+ &table->data_vec[i];
+
+ entry->props |= GID_TABLE_ENTRY_DEFAULT;
+ current_gid = find_next_bit(&roce_gid_type_mask,
+ BITS_PER_LONG,
+ current_gid);
+ entry->attr.gid_type = current_gid++;
+ }
+
+ return 0;
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+ u8 port;
+ struct ib_gid_table **table;
+ int err = 0;
+
+ table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
+
+ if (!table) {
+ pr_warn("failed to allocate ib gid cache for %s\n",
+ ib_dev->name);
+ return -ENOMEM;
+ }
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ u8 rdma_port = port + rdma_start_port(ib_dev);
+
+ table[port] =
+ alloc_gid_table(
+ ib_dev->port_immutable[rdma_port].gid_tbl_len);
+ if (!table[port]) {
+ err = -ENOMEM;
+ goto rollback_table_setup;
+ }
+
+ err = gid_table_reserve_default(ib_dev,
+ port + rdma_start_port(ib_dev),
+ table[port]);
+ if (err)
+ goto rollback_table_setup;
+ }
+
+ ib_dev->cache.gid_cache = table;
+ return 0;
+
+rollback_table_setup:
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+ release_gid_table(table[port]);
+ }
+
+ kfree(table);
+ return err;
+}
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ release_gid_table(table[port]);
+
+ kfree(table);
+ ib_dev->cache.gid_cache = NULL;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+ int err;
+
+ err = _gid_table_setup_one(ib_dev);
+
+ if (err)
+ return err;
+
+ err = roce_rescan_device(ib_dev);
+
+ if (err) {
+ gid_table_cleanup_one(ib_dev);
+ gid_table_release_one(ib_dev);
+ }
+
+ return err;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid,
+ struct ib_gid_attr *gid_attr)
+{
+ int res;
+ unsigned long flags;
+ struct ib_gid_table **ports_table = device->cache.gid_cache;
+ struct ib_gid_table *table;
+
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ table = ports_table[port_num - rdma_start_port(device)];
+
+ read_lock_irqsave(&table->rwlock, flags);
+ res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr);
+ read_unlock_irqrestore(&table->rwlock, flags);
+
+ return res;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ struct net_device *ndev,
+ u8 *port_num,
+ u16 *index)
+{
+ return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index);
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+int ib_find_gid_by_filter(struct ib_device *device,
+ const union ib_gid *gid,
+ u8 port_num,
+ bool (*filter)(const union ib_gid *gid,
+ const struct ib_gid_attr *,
+ void *),
+ void *context, u16 *index)
+{
+ /* Only RoCE GID table supports filter function */
+ if (!rdma_cap_roce_gid_table(device, port_num) && filter)
+ return -EPROTONOSUPPORT;
+
+ return ib_cache_gid_find_by_filter(device, gid,
+ port_num, filter,
+ context, index);
+}
+EXPORT_SYMBOL(ib_find_gid_by_filter);
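+
+/*
+ * Editor's illustrative sketch (not part of the original sources): a
+ * hypothetical filter passed to ib_find_gid_by_filter() that accepts only
+ * RoCE v2 entries. The filter runs under the table's read lock in atomic
+ * context, so it must not sleep.
+ *
+ *   static bool match_roce_v2(const union ib_gid *gid,
+ *       const struct ib_gid_attr *attr, void *context)
+ *   {
+ *           return (attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP);
+ *   }
+ *
+ *   ret = ib_find_gid_by_filter(device, gid, port_num, match_roce_v2,
+ *       NULL, &index);
+ */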
+
+int ib_get_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ int index,
+ u16 *pkey)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - rdma_start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *pkey = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+ int partial_ix = -1;
+
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - rdma_start_port(device)];
+
+ *index = -1;
+
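+ /*
+ * Bit 15 of a PKey marks full membership. Prefer an entry that matches
+ * with full membership; remember the first limited-membership match in
+ * partial_ix as a fallback.
+ */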
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ if (cache->table[i] & 0x8000) {
+ *index = i;
+ ret = 0;
+ break;
+ } else
+ partial_ix = i;
+ }
+
+ if (ret && partial_ix >= 0) {
+ *index = partial_ix;
+ ret = 0;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_find_exact_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - rdma_start_port(device)];
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if (cache->table[i] == pkey) {
+ *index = i;
+ ret = 0;
+ break;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+
+int ib_get_cached_lmc(struct ib_device *device,
+ u8 port_num,
+ u8 *lmc)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+ *lmc = device->cache.lmc_cache[port_num - rdma_start_port(device)];
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+static void ib_cache_update(struct ib_device *device,
+ u8 port)
+{
+ struct ib_port_attr *tprops = NULL;
+ struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
+ struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+ } *gid_cache = NULL;
+ int i;
+ int ret;
+ struct ib_gid_table *table;
+ struct ib_gid_table **ports_table = device->cache.gid_cache;
+ bool use_roce_gid_table =
+ rdma_cap_roce_gid_table(device, port);
+
+ if (port < rdma_start_port(device) || port > rdma_end_port(device))
+ return;
+
+ table = ports_table[port - rdma_start_port(device)];
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return;
+
+ ret = ib_query_port(device, port, tprops);
+ if (ret) {
+ pr_warn("ib_query_port failed (%d) for %s\n",
+ ret, device->name);
+ goto err;
+ }
+
+ pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+ sizeof *pkey_cache->table, GFP_KERNEL);
+ if (!pkey_cache)
+ goto err;
+
+ pkey_cache->table_len = tprops->pkey_tbl_len;
+
+ if (!use_roce_gid_table) {
+ gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+ sizeof(*gid_cache->table), GFP_KERNEL);
+ if (!gid_cache)
+ goto err;
+
+ gid_cache->table_len = tprops->gid_tbl_len;
+ }
+
+ for (i = 0; i < pkey_cache->table_len; ++i) {
+ ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+ if (ret) {
+ pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ if (!use_roce_gid_table) {
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i,
+ gid_cache->table + i, NULL);
+ if (ret) {
+ pr_warn("ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+ }
+
+ write_lock_irq(&device->cache.lock);
+
+ old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
+
+ device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
+ if (!use_roce_gid_table) {
+ write_lock(&table->rwlock);
+ for (i = 0; i < gid_cache->table_len; i++) {
+ modify_gid(device, port, table, i, gid_cache->table + i,
+ &zattr, false);
+ }
+ write_unlock(&table->rwlock);
+ }
+
+ device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
+
+ write_unlock_irq(&device->cache.lock);
+
+ kfree(gid_cache);
+ kfree(old_pkey_cache);
+ kfree(tprops);
+ return;
+
+err:
+ kfree(pkey_cache);
+ kfree(gid_cache);
+ kfree(tprops);
+}
+
+static void ib_cache_task(struct work_struct *_work)
+{
+ struct ib_update_work *work =
+ container_of(_work, struct ib_update_work, work);
+
+ ib_cache_update(work->device, work->port_num);
+ kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_update_work *work;
+
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_GID_CHANGE) {
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(&work->work, ib_cache_task);
+ work->device = event->device;
+ work->port_num = event->element.port_num;
+ queue_work(ib_wq, &work->work);
+ }
+ }
+}
+
+int ib_cache_setup_one(struct ib_device *device)
+{
+ int p;
+ int err;
+
+ rwlock_init(&device->cache.lock);
+
+ device->cache.pkey_cache =
+ kzalloc(sizeof *device->cache.pkey_cache *
+ (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
+ device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
+ (rdma_end_port(device) -
+ rdma_start_port(device) + 1),
+ GFP_KERNEL);
+ if (!device->cache.pkey_cache ||
+ !device->cache.lmc_cache) {
+ pr_warn("Couldn't allocate cache for %s\n", device->name);
+ return -ENOMEM;
+ }
+
+ err = gid_table_setup_one(device);
+ if (err)
+ /* Allocated memory will be cleaned in the release function */
+ return err;
+
+ for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ ib_cache_update(device, p + rdma_start_port(device));
+
+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+ device, ib_cache_event);
+ err = ib_register_event_handler(&device->cache.event_handler);
+ if (err)
+ goto err;
+
+ return 0;
+
+err:
+ gid_table_cleanup_one(device);
+ return err;
+}
+
+void ib_cache_release_one(struct ib_device *device)
+{
+ int p;
+
+ /*
+ * The release function frees all the cache elements.
+ * This function should be called as part of freeing
+ * all the device's resources when the cache could no
+ * longer be accessed.
+ */
+ if (device->cache.pkey_cache)
+ for (p = 0;
+ p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ kfree(device->cache.pkey_cache[p]);
+
+ gid_table_release_one(device);
+ kfree(device->cache.pkey_cache);
+ kfree(device->cache.lmc_cache);
+}
+
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+ /* The cleanup function unregisters the event handler,
+ * waits for all in-progress workqueue elements and cleans
+ * up the GID cache. This function should be called after
+ * the device was removed from the devices list and all
+ * clients were removed, so the cache exists but is
+ * non-functional and shouldn't be updated anymore.
+ */
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_workqueue(ib_wq);
+ gid_table_cleanup_one(device);
+}
+
+void __init ib_cache_setup(void)
+{
+ roce_gid_mgmt_init();
+}
+
+void __exit ib_cache_cleanup(void)
+{
+ roce_gid_mgmt_cleanup();
+}
diff --git a/sys/ofed/drivers/infiniband/core/ib_cq.c b/sys/ofed/drivers/infiniband/core/ib_cq.c
new file mode 100644
index 0000000000000..9f79f733390c6
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_cq.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+
+/* number of completions fetched per ib_poll_cq() call */
+#define IB_CQ_POLL_MAX 16
+/* maximum number of completions processed per work item before rescheduling */
+#define IB_CQ_POLL_BUDGET 65536
+#define IB_CQ_POLL_FLAGS (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static void
+ib_cq_poll_work(struct work_struct *work)
+{
+ struct ib_wc ib_wc[IB_CQ_POLL_MAX];
+ struct ib_cq *cq = container_of(work, struct ib_cq, work);
+ int total = 0;
+ int i;
+ int n;
+
+ while (1) {
+ n = ib_poll_cq(cq, IB_CQ_POLL_MAX, ib_wc);
+ for (i = 0; i < n; i++) {
+ struct ib_wc *wc = ib_wc + i;
+
+ if (wc->wr_cqe != NULL)
+ wc->wr_cqe->done(cq, wc);
+ }
+
+ if (n != IB_CQ_POLL_MAX) {
+ if (ib_req_notify_cq(cq, IB_CQ_POLL_FLAGS) > 0)
+ break;
+ else
+ return;
+ }
+ total += n;
+ if (total >= IB_CQ_POLL_BUDGET)
+ break;
+ }
+
+ /* give other work structs a chance */
+ queue_work(ib_comp_wq, &cq->work);
+}
+
+static void
+ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+ queue_work(ib_comp_wq, &cq->work);
+}
+
+struct ib_cq *
+ib_alloc_cq(struct ib_device *dev, void *private,
+ int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
+{
+ struct ib_cq_init_attr cq_attr = {
+ .cqe = nr_cqe,
+ .comp_vector = comp_vector,
+ };
+ struct ib_cq *cq;
+
+ /*
+ * Check for invalid parameters early on to avoid
+ * extra error handling code:
+ */
+ switch (poll_ctx) {
+ case IB_POLL_DIRECT:
+ case IB_POLL_SOFTIRQ:
+ case IB_POLL_WORKQUEUE:
+ break;
+ default:
+ return (ERR_PTR(-EINVAL));
+ }
+
+ cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
+ if (IS_ERR(cq))
+ return (cq);
+
+ cq->device = dev;
+ cq->uobject = NULL;
+ cq->event_handler = NULL;
+ cq->cq_context = private;
+ cq->poll_ctx = poll_ctx;
+ atomic_set(&cq->usecnt, 0);
+
+ switch (poll_ctx) {
+ case IB_POLL_DIRECT:
+ cq->comp_handler = NULL; /* no hardware completions */
+ break;
+ case IB_POLL_SOFTIRQ:
+ case IB_POLL_WORKQUEUE:
+ cq->comp_handler = ib_cq_completion_workqueue;
+ INIT_WORK(&cq->work, ib_cq_poll_work);
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ break;
+ default:
+ break;
+ }
+ return (cq);
+}
+EXPORT_SYMBOL(ib_alloc_cq);
+
+void
+ib_free_cq(struct ib_cq *cq)
+{
+
+ if (WARN_ON_ONCE(atomic_read(&cq->usecnt) != 0))
+ return;
+
+ switch (cq->poll_ctx) {
+ case IB_POLL_DIRECT:
+ break;
+ case IB_POLL_SOFTIRQ:
+ case IB_POLL_WORKQUEUE:
+ flush_work(&cq->work);
+ break;
+ default:
+ break;
+ }
+
+ (void)cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_free_cq);
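+
+/*
+ * Editor's illustrative sketch (not part of the original sources): typical
+ * use of the API above. The wrapper structure and callback are hypothetical;
+ * any structure embedding a struct ib_cqe can be used the same way.
+ *
+ *   struct my_ctx {
+ *           struct ib_cqe cqe;
+ *   };
+ *
+ *   static void my_done(struct ib_cq *cq, struct ib_wc *wc)
+ *   {
+ *           struct my_ctx *ctx =
+ *               container_of(wc->wr_cqe, struct my_ctx, cqe);
+ *           // runs from ib_comp_wq for IB_POLL_WORKQUEUE CQs
+ *   }
+ *
+ *   cq = ib_alloc_cq(dev, NULL, 256, 0, IB_POLL_WORKQUEUE);
+ *   ctx->cqe.done = my_done;
+ *   // post work requests with wr_cqe pointing at &ctx->cqe ...
+ *   ib_free_cq(cq);
+ */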
diff --git a/sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c b/sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c
new file mode 100644
index 0000000000000..158b977ad8ae9
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_iwpm_msg.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+static int iwpm_user_pid = IWPM_PID_UNDEFINED;
+
+int iwpm_valid_pid(void)
+{
+ return iwpm_user_pid > 0;
+}
+EXPORT_SYMBOL(iwpm_valid_pid);
+
diff --git a/sys/ofed/drivers/infiniband/core/ib_iwpm_util.c b/sys/ofed/drivers/infiniband/core/ib_iwpm_util.c
new file mode 100644
index 0000000000000..13eef42896cd3
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_iwpm_util.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+#define IWPM_MAPINFO_HASH_SIZE 512
+#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
+#define IWPM_REMINFO_HASH_SIZE 64
+#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+#define IWPM_MSG_SIZE 512
+
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr)
+{
+ if (a_sockaddr->ss_family != b_sockaddr->ss_family)
+ return 1;
+ if (a_sockaddr->ss_family == AF_INET) {
+ struct sockaddr_in *a4_sockaddr =
+ (struct sockaddr_in *)a_sockaddr;
+ struct sockaddr_in *b4_sockaddr =
+ (struct sockaddr_in *)b_sockaddr;
+ if (!memcmp(&a4_sockaddr->sin_addr,
+ &b4_sockaddr->sin_addr, sizeof(struct in_addr))
+ && a4_sockaddr->sin_port == b4_sockaddr->sin_port)
+ return 0;
+
+ } else if (a_sockaddr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *a6_sockaddr =
+ (struct sockaddr_in6 *)a_sockaddr;
+ struct sockaddr_in6 *b6_sockaddr =
+ (struct sockaddr_in6 *)b_sockaddr;
+ if (!memcmp(&a6_sockaddr->sin6_addr,
+ &b6_sockaddr->sin6_addr, sizeof(struct in6_addr))
+ && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port)
+ return 0;
+
+ } else {
+ pr_err("%s: Invalid sockaddr family\n", __func__);
+ }
+ return 1;
+}
+
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg)
+{
+ struct sockaddr_in6 *sockaddr_v6;
+ struct sockaddr_in *sockaddr_v4;
+
+ switch (sockaddr->ss_family) {
+ case AF_INET:
+ sockaddr_v4 = (struct sockaddr_in *)sockaddr;
+ pr_debug("%s IPV4 %pI4: %u(0x%04X)\n",
+ msg, &sockaddr_v4->sin_addr,
+ ntohs(sockaddr_v4->sin_port),
+ ntohs(sockaddr_v4->sin_port));
+ break;
+ case AF_INET6:
+ sockaddr_v6 = (struct sockaddr_in6 *)sockaddr;
+ pr_debug("%s IPV6 %pI6: %u(0x%04X)\n",
+ msg, &sockaddr_v6->sin6_addr,
+ ntohs(sockaddr_v6->sin6_port),
+ ntohs(sockaddr_v6->sin6_port));
+ break;
+ default:
+ break;
+ }
+}
+
diff --git a/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c b/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c
new file mode 100644
index 0000000000000..d5a0bd0f6d5af
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_roce_gid_mgmt.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2015-2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/rcupdate.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include <netinet6/scope6_var.h>
+
+static struct workqueue_struct *roce_gid_mgmt_wq;
+
+enum gid_op_type {
+ GID_DEL = 0,
+ GID_ADD
+};
+
+struct roce_gid_scan_event_work {
+ struct work_struct work;
+ struct net_device *ndev;
+};
+
+struct roce_rescan_work {
+ struct work_struct work;
+ struct ib_device *ib_dev;
+};
+
+static const struct {
+ bool (*is_supported)(const struct ib_device *device, u8 port_num);
+ enum ib_gid_type gid_type;
+} PORT_CAP_TO_GID_TYPE[] = {
+ {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
+ {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP},
+};
+
+#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
+
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port)
+{
+ int i;
+ unsigned int ret_flags = 0;
+
+ if (!rdma_protocol_roce(ib_dev, port))
+ return 1UL << IB_GID_TYPE_IB;
+
+ for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
+ if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port))
+ ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;
+
+ return ret_flags;
+}
+EXPORT_SYMBOL(roce_gid_type_mask_support);
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+ u8 port, union ib_gid *gid, struct net_device *ndev)
+{
+ int i;
+ unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+ struct ib_gid_attr gid_attr;
+
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ for (i = 0; i != IB_GID_TYPE_SIZE; i++) {
+ if ((1UL << i) & gid_type_mask) {
+ gid_attr.gid_type = i;
+ switch (gid_op) {
+ case GID_ADD:
+ ib_cache_gid_add(ib_dev, port,
+ gid, &gid_attr);
+ break;
+ case GID_DEL:
+ ib_cache_gid_del(ib_dev, port,
+ gid, &gid_attr);
+ break;
+ }
+ }
+ }
+}
+
+static int
+roce_gid_match_netdev(struct ib_device *ib_dev, u8 port,
+ struct net_device *idev, void *cookie)
+{
+ struct net_device *ndev = (struct net_device *)cookie;
+ if (idev == NULL)
+ return (0);
+ return (ndev == idev);
+}
+
+static int
+roce_gid_match_all(struct ib_device *ib_dev, u8 port,
+ struct net_device *idev, void *cookie)
+{
+ if (idev == NULL)
+ return (0);
+ return (1);
+}
+
+static int
+roce_gid_enum_netdev_default(struct ib_device *ib_dev,
+ u8 port, struct net_device *idev)
+{
+ unsigned long gid_type_mask;
+
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ ib_cache_gid_set_default_gid(ib_dev, port, idev, gid_type_mask,
+ IB_CACHE_GID_DEFAULT_MODE_SET);
+
+ return (hweight_long(gid_type_mask));
+}
+
+#define ETH_IPOIB_DRV_NAME "ib"
+
+static inline int
+is_eth_ipoib_intf(struct net_device *dev)
+{
+ if (strcmp(dev->if_dname, ETH_IPOIB_DRV_NAME))
+ return 0;
+ return 1;
+}
+
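+/*
+ * Reconcile the GID table of (device, port) with the IP addresses currently
+ * configured on ndev and its VLAN interfaces: the default GIDs are refreshed
+ * first, a GID is added for every address that is not yet cached, and cached
+ * GIDs with no matching address are removed.
+ */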
+static void
+roce_gid_update_addr_callback(struct ib_device *device, u8 port,
+ struct net_device *ndev, void *cookie)
+{
+ struct ipx_entry {
+ STAILQ_ENTRY(ipx_entry) entry;
+ union ipx_addr {
+ struct sockaddr sa[0];
+ struct sockaddr_in v4;
+ struct sockaddr_in6 v6;
+ } ipx_addr;
+ };
+ struct ipx_entry *entry;
+ struct net_device *idev;
+ struct ifaddr *ifa;
+ union ib_gid gid;
+ int default_gids;
+ u16 index_num;
+ int i;
+
+ STAILQ_HEAD(, ipx_entry) ipx_head;
+
+ STAILQ_INIT(&ipx_head);
+
+ /* make sure default GIDs are in */
+ default_gids = roce_gid_enum_netdev_default(device, port, ndev);
+
+ CURVNET_SET(ndev->if_vnet);
+ IFNET_RLOCK();
+ TAILQ_FOREACH(idev, &V_ifnet, if_link) {
+ if (idev != ndev) {
+ if (idev->if_type != IFT_L2VLAN)
+ continue;
+ if (ndev != rdma_vlan_dev_real_dev(idev))
+ continue;
+ }
+
+ /* clone address information for IPv4 and IPv6 */
+ IF_ADDR_RLOCK(idev);
+#if defined(INET)
+ TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr == NULL ||
+ ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL) {
+ pr_warn("roce_gid_update_addr_callback: "
+ "couldn't allocate entry for IPv4 update\n");
+ continue;
+ }
+ entry->ipx_addr.v4 = *((struct sockaddr_in *)ifa->ifa_addr);
+ STAILQ_INSERT_TAIL(&ipx_head, entry, entry);
+ }
+#endif
+#if defined(INET6)
+ TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr == NULL ||
+ ifa->ifa_addr->sa_family != AF_INET6)
+ continue;
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL) {
+ pr_warn("roce_gid_update_addr_callback: "
+ "couldn't allocate entry for IPv6 update\n");
+ continue;
+ }
+ entry->ipx_addr.v6 = *((struct sockaddr_in6 *)ifa->ifa_addr);
+
+			/* strip the IPv6 scope ID before converting to a GID */
+ sa6_recoverscope(&entry->ipx_addr.v6);
+ entry->ipx_addr.v6.sin6_scope_id = 0;
+
+ STAILQ_INSERT_TAIL(&ipx_head, entry, entry);
+ }
+#endif
+ IF_ADDR_RUNLOCK(idev);
+ }
+ IFNET_RUNLOCK();
+ CURVNET_RESTORE();
+
+ /* add missing GIDs, if any */
+ STAILQ_FOREACH(entry, &ipx_head, entry) {
+ unsigned long gid_type_mask = roce_gid_type_mask_support(device, port);
+
+ if (rdma_ip2gid(&entry->ipx_addr.sa[0], &gid) != 0)
+ continue;
+
+ for (i = 0; i != IB_GID_TYPE_SIZE; i++) {
+ if (!((1UL << i) & gid_type_mask))
+ continue;
+ /* check if entry found */
+ if (ib_find_cached_gid_by_port(device, &gid, i,
+ port, ndev, &index_num) == 0)
+ break;
+ }
+ if (i != IB_GID_TYPE_SIZE)
+ continue;
+ /* add new GID */
+ update_gid(GID_ADD, device, port, &gid, ndev);
+ }
+
+ /* remove stale GIDs, if any */
+ for (i = default_gids; ib_get_cached_gid(device, port, i, &gid, NULL) == 0; i++) {
+ union ipx_addr ipx;
+
+ /* don't delete empty entries */
+ if (memcmp(&gid, &zgid, sizeof(zgid)) == 0)
+ continue;
+
+		/* clear the address union before conversion */
+ memset(&ipx, 0, sizeof(ipx));
+
+ rdma_gid2ip(&ipx.sa[0], &gid);
+
+ STAILQ_FOREACH(entry, &ipx_head, entry) {
+ if (memcmp(&entry->ipx_addr, &ipx, sizeof(ipx)) == 0)
+ break;
+ }
+ /* check if entry found */
+ if (entry != NULL)
+ continue;
+
+ /* remove GID */
+ update_gid(GID_DEL, device, port, &gid, ndev);
+ }
+
+ while ((entry = STAILQ_FIRST(&ipx_head))) {
+ STAILQ_REMOVE_HEAD(&ipx_head, entry);
+ kfree(entry);
+ }
+}
+
+static void
+roce_gid_queue_scan_event_handler(struct work_struct *_work)
+{
+ struct roce_gid_scan_event_work *work =
+ container_of(_work, struct roce_gid_scan_event_work, work);
+
+ ib_enum_all_roce_netdevs(roce_gid_match_netdev, work->ndev,
+ roce_gid_update_addr_callback, NULL);
+
+ dev_put(work->ndev);
+ kfree(work);
+}
+
+static void
+roce_gid_queue_scan_event(struct net_device *ndev)
+{
+ struct roce_gid_scan_event_work *work;
+
+retry:
+ if (is_eth_ipoib_intf(ndev))
+ return;
+
+ if (ndev->if_type != IFT_ETHER) {
+ if (ndev->if_type == IFT_L2VLAN) {
+ ndev = rdma_vlan_dev_real_dev(ndev);
+ if (ndev != NULL)
+ goto retry;
+ }
+ return;
+ }
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work) {
+ pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+ return;
+ }
+
+ INIT_WORK(&work->work, roce_gid_queue_scan_event_handler);
+ dev_hold(ndev);
+
+ work->ndev = ndev;
+
+ queue_work(roce_gid_mgmt_wq, &work->work);
+}
+
+static int
+inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ struct net_device *ndev = ptr;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ case NETDEV_UNREGISTER:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_CHANGEIFADDR:
+ roce_gid_queue_scan_event(ndev);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_inetaddr = {
+ .notifier_call = inetaddr_event
+};
+
+static void
+roce_rescan_device_handler(struct work_struct *_work)
+{
+ struct roce_rescan_work *work =
+ container_of(_work, struct roce_rescan_work, work);
+
+ ib_enum_roce_netdev(work->ib_dev, roce_gid_match_all, NULL,
+ roce_gid_update_addr_callback, NULL);
+ kfree(work);
+}
+
+/* Caller must flush system workqueue before removing the ib_device */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+ struct roce_rescan_work *work = kmalloc(sizeof(*work), GFP_KERNEL);
+
+ if (!work)
+ return -ENOMEM;
+
+ work->ib_dev = ib_dev;
+ INIT_WORK(&work->work, roce_rescan_device_handler);
+ queue_work(roce_gid_mgmt_wq, &work->work);
+
+ return 0;
+}
+
+int __init roce_gid_mgmt_init(void)
+{
+ roce_gid_mgmt_wq = alloc_ordered_workqueue("roce_gid_mgmt_wq", 0);
+ if (!roce_gid_mgmt_wq) {
+ pr_warn("roce_gid_mgmt: can't allocate work queue\n");
+ return -ENOMEM;
+ }
+
+ register_inetaddr_notifier(&nb_inetaddr);
+
+ /*
+ * We rely on the netdevice notifier to enumerate all existing
+	 * devices in the system. Register for this notifier last to
+	 * make sure we do not miss any IP add/del callbacks.
+ */
+ register_netdevice_notifier(&nb_inetaddr);
+
+ return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+ unregister_inetaddr_notifier(&nb_inetaddr);
+ unregister_netdevice_notifier(&nb_inetaddr);
+
+ /*
+ * Ensure all gid deletion tasks complete before we go down,
+	 * to avoid any reference to freed memory. By the time
+ * ib-core is removed, all physical devices have been removed,
+ * so no issue with remaining hardware contexts.
+ */
+ synchronize_rcu();
+ drain_workqueue(roce_gid_mgmt_wq);
+ destroy_workqueue(roce_gid_mgmt_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/ib_smi.c b/sys/ofed/drivers/infiniband/core/ib_smi.c
new file mode 100644
index 0000000000000..f19b23817c2b4
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_smi.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+#include "opa_smi.h"
+
+static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ const u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!direction) {
+ /* C14-9:1 */
+ if (hop_cnt && *hop_ptr == 0) {
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:2 */
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == hop_cnt) {
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (is_switch ||
+ dr_dlid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- Fail unreasonable hop pointer */
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+ /* C14-13:1 */
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (*hop_ptr == 1) {
+ (*hop_ptr)--;
+ /* C14-13:3 -- SMPs destined for SM shouldn't be here */
+ return (is_switch ||
+ dr_slid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+ if (*hop_ptr == 0)
+ return IB_SMI_HANDLE;
+
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return IB_SMI_DISCARD;
+ }
+}
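+
+/*
+ * Illustrative reading of the outgoing (direction == 0) branches above,
+ * assuming hop_cnt = 2:
+ *
+ *  - hop_ptr == 0 (C14-9:1): hop_ptr is advanced to 1 and the SMP is
+ *    handled only if initial_path[1] names the egress port (port_num).
+ *  - hop_ptr == 1 (C14-9:2): only a switch may forward; hop_ptr becomes 2
+ *    and initial_path[2] must again match the egress port.
+ *  - hop_ptr == 2 == hop_cnt (C14-9:3): the end of the directed-route
+ *    segment; hop_ptr becomes 3 and the SMP is handled if this is a
+ *    switch or dr_dlid is the permissive LID.
+ *  - any other value is handled only when hop_ptr == hop_cnt + 1 (hand
+ *    the SMP to the SMA/SM, C14-9:4); otherwise it is discarded (C14-9:5).
+ */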
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return IB_SMI_DISCARD if the SMP should be discarded
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ bool is_switch, int port_num)
+{
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, int port_num)
+{
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num,
+ int phys_port_cnt,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!direction) {
+ /* C14-9:1 -- sender should have incremented hop_ptr */
+ if (hop_cnt && *hop_ptr == 0)
+ return IB_SMI_DISCARD;
+
+ /* C14-9:2 -- intermediate hop */
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
+ return (initial_path[*hop_ptr+1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == hop_cnt) {
+ if (hop_cnt)
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
+
+ return (is_switch ||
+ dr_dlid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- fail unreasonable hop pointer */
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+
+ /* C14-13:1 */
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ /* hop_ptr updated when sending */
+ return (return_path[*hop_ptr-1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == 1) {
+ if (dr_slid_is_permissive) {
+ /* giving SMP to SM - update hop_ptr */
+ (*hop_ptr)--;
+ return IB_SMI_HANDLE;
+ }
+ /* hop_ptr updated when sending */
+ return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> give to SM */
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return (*hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt)
+{
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt)
+{
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ if (!direction) {
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-9:3 -- at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt)
+ return (dr_dlid_is_permissive ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ if (hop_ptr == hop_cnt + 1)
+ return IB_SMI_SEND;
+ } else {
+ /* C14-13:2 -- intermediate hop */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1)
+ return (!dr_slid_is_permissive ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+ }
+ return IB_SMI_LOCAL;
+
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+ return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
+ smp->return_path[smp->hop_ptr-1]);
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int opa_smi_get_fwd_port(struct opa_smp *smp)
+{
+ return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] :
+ smp->route.dr.return_path[smp->hop_ptr-1];
+}
diff --git a/sys/ofed/drivers/infiniband/core/ib_sysfs.c b/sys/ofed/drivers/infiniband/core/ib_sysfs.c
new file mode 100644
index 0000000000000..3527a72a75a5e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_sysfs.c
@@ -0,0 +1,1327 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/netdevice.h>
+#include <linux/fs.h>
+#include <linux/printk.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_pma.h>
+
+struct ib_port;
+
+struct gid_attr_group {
+ struct ib_port *port;
+ struct kobject kobj;
+ struct attribute_group ndev;
+ struct attribute_group type;
+};
+struct ib_port {
+ struct kobject kobj;
+ struct ib_device *ibdev;
+ struct gid_attr_group *gid_attr_group;
+ struct attribute_group gid_group;
+ struct attribute_group pkey_group;
+ struct attribute_group *pma_table;
+ struct attribute_group *hw_stats_ag;
+ struct rdma_hw_stats *hw_stats;
+ u8 port_num;
+};
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct ib_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+struct port_table_attribute {
+ struct port_attribute attr;
+ char name[8];
+ int index;
+ __be16 attr_id;
+};
+
+struct hw_stats_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t count);
+ int index;
+ u8 port_num;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show
+};
+
+static ssize_t gid_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct gid_attr_group,
+ kobj)->port;
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops gid_attr_sysfs_ops = {
+ .show = gid_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ static const char *state_name[] = {
+ [IB_PORT_NOP] = "NOP",
+ [IB_PORT_DOWN] = "DOWN",
+ [IB_PORT_INIT] = "INIT",
+ [IB_PORT_ARMED] = "ARMED",
+ [IB_PORT_ACTIVE] = "ACTIVE",
+ [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
+ };
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d: %s\n", attr.state,
+ attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
+ state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+ struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ char *speed = "";
+ int rate; /* in deci-Gb/sec */
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.active_speed) {
+ case IB_SPEED_DDR:
+ speed = " DDR";
+ rate = 50;
+ break;
+ case IB_SPEED_QDR:
+ speed = " QDR";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR10:
+ speed = " FDR10";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR:
+ speed = " FDR";
+ rate = 140;
+ break;
+ case IB_SPEED_EDR:
+ speed = " EDR";
+ rate = 250;
+ break;
+ case IB_SPEED_SDR:
+ default: /* default to SDR for invalid rates */
+ rate = 25;
+ break;
+ }
+
+ rate *= ib_width_enum_to_int(attr.active_width);
+ if (rate < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+ rate / 10, rate % 10 ? ".5" : "",
+ ib_width_enum_to_int(attr.active_width), speed);
+}
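+
+/*
+ * For example (derived from the switch above): a 4X QDR link reports
+ * rate = 100 * 4 = 400 deci-Gb/sec and prints as "40 Gb/sec (4X QDR)",
+ * while a 1X SDR link (rate = 25) prints as "2.5 Gb/sec (1X)".
+ */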
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.phys_state) {
+ case 1: return sprintf(buf, "1: Sleep\n");
+ case 2: return sprintf(buf, "2: Polling\n");
+ case 3: return sprintf(buf, "3: Disabled\n");
+ case 4: return sprintf(buf, "4: PortConfigurationTraining\n");
+ case 5: return sprintf(buf, "5: LinkUp\n");
+ case 6: return sprintf(buf, "6: LinkErrorRecovery\n");
+ case 7: return sprintf(buf, "7: Phy Test\n");
+ default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+ }
+}
+
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ return sprintf(buf, "%s\n", "InfiniBand");
+ case IB_LINK_LAYER_ETHERNET:
+ return sprintf(buf, "%s\n", "Ethernet");
+ default:
+ return sprintf(buf, "%s\n", "Unknown");
+ }
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+static struct attribute *port_default_attrs[] = {
+ &port_attr_state.attr,
+ &port_attr_lid.attr,
+ &port_attr_lid_mask_count.attr,
+ &port_attr_sm_lid.attr,
+ &port_attr_sm_sl.attr,
+ &port_attr_cap_mask.attr,
+ &port_attr_rate.attr,
+ &port_attr_phys_state.attr,
+ &port_attr_link_layer.attr,
+ NULL
+};
+
+static ssize_t print_ndev(struct ib_gid_attr *gid_attr, char *buf)
+{
+ if (!gid_attr->ndev)
+ return -EINVAL;
+
+ return sprintf(buf, "%s\n", if_name(gid_attr->ndev));
+}
+
+static ssize_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf)
+{
+ return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type));
+}
+
+static ssize_t _show_port_gid_attr(struct ib_port *p,
+ struct port_attribute *attr,
+ char *buf,
+				   ssize_t (*print)(struct ib_gid_attr *gid_attr,
+ char *buf))
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr = {};
+ ssize_t ret;
+
+ ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid,
+ &gid_attr);
+ if (ret)
+ goto err;
+
+ ret = print(&gid_attr, buf);
+
+err:
+ if (gid_attr.ndev)
+ dev_put(gid_attr.ndev);
+ return ret;
+}
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ union ib_gid gid;
+ ssize_t ret;
+
+ ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, GID_PRINT_FMT"\n", GID_PRINT_ARGS(gid.raw));
+}
+
+static ssize_t show_port_gid_attr_ndev(struct ib_port *p,
+ struct port_attribute *attr, char *buf)
+{
+ return _show_port_gid_attr(p, attr, buf, print_ndev);
+}
+
+static ssize_t show_port_gid_attr_gid_type(struct ib_port *p,
+ struct port_attribute *attr,
+ char *buf)
+{
+ return _show_port_gid_attr(p, attr, buf, print_gid_type);
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ u16 pkey;
+ ssize_t ret;
+
+ ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
+struct port_table_attribute port_pma_attr_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \
+ .attr_id = IB_PMA_PORT_COUNTERS , \
+}
+
+#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \
+struct port_table_attribute port_pma_attr_ext_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16), \
+ .attr_id = IB_PMA_PORT_COUNTERS_EXT , \
+}
+
+/*
+ * Get a Perfmgmt MAD block of data.
+ * Returns error code or the number of bytes retrieved.
+ */
+static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
+ void *data, int offset, size_t size)
+{
+ struct ib_mad *in_mad;
+ struct ib_mad *out_mad;
+ size_t mad_size = sizeof(*out_mad);
+ u16 out_mad_pkey_index = 0;
+ ssize_t ret;
+
+ if (!dev->process_mad)
+ return -ENOSYS;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ in_mad->mad_hdr.base_version = 1;
+ in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
+ in_mad->mad_hdr.class_version = 1;
+ in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ in_mad->mad_hdr.attr_id = attr;
+
+ if (attr != IB_PMA_CLASS_PORT_INFO)
+ in_mad->data[41] = port_num; /* PortSelect field */
+
+ if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY,
+ port_num, NULL, NULL,
+ (const struct ib_mad_hdr *)in_mad, mad_size,
+ (struct ib_mad_hdr *)out_mad, &mad_size,
+ &out_mad_pkey_index) &
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ memcpy(data, out_mad->data + offset, size);
+ ret = size;
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return ret;
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int offset = tab_attr->index & 0xffff;
+ int width = (tab_attr->index >> 16) & 0xff;
+ ssize_t ret;
+ u8 data[8];
+
+ ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data,
+ 40 + offset / 8, sizeof(data));
+ if (ret < 0)
+ return sprintf(buf, "N/A (no PMA)\n");
+
+ switch (width) {
+ case 4:
+ ret = sprintf(buf, "%u\n", (*data >>
+ (4 - (offset % 8))) & 0xf);
+ break;
+ case 8:
+ ret = sprintf(buf, "%u\n", *data);
+ break;
+ case 16:
+ ret = sprintf(buf, "%u\n",
+ be16_to_cpup((__be16 *)data));
+ break;
+ case 32:
+ ret = sprintf(buf, "%u\n",
+ be32_to_cpup((__be32 *)data));
+ break;
+ case 64:
+ ret = sprintf(buf, "%llu\n",
+ (unsigned long long)be64_to_cpup((__be64 *)data));
+ break;
+
+ default:
+ ret = 0;
+ }
+
+ return ret;
+}
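+
+/*
+ * Worked example of the index encoding used by PORT_PMA_ATTR() and decoded
+ * above, taking the symbol_error attribute defined below:
+ *
+ *   PORT_PMA_ATTR(symbol_error, 0, 16, 32)
+ *     .index  = 32 | (16 << 16) | (0 << 24)
+ *     offset  = index & 0xffff        = 32 bits into the counter block
+ *     width   = (index >> 16) & 0xff  = 16 bits
+ *
+ * get_perf_mad() is asked for bytes starting at 40 + 32 / 8 = 44 of the
+ * PortCounters response, and the 16-bit case above prints the big-endian
+ * value found there.
+ */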
+
+static PORT_PMA_ATTR(symbol_error , 0, 16, 32);
+static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
+static PORT_PMA_ATTR(link_downed , 2, 8, 56);
+static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
+static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156);
+static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320);
+
+/*
+ * Counters added by extended set
+ */
+static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64);
+static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128);
+static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192);
+static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256);
+static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320);
+static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384);
+static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448);
+static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512);
+
+static struct attribute *pma_attrs[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_port_xmit_data.attr.attr,
+ &port_pma_attr_port_rcv_data.attr.attr,
+ &port_pma_attr_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ NULL
+};
+
+static struct attribute *pma_attrs_ext[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_xmit_packets.attr.attr,
+ &port_pma_attr_ext_multicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_multicast_xmit_packets.attr.attr,
+ NULL
+};
+
+static struct attribute *pma_attrs_noietf[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ NULL
+};
+
+static struct attribute_group pma_group = {
+ .name = "counters",
+ .attrs = pma_attrs
+};
+
+static struct attribute_group pma_group_ext = {
+ .name = "counters",
+ .attrs = pma_attrs_ext
+};
+
+static struct attribute_group pma_group_noietf = {
+ .name = "counters",
+ .attrs = pma_attrs_noietf
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ struct attribute *a;
+ int i;
+
+ if (p->gid_group.attrs) {
+ for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->gid_group.attrs);
+ }
+
+ if (p->pkey_group.attrs) {
+ for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->pkey_group.attrs);
+ }
+
+ kfree(p);
+}
+
+static void ib_port_gid_attr_release(struct kobject *kobj)
+{
+ struct gid_attr_group *g = container_of(kobj, struct gid_attr_group,
+ kobj);
+ struct attribute *a;
+ int i;
+
+ if (g->ndev.attrs) {
+ for (i = 0; (a = g->ndev.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(g->ndev.attrs);
+ }
+
+ if (g->type.attrs) {
+ for (i = 0; (a = g->type.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(g->type.attrs);
+ }
+
+ kfree(g);
+}
+
+static struct kobj_type port_type = {
+ .release = ib_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+ .default_attrs = port_default_attrs
+};
+
+static struct kobj_type gid_attr_type = {
+ .sysfs_ops = &gid_attr_sysfs_ops,
+ .release = ib_port_gid_attr_release
+};
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+ struct port_attribute *, char *buf),
+ int len)
+{
+ struct attribute **tab_attr;
+ struct port_table_attribute *element;
+ int i;
+
+ tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
+ if (!tab_attr)
+ return NULL;
+
+ for (i = 0; i < len; i++) {
+ element = kzalloc(sizeof(struct port_table_attribute),
+ GFP_KERNEL);
+ if (!element)
+ goto err;
+
+ if (snprintf(element->name, sizeof(element->name),
+ "%d", i) >= sizeof(element->name)) {
+ kfree(element);
+ goto err;
+ }
+
+ element->attr.attr.name = element->name;
+ element->attr.attr.mode = S_IRUGO;
+ element->attr.show = show;
+ element->index = i;
+ sysfs_attr_init(&element->attr.attr);
+
+ tab_attr[i] = &element->attr.attr;
+ }
+
+ return tab_attr;
+
+err:
+ while (--i >= 0)
+ kfree(tab_attr[i]);
+ kfree(tab_attr);
+ return NULL;
+}
+
+/*
+ * Figure out which counter table to use depending on
+ * the device capabilities.
+ */
+static struct attribute_group *get_counter_table(struct ib_device *dev,
+ int port_num)
+{
+ struct ib_class_port_info cpi;
+
+ if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO,
+ &cpi, 40, sizeof(cpi)) >= 0) {
+ if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH)
+ /* We have extended counters */
+ return &pma_group_ext;
+
+ if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF)
+ /* But not the IETF ones */
+ return &pma_group_noietf;
+ }
+
+ /* Fall back to normal counters */
+ return &pma_group;
+}
+
+static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
+ u8 port_num, int index)
+{
+ int ret;
+
+ if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
+ return 0;
+ ret = dev->get_hw_stats(dev, stats, port_num, index);
+ if (ret < 0)
+ return ret;
+ if (ret == stats->num_counters)
+ stats->timestamp = jiffies;
+
+ return 0;
+}
+
+static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)stats->value[index]);
+}
+
+static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct ib_device *dev;
+ struct ib_port *port;
+ struct hw_stats_attribute *hsa;
+ struct rdma_hw_stats *stats;
+ int ret;
+
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+ stats = dev->hw_stats;
+ } else {
+ port = container_of(kobj, struct ib_port, kobj);
+ dev = port->ibdev;
+ stats = port->hw_stats;
+ }
+ ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
+ if (ret)
+ return ret;
+ return print_hw_stat(stats, hsa->index, buf);
+}
+
+static ssize_t show_stats_lifespan(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct hw_stats_attribute *hsa;
+ int msecs;
+
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ struct ib_device *dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+ msecs = jiffies_to_msecs(dev->hw_stats->lifespan);
+ } else {
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ msecs = jiffies_to_msecs(p->hw_stats->lifespan);
+ }
+ return sprintf(buf, "%d\n", msecs);
+}
+
+static ssize_t set_stats_lifespan(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hw_stats_attribute *hsa;
+ int msecs;
+ int jiffies;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &msecs);
+ if (ret)
+ return ret;
+ if (msecs < 0 || msecs > 10000)
+ return -EINVAL;
+ jiffies = msecs_to_jiffies(msecs);
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ struct ib_device *dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+ dev->hw_stats->lifespan = jiffies;
+ } else {
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ p->hw_stats->lifespan = jiffies;
+ }
+ return count;
+}
+
+static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group)
+{
+ struct attribute **attr;
+
+ sysfs_remove_group(kobj, attr_group);
+
+ for (attr = attr_group->attrs; *attr; attr++)
+ kfree(*attr);
+ kfree(attr_group);
+}
+
+static struct attribute *alloc_hsa(int index, u8 port_num, const char *name)
+{
+ struct hw_stats_attribute *hsa;
+
+ hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+ if (!hsa)
+ return NULL;
+
+ hsa->attr.name = __DECONST(char *, name);
+ hsa->attr.mode = S_IRUGO;
+ hsa->show = show_hw_stats;
+ hsa->store = NULL;
+ hsa->index = index;
+ hsa->port_num = port_num;
+
+ return &hsa->attr;
+}
+
+static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num)
+{
+ struct hw_stats_attribute *hsa;
+
+ hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+ if (!hsa)
+ return NULL;
+
+ hsa->attr.name = name;
+ hsa->attr.mode = S_IWUSR | S_IRUGO;
+ hsa->show = show_stats_lifespan;
+ hsa->store = set_stats_lifespan;
+ hsa->index = 0;
+ hsa->port_num = port_num;
+
+ return &hsa->attr;
+}
+
+static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
+ u8 port_num)
+{
+ struct attribute_group *hsag;
+ struct rdma_hw_stats *stats;
+ int i, ret;
+
+ stats = device->alloc_hw_stats(device, port_num);
+
+ if (!stats)
+ return;
+
+ if (!stats->names || stats->num_counters <= 0)
+ goto err_free_stats;
+
+ /*
+	 * Two extra attribute elements here, one for the lifespan entry and
+ * one to NULL terminate the list for the sysfs core code
+ */
+ hsag = kzalloc(sizeof(*hsag) +
+ sizeof(void *) * (stats->num_counters + 2),
+ GFP_KERNEL);
+ if (!hsag)
+ goto err_free_stats;
+
+ ret = device->get_hw_stats(device, stats, port_num,
+ stats->num_counters);
+ if (ret != stats->num_counters)
+ goto err_free_hsag;
+
+ stats->timestamp = jiffies;
+
+ hsag->name = "hw_counters";
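+	/*
+	 * The attribute pointer array lives in the same allocation,
+	 * immediately after the group structure itself.
+	 */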
+ hsag->attrs = (void *)((char *)hsag + sizeof(*hsag));
+
+ for (i = 0; i < stats->num_counters; i++) {
+ hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]);
+ if (!hsag->attrs[i])
+ goto err;
+ sysfs_attr_init(hsag->attrs[i]);
+ }
+
+ /* treat an error here as non-fatal */
+ hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num);
+ if (hsag->attrs[i])
+ sysfs_attr_init(hsag->attrs[i]);
+
+ if (port) {
+ struct kobject *kobj = &port->kobj;
+ ret = sysfs_create_group(kobj, hsag);
+ if (ret)
+ goto err;
+ port->hw_stats_ag = hsag;
+ port->hw_stats = stats;
+ } else {
+ struct kobject *kobj = &device->dev.kobj;
+ ret = sysfs_create_group(kobj, hsag);
+ if (ret)
+ goto err;
+ device->hw_stats_ag = hsag;
+ device->hw_stats = stats;
+ }
+
+ return;
+
+err:
+ for (; i >= 0; i--)
+ kfree(hsag->attrs[i]);
+err_free_hsag:
+ kfree(hsag);
+err_free_stats:
+ kfree(stats);
+ return;
+}
+
+static int add_port(struct ib_device *device, int port_num,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ struct ib_port *p;
+ struct ib_port_attr attr;
+ int i;
+ int ret;
+
+ ret = ib_query_port(device, port_num, &attr);
+ if (ret)
+ return ret;
+
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->ibdev = device;
+ p->port_num = port_num;
+
+ ret = kobject_init_and_add(&p->kobj, &port_type,
+ device->ports_parent,
+ "%d", port_num);
+ if (ret) {
+ kfree(p);
+ return ret;
+ }
+
+ p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL);
+ if (!p->gid_attr_group) {
+ ret = -ENOMEM;
+ goto err_put;
+ }
+
+ p->gid_attr_group->port = p;
+ ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type,
+ &p->kobj, "gid_attrs");
+ if (ret) {
+ kfree(p->gid_attr_group);
+ goto err_put;
+ }
+
+ p->pma_table = get_counter_table(device, port_num);
+ ret = sysfs_create_group(&p->kobj, p->pma_table);
+ if (ret)
+ goto err_put_gid_attrs;
+
+ p->gid_group.name = "gids";
+ p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+ if (!p->gid_group.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_pma;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ p->gid_attr_group->ndev.name = "ndevs";
+ p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev,
+ attr.gid_tbl_len);
+ if (!p->gid_attr_group->ndev.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid;
+ }
+
+ ret = sysfs_create_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->ndev);
+ if (ret)
+ goto err_free_gid_ndev;
+
+ p->gid_attr_group->type.name = "types";
+ p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type,
+ attr.gid_tbl_len);
+ if (!p->gid_attr_group->type.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid_ndev;
+ }
+
+ ret = sysfs_create_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->type);
+ if (ret)
+ goto err_free_gid_type;
+
+ p->pkey_group.name = "pkeys";
+ p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+ attr.pkey_tbl_len);
+ if (!p->pkey_group.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid_type;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ if (port_callback) {
+ ret = port_callback(device, port_num, &p->kobj);
+ if (ret)
+ goto err_remove_pkey;
+ }
+
+ /*
+	 * If port_num == 0, the device has only this one (switch) port and
+	 * the parent device, not this port's kobject, should be the holder
+	 * of the hw_counters.
+ */
+ if (device->alloc_hw_stats && port_num)
+ setup_hw_stats(device, p, port_num);
+
+ list_add_tail(&p->kobj.entry, &device->port_list);
+
+ return 0;
+
+err_remove_pkey:
+ sysfs_remove_group(&p->kobj, &p->pkey_group);
+
+err_free_pkey:
+ for (i = 0; i < attr.pkey_tbl_len; ++i)
+ kfree(p->pkey_group.attrs[i]);
+
+ kfree(p->pkey_group.attrs);
+ p->pkey_group.attrs = NULL;
+
+err_remove_gid_type:
+ sysfs_remove_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->type);
+
+err_free_gid_type:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_attr_group->type.attrs[i]);
+
+ kfree(p->gid_attr_group->type.attrs);
+ p->gid_attr_group->type.attrs = NULL;
+
+err_remove_gid_ndev:
+ sysfs_remove_group(&p->gid_attr_group->kobj,
+ &p->gid_attr_group->ndev);
+
+err_free_gid_ndev:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_attr_group->ndev.attrs[i]);
+
+ kfree(p->gid_attr_group->ndev.attrs);
+ p->gid_attr_group->ndev.attrs = NULL;
+
+err_remove_gid:
+ sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_group.attrs[i]);
+
+ kfree(p->gid_group.attrs);
+ p->gid_group.attrs = NULL;
+
+err_remove_pma:
+ sysfs_remove_group(&p->kobj, p->pma_table);
+
+err_put_gid_attrs:
+ kobject_put(&p->gid_attr_group->kobj);
+
+err_put:
+ kobject_put(&p->kobj);
+ return ret;
+}
+
+static ssize_t show_node_type(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ switch (dev->node_type) {
+ case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
+ case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
+ case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type);
+ case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+ case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+ case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+ default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+ }
+}
+
+static ssize_t show_sys_image_guid(struct device *device,
+ struct device_attribute *dev_attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]),
+ be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
+}
+
+static ssize_t show_node_desc(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%.64s\n", dev->node_desc);
+}
+
+static ssize_t set_node_desc(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device_modify desc = {};
+ int ret;
+
+ if (!dev->modify_device)
+ return -EIO;
+
+ memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX));
+ ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ ib_get_device_fw_str(dev, buf, PAGE_SIZE);
+ strlcat(buf, "\n", PAGE_SIZE);
+ return strlen(buf);
+}
+
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+
+static struct device_attribute *ib_class_attributes[] = {
+ &dev_attr_node_type,
+ &dev_attr_sys_image_guid,
+ &dev_attr_node_guid,
+ &dev_attr_node_desc,
+ &dev_attr_fw_ver,
+};
+
+static void free_port_list_attributes(struct ib_device *device)
+{
+ struct kobject *p, *t;
+
+ list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ struct ib_port *port = container_of(p, struct ib_port, kobj);
+ list_del(&p->entry);
+ if (port->hw_stats) {
+ kfree(port->hw_stats);
+ free_hsag(&port->kobj, port->hw_stats_ag);
+ }
+ sysfs_remove_group(p, port->pma_table);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ sysfs_remove_group(&port->gid_attr_group->kobj,
+ &port->gid_attr_group->ndev);
+ sysfs_remove_group(&port->gid_attr_group->kobj,
+ &port->gid_attr_group->type);
+ kobject_put(&port->gid_attr_group->kobj);
+ kobject_put(p);
+ }
+
+ kobject_put(device->ports_parent);
+}
+
+int ib_device_register_sysfs(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ struct device *class_dev = &device->dev;
+ int ret;
+ int i;
+
+ device->dev.parent = device->dma_device;
+ ret = dev_set_name(class_dev, "%s", device->name);
+ if (ret)
+ return ret;
+
+ ret = device_add(class_dev);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+ ret = device_create_file(class_dev, ib_class_attributes[i]);
+ if (ret)
+ goto err_unregister;
+ }
+
+ device->ports_parent = kobject_create_and_add("ports",
+ &class_dev->kobj);
+ if (!device->ports_parent) {
+ ret = -ENOMEM;
+ goto err_put;
+ }
+
+ if (rdma_cap_ib_switch(device)) {
+ ret = add_port(device, 0, port_callback);
+ if (ret)
+ goto err_put;
+ } else {
+ for (i = 1; i <= device->phys_port_cnt; ++i) {
+ ret = add_port(device, i, port_callback);
+ if (ret)
+ goto err_put;
+ }
+ }
+
+ if (device->alloc_hw_stats)
+ setup_hw_stats(device, NULL, 0);
+
+ return 0;
+
+err_put:
+ free_port_list_attributes(device);
+
+err_unregister:
+ device_unregister(class_dev);
+
+err:
+ return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+ int i;
+
+ /* Hold kobject until ib_dealloc_device() */
+ kobject_get(&device->dev.kobj);
+
+ free_port_list_attributes(device);
+
+ if (device->hw_stats) {
+ kfree(device->hw_stats);
+ free_hsag(&device->dev.kobj, device->hw_stats_ag);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
+ device_remove_file(&device->dev, ib_class_attributes[i]);
+
+ device_unregister(&device->dev);
+}
diff --git a/sys/ofed/drivers/infiniband/core/ib_umem.c b/sys/ofed/drivers/infiniband/core/ib_umem.c
new file mode 100644
index 0000000000000..75fddf08216dd
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_umem.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define LINUXKPI_PARAM_PREFIX ibcore_
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "uverbs.h"
+
+#include <sys/priv.h>
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+ struct scatterlist *sg;
+ struct page *page;
+ int i;
+
+ if (umem->nmap > 0)
+ ib_dma_unmap_sg(dev, umem->sg_head.sgl,
+ umem->nmap,
+ DMA_BIDIRECTIONAL);
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
+
+ page = sg_page(sg);
+ if (umem->writable && dirty)
+ set_page_dirty_lock(page);
+ put_page(page);
+ }
+
+ sg_free_table(&umem->sg_head);
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * If the access flags indicate ODP memory, avoid pinning. Instead, store
+ * the mm for future page-fault handling in conjunction with MMU notifiers.
+ *
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync)
+{
+ struct ib_umem *umem;
+ struct page **page_list;
+ struct vm_area_struct **vma_list;
+ unsigned long locked;
+ unsigned long cur_base;
+ unsigned long npages;
+ int ret;
+ int i;
+ struct dma_attrs dma_attrs = { 0 };
+ struct scatterlist *sg, *sg_list_start;
+ int need_release = 0;
+ unsigned int gup_flags = FOLL_WRITE;
+
+ if (dmasync)
+ dma_attrs.flags |= DMA_ATTR_WRITE_BARRIER;
+
+ if (!size)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * If the combination of the addr and size requested for this memory
+ * region causes an integer overflow, return error.
+ */
+ if (((addr + size) < addr) ||
+ PAGE_ALIGN(addr + size) < (addr + size))
+ return ERR_PTR(-EINVAL);
+
+ if (priv_check(curthread, PRIV_VM_MLOCK) != 0)
+ return ERR_PTR(-EPERM);
+
+ umem = kzalloc(sizeof *umem, GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ umem->context = context;
+ umem->length = size;
+ umem->address = addr;
+ umem->page_size = PAGE_SIZE;
+ umem->pid = get_pid(task_pid(current));
+ /*
+ * We ask for writable memory if any of the following
+ * access flags are set. "Local write" and "remote write"
+ * obviously require write access. "Remote atomic" can do
+ * things like fetch and add, which will modify memory, and
+ * "MW bind" can change permissions by binding a window.
+ */
+ umem->writable = !!(access &
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
+
+ if (access & IB_ACCESS_ON_DEMAND) {
+ ret = ib_umem_odp_get(context, umem);
+ if (ret) {
+ kfree(umem);
+ return ERR_PTR(ret);
+ }
+ return umem;
+ }
+
+ umem->odp_data = NULL;
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list) {
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+
+ npages = ib_umem_num_pages(umem);
+
+ down_write(&current->mm->mmap_sem);
+
+ locked = npages + current->mm->pinned_vm;
+
+ cur_base = addr & PAGE_MASK;
+
+ if (npages == 0 || npages > UINT_MAX) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ if (!umem->writable)
+ gup_flags |= FOLL_FORCE;
+
+ need_release = 1;
+ sg_list_start = umem->sg_head.sgl;
+
+ while (npages) {
+ ret = get_user_pages(cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof (struct page *)),
+ gup_flags, page_list, vma_list);
+
+ if (ret < 0)
+ goto out;
+
+ umem->npages += ret;
+ cur_base += ret * PAGE_SIZE;
+ npages -= ret;
+
+ for_each_sg(sg_list_start, sg, ret, i) {
+ sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+ }
+
+ /* prepare for the next loop iteration */
+ sg_list_start = sg;
+ }
+
+ umem->nmap = ib_dma_map_sg_attrs(context->device,
+ umem->sg_head.sgl,
+ umem->npages,
+ DMA_BIDIRECTIONAL,
+ &dma_attrs);
+
+ if (umem->nmap <= 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ if (need_release)
+ __ib_umem_release(context->device, umem, 0);
+ put_pid(umem->pid);
+ kfree(umem);
+ } else
+ current->mm->pinned_vm = locked;
+
+ up_write(&current->mm->mmap_sem);
+ if (vma_list)
+ free_page((unsigned long) vma_list);
+ free_page((unsigned long) page_list);
+
+ return ret < 0 ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_get);
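+
+/*
+ * Minimal usage sketch (hypothetical driver code; "my_mr" and
+ * "my_build_page_tables" are placeholder names): a provider's reg_user_mr
+ * path typically pins the user range with ib_umem_get(), sizes its
+ * translation tables from the pinned pages, and unpins with
+ * ib_umem_release() on any failure.
+ *
+ *   struct ib_umem *umem;
+ *   int err, npages;
+ *
+ *   umem = ib_umem_get(pd->uobject->context, start, length, access, 0);
+ *   if (IS_ERR(umem))
+ *           return ERR_CAST(umem);
+ *
+ *   npages = ib_umem_page_count(umem);
+ *   err = my_build_page_tables(my_mr, umem, npages);
+ *   if (err) {
+ *           ib_umem_release(umem);
+ *           return ERR_PTR(err);
+ *   }
+ *   my_mr->umem = umem;
+ */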
+
+static void ib_umem_account(struct work_struct *work)
+{
+ struct ib_umem *umem = container_of(work, struct ib_umem, work);
+
+ down_write(&umem->mm->mmap_sem);
+ umem->mm->pinned_vm -= umem->diff;
+ up_write(&umem->mm->mmap_sem);
+ mmput(umem->mm);
+ kfree(umem);
+}
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ unsigned long diff;
+
+ if (umem->odp_data) {
+ ib_umem_odp_release(umem);
+ return;
+ }
+
+ __ib_umem_release(umem->context->device, umem, 1);
+
+ task = get_pid_task(umem->pid, PIDTYPE_PID);
+ put_pid(umem->pid);
+ if (!task)
+ goto out;
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ diff = ib_umem_num_pages(umem);
+
+ /*
+ * We may be called with the mm's mmap_sem already held. This
+ * can happen when a userspace munmap() is the call that drops
+ * the last reference to our file and calls our release
+ * method. If there are memory regions to destroy, we'll end
+ * up here and not be able to take the mmap_sem. In that case
+ * we defer the pinned_vm accounting to the system workqueue.
+ */
+ if (context->closing) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ INIT_WORK(&umem->work, ib_umem_account);
+ umem->mm = mm;
+ umem->diff = diff;
+
+ queue_work(ib_wq, &umem->work);
+ return;
+ }
+ } else
+ down_write(&mm->mmap_sem);
+
+ mm->pinned_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+out:
+ kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+ int shift;
+ int i;
+ int n;
+ struct scatterlist *sg;
+
+ if (umem->odp_data)
+ return ib_umem_num_pages(umem);
+
+ shift = ilog2(umem->page_size);
+
+ n = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
+ n += sg_dma_len(sg) >> shift;
+
+ return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
+
+/*
+ * Copy from the given ib_umem's pages to the given buffer.
+ *
+ * umem - the umem to copy from
+ * offset - offset to start copying from
+ * dst - destination buffer
+ * length - buffer length
+ *
+ * Returns 0 on success, or an error code.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length)
+{
+ size_t end = offset + length;
+ int ret;
+
+ if (offset > umem->length || length > umem->length - offset) {
+ pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
+ offset, umem->length, end);
+ return -EINVAL;
+ }
+
+#ifdef __linux__
+ ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
+ offset + ib_umem_offset(umem));
+#else
+ ret = 0;
+#endif
+ if (ret < 0)
+ return ret;
+ else if (ret != length)
+ return -EINVAL;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
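+
+/*
+ * Usage sketch (illustrative only): copy the first bytes of a pinned
+ * region into a kernel buffer, e.g. to inspect a header written there by
+ * userspace. Note that in this port the actual copy is only compiled
+ * under __linux__; elsewhere the stubbed path above makes any non-zero
+ * length fail with -EINVAL.
+ *
+ *   char hdr[64];
+ *   int err;
+ *
+ *   err = ib_umem_copy_from(hdr, umem, 0, sizeof(hdr));
+ *   if (err)
+ *           return err;
+ */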
diff --git a/sys/ofed/drivers/infiniband/core/ib_umem_odp.c b/sys/ofed/drivers/infiniband/core/ib_umem_odp.c
new file mode 100644
index 0000000000000..bf00214d7a936
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_umem_odp.c
@@ -0,0 +1,667 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+static void ib_umem_notifier_start_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ int notifiers_count = item->odp_data->notifiers_count++;
+
+ if (notifiers_count == 0)
+ /* Initialize the completion object for waiting on
+ * notifiers. Since notifier_count is zero, no one
+ * should be waiting right now. */
+ reinit_completion(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+static void ib_umem_notifier_end_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ /*
+ * Bumping the sequence number notifies a concurrent QP page
+ * fault that the page about to be mapped in the spte could
+ * have been freed.
+ */
+ ++item->odp_data->notifiers_seq;
+ if (--item->odp_data->notifiers_count == 0)
+ complete_all(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+/* Account for a new mmu notifier in an ib_ucontext. */
+static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+{
+ atomic_inc(&context->notifier_count);
+}
+
+/* Account for a terminating mmu notifier in an ib_ucontext.
+ *
+ * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
+ * the function takes the semaphore itself. */
+static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
+{
+ int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
+
+ if (zero_notifiers &&
+ !list_empty(&context->no_private_counters)) {
+ /* No currently running mmu notifiers. Now is the chance to
+ * add private accounting to all previously added umems. */
+ struct ib_umem_odp *odp_data, *next;
+
+ /* Prevent concurrent mmu notifiers from working on the
+ * no_private_counters list. */
+ down_write(&context->umem_rwsem);
+
+ /* Read the notifier_count again, with the umem_rwsem
+ * semaphore taken for write. */
+ if (!atomic_read(&context->notifier_count)) {
+ list_for_each_entry_safe(odp_data, next,
+ &context->no_private_counters,
+ no_private_counters) {
+ mutex_lock(&odp_data->umem_mutex);
+ odp_data->mn_counters_active = true;
+ list_del(&odp_data->no_private_counters);
+ complete_all(&odp_data->notifier_completion);
+ mutex_unlock(&odp_data->umem_mutex);
+ }
+ }
+
+ up_write(&context->umem_rwsem);
+ }
+}
+
+static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie) {
+ /*
+ * Increase the number of notifiers running, to
+ * prevent any further fault handling on this MR.
+ */
+ ib_umem_notifier_start_account(item);
+ item->odp_data->dying = 1;
+ /* Make sure the dying flag is visible before we release
+ * all pending page faults. */
+ smp_wmb();
+ complete_all(&item->odp_data->notifier_completion);
+ item->context->invalidate_range(item, ib_umem_start(item),
+ ib_umem_end(item));
+ return 0;
+}
+
+static void ib_umem_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
+ ULLONG_MAX,
+ ib_umem_notifier_release_trampoline,
+ NULL);
+ up_read(&context->umem_rwsem);
+}
+
+static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, start + PAGE_SIZE);
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
+ address + PAGE_SIZE,
+ invalidate_page_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
+static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, end);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_start_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+}
+
+static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_end_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
+static const struct mmu_notifier_ops ib_umem_notifiers = {
+ .release = ib_umem_notifier_release,
+ .invalidate_page = ib_umem_notifier_invalidate_page,
+ .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
+ .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
+};
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
+{
+ int ret_val;
+ pid_t our_pid;
+ struct mm_struct *mm = get_task_mm(current);
+
+ if (!mm)
+ return -EINVAL;
+
+ /* Prevent creating ODP MRs in child processes */
+ rcu_read_lock();
+ our_pid = get_pid(task_pid_group_leader(current));
+ rcu_read_unlock();
+ put_pid(our_pid);
+ if (context->tgid != our_pid) {
+ ret_val = -EINVAL;
+ goto out_mm;
+ }
+
+ umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
+ if (!umem->odp_data) {
+ ret_val = -ENOMEM;
+ goto out_mm;
+ }
+ umem->odp_data->umem = umem;
+
+ mutex_init(&umem->odp_data->umem_mutex);
+
+ init_completion(&umem->odp_data->notifier_completion);
+
+ umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->page_list));
+ if (!umem->odp_data->page_list) {
+ ret_val = -ENOMEM;
+ goto out_odp_data;
+ }
+
+ umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->dma_list));
+ if (!umem->odp_data->dma_list) {
+ ret_val = -ENOMEM;
+ goto out_page_list;
+ }
+
+ /*
+ * When using MMU notifiers, we will get a
+ * notification before the "current" task (and MM) is
+ * destroyed. We use the umem_rwsem semaphore to synchronize.
+ */
+ down_write(&context->umem_rwsem);
+ context->odp_mrs_count++;
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ if (likely(!atomic_read(&context->notifier_count)) ||
+ context->odp_mrs_count == 1)
+ umem->odp_data->mn_counters_active = true;
+ else
+ list_add(&umem->odp_data->no_private_counters,
+ &context->no_private_counters);
+ downgrade_write(&context->umem_rwsem);
+
+ if (context->odp_mrs_count == 1) {
+ /*
+ * Note that at this point, no MMU notifier is running
+ * for this context!
+ */
+ atomic_set(&context->notifier_count, 0);
+ INIT_HLIST_NODE(&context->mn.hlist);
+ context->mn.ops = &ib_umem_notifiers;
+ /*
+ * Lockdep reports a false positive for mmap_sem vs.
+ * umem_rwsem because it does not model downgrade_write correctly.
+ */
+ ret_val = mmu_notifier_register(&context->mn, mm);
+ if (ret_val) {
+ pr_err("Failed to register mmu_notifier %d\n", ret_val);
+ ret_val = -EBUSY;
+ goto out_mutex;
+ }
+ }
+
+ up_read(&context->umem_rwsem);
+
+ /*
+ * Note that doing an mmput can cause a notifier for the relevant mm.
+ * If the notifier is called while we hold the umem_rwsem, this will
+ * cause a deadlock. Therefore, we release the reference only after we
+ * have released the semaphore.
+ */
+ mmput(mm);
+ return 0;
+
+out_mutex:
+ up_read(&context->umem_rwsem);
+ vfree(umem->odp_data->dma_list);
+out_page_list:
+ vfree(umem->odp_data->page_list);
+out_odp_data:
+ kfree(umem->odp_data);
+out_mm:
+ mmput(mm);
+ return ret_val;
+}
+
+void ib_umem_odp_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+
+ /*
+ * Ensure that no more pages are mapped in the umem.
+ *
+ * It is the driver's responsibility to ensure, before calling us,
+ * that the hardware will not attempt to access the MR any more.
+ */
+ ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
+ ib_umem_end(umem));
+
+ down_write(&context->umem_rwsem);
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ context->odp_mrs_count--;
+ if (!umem->odp_data->mn_counters_active) {
+ list_del(&umem->odp_data->no_private_counters);
+ complete_all(&umem->odp_data->notifier_completion);
+ }
+
+ /*
+ * Downgrade the lock to a read lock. This ensures that the notifiers
+ * (which take the semaphore for reading) will be able to finish, and we
+ * will eventually be able to obtain the mmu notifiers' SRCU. Note
+ * that since we are doing it atomically, no other user could register
+ * and unregister while we do the check.
+ */
+ downgrade_write(&context->umem_rwsem);
+ if (!context->odp_mrs_count) {
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+
+ owning_process = get_pid_task(context->tgid,
+ PIDTYPE_PID);
+ if (owning_process == NULL)
+ /*
+ * The process is already dead; the notifiers were
+ * already removed.
+ */
+ goto out;
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL)
+ /*
+ * The process' mm is already dead; the notifiers
+ * were already removed.
+ */
+ goto out_put_task;
+ mmu_notifier_unregister(&context->mn, owning_mm);
+
+ mmput(owning_mm);
+
+out_put_task:
+ put_task_struct(owning_process);
+ }
+out:
+ up_read(&context->umem_rwsem);
+
+ vfree(umem->odp_data->dma_list);
+ vfree(umem->odp_data->page_list);
+ kfree(umem->odp_data);
+ kfree(umem);
+}
+
+/*
+ * Map for DMA and insert a single page into the on-demand paging page tables.
+ *
+ * @umem: the umem to insert the page to.
+ * @page_index: index in the umem to add the page to.
+ * @page: the page struct to map and add.
+ * @access_mask: access permissions needed for this page.
+ * @current_seq: sequence number for synchronization with invalidations.
+ * the sequence number is taken from
+ * umem->odp_data->notifiers_seq.
+ *
+ * The function returns -EFAULT if the DMA mapping operation fails. It returns
+ * -EAGAIN if a concurrent invalidation prevents us from updating the page.
+ *
+ * The page is released via put_page even if the operation failed. For
+ * on-demand pinning, the page is released whenever it isn't stored in the
+ * umem.
+ */
+static int ib_umem_odp_map_dma_single_page(
+ struct ib_umem *umem,
+ int page_index,
+ u64 base_virt_addr,
+ struct page *page,
+ u64 access_mask,
+ unsigned long current_seq)
+{
+ struct ib_device *dev = umem->context->device;
+ dma_addr_t dma_addr;
+ int stored_page = 0;
+ int remove_existing_mapping = 0;
+ int ret = 0;
+
+ /*
+ * Note: we avoid writing if seq is different from the initial seq, to
+ * handle the case of a racing notifier. This check also allows us to bail
+ * early if we have a notifier running in parallel with us.
+ */
+ if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (!(umem->odp_data->dma_list[page_index])) {
+ dma_addr = ib_dma_map_page(dev,
+ page,
+ 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, dma_addr)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
+ umem->odp_data->page_list[page_index] = page;
+ stored_page = 1;
+ } else if (umem->odp_data->page_list[page_index] == page) {
+ umem->odp_data->dma_list[page_index] |= access_mask;
+ } else {
+ pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
+ umem->odp_data->page_list[page_index], page);
+ /* Better remove the mapping now, to prevent any further
+ * damage. */
+ remove_existing_mapping = 1;
+ }
+
+out:
+ /* On Demand Paging - avoid pinning the page */
+ if (umem->context->invalidate_range || !stored_page)
+ put_page(page);
+
+ if (remove_existing_mapping && umem->context->invalidate_range) {
+ invalidate_page_trampoline(
+ umem,
+ base_virt_addr + (page_index * PAGE_SIZE),
+ base_virt_addr + ((page_index+1)*PAGE_SIZE),
+ NULL);
+ ret = -EAGAIN;
+ }
+
+ return ret;
+}
+
+/**
+ * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
+ *
+ * Pins the range of pages passed in the argument, and maps them to
+ * DMA addresses. The DMA addresses of the mapped pages is updated in
+ * umem->odp_data->dma_list.
+ *
+ * Returns the number of pages mapped on success, or a negative error code
+ * on failure.
+ * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
+ * the function from completing its task.
+ *
+ * @umem: the umem to map and pin
+ * @user_virt: the address from which we need to map.
+ * @bcnt: the minimal number of bytes to pin and map. The mapping might be
+ * bigger due to alignment, and may also be smaller in case of an error
+ * pinning or mapping a page. The actual number of pages mapped is
+ * returned in the return value.
+ * @access_mask: bit mask of the requested access permissions for the given
+ * range.
+ * @current_seq: the MMU notifiers' sequence value for synchronization with
+ * invalidations. The sequence number is read from
+ * umem->odp_data->notifiers_seq before calling this function
+ */
+int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
+ u64 access_mask, unsigned long current_seq)
+{
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+ struct page **local_page_list = NULL;
+ u64 off;
+ int j, k, ret = 0, start_idx, npages = 0;
+ u64 base_virt_addr;
+ unsigned int flags = 0;
+
+ if (access_mask == 0)
+ return -EINVAL;
+
+ if (user_virt < ib_umem_start(umem) ||
+ user_virt + bcnt > ib_umem_end(umem))
+ return -EFAULT;
+
+ local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
+ if (!local_page_list)
+ return -ENOMEM;
+
+ off = user_virt & (~PAGE_MASK);
+ user_virt = user_virt & PAGE_MASK;
+ base_virt_addr = user_virt;
+ bcnt += off; /* Charge for the first page offset as well. */
+
+ owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
+ if (owning_process == NULL) {
+ ret = -EINVAL;
+ goto out_no_task;
+ }
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL) {
+ ret = -EINVAL;
+ goto out_put_task;
+ }
+
+ if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ flags |= FOLL_WRITE;
+
+ start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
+ k = start_idx;
+
+ while (bcnt > 0) {
+ const size_t gup_num_pages =
+ min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
+ PAGE_SIZE / sizeof(struct page *));
+
+ down_read(&owning_mm->mmap_sem);
+ /*
+ * Note: this might result in redundant page getting. We can
+ * avoid this by checking that dma_list is 0 before calling
+ * get_user_pages. However, this makes the code much more
+ * complex (and doesn't gain us much performance in most use
+ * cases).
+ */
+ npages = get_user_pages_remote(owning_process, owning_mm,
+ user_virt, gup_num_pages,
+ flags, local_page_list, NULL);
+ up_read(&owning_mm->mmap_sem);
+
+ if (npages < 0)
+ break;
+
+ bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
+ user_virt += npages << PAGE_SHIFT;
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (j = 0; j < npages; ++j) {
+ ret = ib_umem_odp_map_dma_single_page(
+ umem, k, base_virt_addr, local_page_list[j],
+ access_mask, current_seq);
+ if (ret < 0)
+ break;
+ k++;
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+
+ if (ret < 0) {
+ /* Release left over pages when handling errors. */
+ for (++j; j < npages; ++j)
+ put_page(local_page_list[j]);
+ break;
+ }
+ }
+
+ if (ret >= 0) {
+ if (npages < 0 && k == start_idx)
+ ret = npages;
+ else
+ ret = k - start_idx;
+ }
+
+ mmput(owning_mm);
+out_put_task:
+ put_task_struct(owning_process);
+out_no_task:
+ free_page((unsigned long)local_page_list);
+ return ret;
+}
+EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
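+
+/*
+ * Page-fault sketch (hypothetical driver code; "my_update_hw_translation",
+ * "io_virt", "bcnt" and "mr" are placeholder names, and ODP_WRITE_ALLOWED_BIT
+ * is used as an example access mask): the caller samples notifiers_seq
+ * before faulting, maps the range, and only programs the device translation
+ * if no invalidation raced with the fault; -EAGAIN asks the caller to retry.
+ *
+ *   unsigned long seq;
+ *   int npages, err;
+ *
+ *   seq = umem->odp_data->notifiers_seq;
+ *   npages = ib_umem_odp_map_dma_pages(umem, io_virt, bcnt,
+ *                                      ODP_WRITE_ALLOWED_BIT, seq);
+ *   if (npages < 0)
+ *           return npages;
+ *
+ *   mutex_lock(&umem->odp_data->umem_mutex);
+ *   if (!ib_umem_mmu_notifier_retry(umem, seq))
+ *           err = my_update_hw_translation(mr, io_virt, npages);
+ *   mutex_unlock(&umem->odp_data->umem_mutex);
+ */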
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
+ u64 bound)
+{
+ int idx;
+ u64 addr;
+ struct ib_device *dev = umem->context->device;
+
+ virt = max_t(u64, virt, ib_umem_start(umem));
+ bound = min_t(u64, bound, ib_umem_end(umem));
+ /* Note that during the run of this function, the
+ * notifiers_count of the MR is > 0, preventing any racing
+ * faults from completing. We might be racing with other
+ * invalidations, so we must make sure we free each page only
+ * once. */
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
+ idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+ if (umem->odp_data->page_list[idx]) {
+ struct page *page = umem->odp_data->page_list[idx];
+ dma_addr_t dma = umem->odp_data->dma_list[idx];
+ dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
+
+ WARN_ON(!dma_addr);
+
+ ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ if (dma & ODP_WRITE_ALLOWED_BIT) {
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
+ }
+ /* on demand pinning support */
+ if (!umem->context->invalidate_range)
+ put_page(page);
+ umem->odp_data->page_list[idx] = NULL;
+ umem->odp_data->dma_list[idx] = 0;
+ }
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+}
+EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
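+
+/*
+ * Sketch of the invalidate_range callback an ODP-capable provider hangs
+ * off its ib_ucontext (illustrative; "my_invalidate_hw_range" is a
+ * hypothetical helper): first make sure the hardware can no longer reach
+ * the range, then drop the corresponding DMA mappings.
+ *
+ *   static void my_invalidate_range(struct ib_umem *umem,
+ *                                   unsigned long start, unsigned long end)
+ *   {
+ *           my_invalidate_hw_range(umem, start, end);
+ *           ib_umem_odp_unmap_dma_pages(umem, start, end);
+ *   }
+ */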
diff --git a/sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c b/sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c
new file mode 100644
index 0000000000000..c9277299ec910
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_umem_rbtree.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <rdma/ib_umem_odp.h>
+
+/*
+ * The ib_umem list keeps track of memory regions for which the HW
+ * device has requested to receive notifications when the related memory
+ * mapping is changed.
+ *
+ * ib_umem_lock protects the list.
+ */
+
+static inline u64 node_start(struct umem_odp_node *n)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(n, struct ib_umem_odp, interval_tree);
+
+ return ib_umem_start(umem_odp->umem);
+}
+
+/* Note that the representation of the intervals in the interval tree
+ * considers the ending point as contained in the interval, while the
+ * function ib_umem_end returns the first address which is not contained
+ * in the umem.
+ */
+static inline u64 node_last(struct umem_odp_node *n)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(n, struct ib_umem_odp, interval_tree);
+
+ return ib_umem_end(umem_odp->umem) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
+ node_start, node_last, , rbt_ib_umem)
+
+/* @last is not a part of the interval. See comment for function
+ * node_last.
+ */
+int rbt_ib_umem_for_each_in_range(struct rb_root *root,
+ u64 start, u64 last,
+ umem_call_back cb,
+ void *cookie)
+{
+ int ret_val = 0;
+ struct umem_odp_node *node;
+ struct ib_umem_odp *umem;
+
+ if (unlikely(start == last))
+ return ret_val;
+
+ for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
+ node = rbt_ib_umem_iter_next(node, start, last - 1)) {
+ umem = container_of(node, struct ib_umem_odp, interval_tree);
+ ret_val = cb(umem->umem, start, last, cookie) || ret_val;
+ }
+
+ return ret_val;
+}
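+
+/*
+ * Callback sketch (illustrative; "count_cb" and "hits" are placeholder
+ * names): count the umems intersecting a range. Callers in ib_umem_odp.c
+ * hold umem_rwsem around the walk.
+ *
+ *   static int count_cb(struct ib_umem *item, u64 start, u64 end, void *cookie)
+ *   {
+ *           (*(int *)cookie)++;
+ *           return 0;
+ *   }
+ *
+ *   int hits = 0;
+ *
+ *   rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end,
+ *                                 count_cb, &hits);
+ */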
diff --git a/sys/ofed/drivers/infiniband/core/ib_verbs.c b/sys/ofed/drivers/infiniband/core/ib_verbs.c
new file mode 100644
index 0000000000000..1e0295af9ea0b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ib_verbs.c
@@ -0,0 +1,2066 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+
+#include <machine/in_cksum.h>
+
+#include "core_priv.h"
+
+static const char * const ib_events[] = {
+ [IB_EVENT_CQ_ERR] = "CQ error",
+ [IB_EVENT_QP_FATAL] = "QP fatal error",
+ [IB_EVENT_QP_REQ_ERR] = "QP request error",
+ [IB_EVENT_QP_ACCESS_ERR] = "QP access error",
+ [IB_EVENT_COMM_EST] = "communication established",
+ [IB_EVENT_SQ_DRAINED] = "send queue drained",
+ [IB_EVENT_PATH_MIG] = "path migration successful",
+ [IB_EVENT_PATH_MIG_ERR] = "path migration error",
+ [IB_EVENT_DEVICE_FATAL] = "device fatal error",
+ [IB_EVENT_PORT_ACTIVE] = "port active",
+ [IB_EVENT_PORT_ERR] = "port error",
+ [IB_EVENT_LID_CHANGE] = "LID change",
+ [IB_EVENT_PKEY_CHANGE] = "P_key change",
+ [IB_EVENT_SM_CHANGE] = "SM change",
+ [IB_EVENT_SRQ_ERR] = "SRQ error",
+ [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached",
+ [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached",
+ [IB_EVENT_CLIENT_REREGISTER] = "client reregister",
+ [IB_EVENT_GID_CHANGE] = "GID changed",
+};
+
+const char *__attribute_const__ ib_event_msg(enum ib_event_type event)
+{
+ size_t index = event;
+
+ return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ?
+ ib_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(ib_event_msg);
+
+static const char * const wc_statuses[] = {
+ [IB_WC_SUCCESS] = "success",
+ [IB_WC_LOC_LEN_ERR] = "local length error",
+ [IB_WC_LOC_QP_OP_ERR] = "local QP operation error",
+ [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error",
+ [IB_WC_LOC_PROT_ERR] = "local protection error",
+ [IB_WC_WR_FLUSH_ERR] = "WR flushed",
+ [IB_WC_MW_BIND_ERR] = "memory management operation error",
+ [IB_WC_BAD_RESP_ERR] = "bad response error",
+ [IB_WC_LOC_ACCESS_ERR] = "local access error",
+ [IB_WC_REM_INV_REQ_ERR] = "invalid request error",
+ [IB_WC_REM_ACCESS_ERR] = "remote access error",
+ [IB_WC_REM_OP_ERR] = "remote operation error",
+ [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded",
+ [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded",
+ [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error",
+ [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request",
+ [IB_WC_REM_ABORT_ERR] = "operation aborted",
+ [IB_WC_INV_EECN_ERR] = "invalid EE context number",
+ [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state",
+ [IB_WC_FATAL_ERR] = "fatal error",
+ [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error",
+ [IB_WC_GENERAL_ERR] = "general error",
+};
+
+const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status)
+{
+ size_t index = status;
+
+ return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ?
+ wc_statuses[index] : "unrecognized status";
+}
+EXPORT_SYMBOL(ib_wc_status_msg);
+
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 1;
+ case IB_RATE_5_GBPS: return 2;
+ case IB_RATE_10_GBPS: return 4;
+ case IB_RATE_20_GBPS: return 8;
+ case IB_RATE_30_GBPS: return 12;
+ case IB_RATE_40_GBPS: return 16;
+ case IB_RATE_60_GBPS: return 24;
+ case IB_RATE_80_GBPS: return 32;
+ case IB_RATE_120_GBPS: return 48;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+{
+ switch (mult) {
+ case 1: return IB_RATE_2_5_GBPS;
+ case 2: return IB_RATE_5_GBPS;
+ case 4: return IB_RATE_10_GBPS;
+ case 8: return IB_RATE_20_GBPS;
+ case 12: return IB_RATE_30_GBPS;
+ case 16: return IB_RATE_40_GBPS;
+ case 24: return IB_RATE_60_GBPS;
+ case 32: return IB_RATE_80_GBPS;
+ case 48: return IB_RATE_120_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 2500;
+ case IB_RATE_5_GBPS: return 5000;
+ case IB_RATE_10_GBPS: return 10000;
+ case IB_RATE_20_GBPS: return 20000;
+ case IB_RATE_30_GBPS: return 30000;
+ case IB_RATE_40_GBPS: return 40000;
+ case IB_RATE_60_GBPS: return 60000;
+ case IB_RATE_80_GBPS: return 80000;
+ case IB_RATE_120_GBPS: return 120000;
+ case IB_RATE_14_GBPS: return 14062;
+ case IB_RATE_56_GBPS: return 56250;
+ case IB_RATE_112_GBPS: return 112500;
+ case IB_RATE_168_GBPS: return 168750;
+ case IB_RATE_25_GBPS: return 25781;
+ case IB_RATE_100_GBPS: return 103125;
+ case IB_RATE_200_GBPS: return 206250;
+ case IB_RATE_300_GBPS: return 309375;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+ switch (node_type) {
+ case RDMA_NODE_IB_CA:
+ case RDMA_NODE_IB_SWITCH:
+ case RDMA_NODE_IB_ROUTER:
+ return RDMA_TRANSPORT_IB;
+ case RDMA_NODE_RNIC:
+ return RDMA_TRANSPORT_IWARP;
+ case RDMA_NODE_USNIC:
+ return RDMA_TRANSPORT_USNIC;
+ case RDMA_NODE_USNIC_UDP:
+ return RDMA_TRANSPORT_USNIC_UDP;
+ default:
+ BUG();
+ return 0;
+ }
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+ if (device->get_link_layer)
+ return device->get_link_layer(device, port_num);
+
+ switch (rdma_node_get_transport(device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ return IB_LINK_LAYER_INFINIBAND;
+ case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_USNIC:
+ case RDMA_TRANSPORT_USNIC_UDP:
+ return IB_LINK_LAYER_ETHERNET;
+ default:
+ return IB_LINK_LAYER_UNSPECIFIED;
+ }
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
+struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
+ const char *caller)
+{
+ struct ib_pd *pd;
+ int mr_access_flags = 0;
+
+ pd = device->alloc_pd(device, NULL, NULL);
+ if (IS_ERR(pd))
+ return pd;
+
+ pd->device = device;
+ pd->uobject = NULL;
+ pd->__internal_mr = NULL;
+ atomic_set(&pd->usecnt, 0);
+ pd->flags = flags;
+
+ if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ pd->local_dma_lkey = device->local_dma_lkey;
+ else
+ mr_access_flags |= IB_ACCESS_LOCAL_WRITE;
+
+ if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+ pr_warn("%s: enabling unsafe global rkey\n", caller);
+ mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
+ }
+
+ if (mr_access_flags) {
+ struct ib_mr *mr;
+
+ mr = pd->device->get_dma_mr(pd, mr_access_flags);
+ if (IS_ERR(mr)) {
+ ib_dealloc_pd(pd);
+ return ERR_CAST(mr);
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ mr->need_inval = false;
+
+ pd->__internal_mr = mr;
+
+ if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY))
+ pd->local_dma_lkey = pd->__internal_mr->lkey;
+
+ if (flags & IB_PD_UNSAFE_GLOBAL_RKEY)
+ pd->unsafe_global_rkey = pd->__internal_mr->rkey;
+ }
+
+ return pd;
+}
+EXPORT_SYMBOL(__ib_alloc_pd);
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist. The caller is responsible for synchronously destroying them and
+ * guaranteeing that no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
+{
+ int ret;
+
+ if (pd->__internal_mr) {
+ ret = pd->device->dereg_mr(pd->__internal_mr);
+ WARN_ON(ret);
+ pd->__internal_mr = NULL;
+ }
+
+ /* uverbs manipulates usecnt with proper locking, while the kabi
+ requires the caller to guarantee we can't race here. */
+ WARN_ON(atomic_read(&pd->usecnt));
+
+ /* Making dealloc_pd a void return is a WIP; no driver should return
+ an error here. */
+ ret = pd->device->dealloc_pd(pd);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
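+
+/*
+ * Usage sketch: kernel users normally go through the ib_alloc_pd() wrapper;
+ * calling __ib_alloc_pd() directly with a caller string is shown here only
+ * to match the symbol exported above.
+ *
+ *   struct ib_pd *pd;
+ *
+ *   pd = __ib_alloc_pd(device, 0, "example");
+ *   if (IS_ERR(pd))
+ *           return PTR_ERR(pd);
+ *
+ *   (create QPs, SRQs, AHs and MRs against pd, destroy them all, then:)
+ *   ib_dealloc_pd(pd);
+ */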
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct ib_ah *ah;
+
+ ah = pd->device->create_ah(pd, ah_attr);
+
+ if (!IS_ERR(ah)) {
+ ah->device = pd->device;
+ ah->pd = pd;
+ ah->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+static int ib_get_header_version(const union rdma_network_hdr *hdr)
+{
+ const struct ip *ip4h = (const struct ip *)&hdr->roce4grh;
+ struct ip ip4h_checked;
+ const struct ip6_hdr *ip6h = (const struct ip6_hdr *)&hdr->ibgrh;
+
+ /* If it's IPv6, the version must be 6; otherwise, the first
+ * 20 bytes (before the IPv4 header) are garbled.
+ */
+ if ((ip6h->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
+ return (ip4h->ip_v == 4) ? 4 : 0;
+ /* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+ /* RoCE v2 requires no options, thus header length
+ * must be 5 words
+ */
+ if (ip4h->ip_hl != 5)
+ return 6;
+
+ /* Verify checksum.
+ * We can't write to scattered buffers, so we need to copy to a
+ * temporary buffer.
+ */
+ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+ ip4h_checked.ip_sum = 0;
+ ip4h_checked.ip_sum = in_cksum_hdr(&ip4h_checked);
+ /* if IPv4 header checksum is OK, believe it */
+ if (ip4h->ip_sum == ip4h_checked.ip_sum)
+ return 4;
+ return 6;
+}
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+ u8 port_num,
+ const struct ib_grh *grh)
+{
+ int grh_version;
+
+ if (rdma_protocol_ib(device, port_num))
+ return RDMA_NETWORK_IB;
+
+ grh_version = ib_get_header_version((const union rdma_network_hdr *)grh);
+
+ if (grh_version == 4)
+ return RDMA_NETWORK_IPV4;
+
+ if (grh->next_hdr == IPPROTO_UDP)
+ return RDMA_NETWORK_IPV6;
+
+ return RDMA_NETWORK_ROCE_V1;
+}
+
+struct find_gid_index_context {
+ u16 vlan_id;
+ enum ib_gid_type gid_type;
+};
+
+static bool find_gid_index(const union ib_gid *gid,
+ const struct ib_gid_attr *gid_attr,
+ void *context)
+{
+ struct find_gid_index_context *ctx =
+ (struct find_gid_index_context *)context;
+
+ if (ctx->gid_type != gid_attr->gid_type)
+ return false;
+
+ if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
+ (is_vlan_dev(gid_attr->ndev) &&
+ vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
+ return false;
+
+ return true;
+}
+
+static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
+ u16 vlan_id, const union ib_gid *sgid,
+ enum ib_gid_type gid_type,
+ u16 *gid_index)
+{
+ struct find_gid_index_context context = {.vlan_id = vlan_id,
+ .gid_type = gid_type};
+
+ return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
+ &context, gid_index);
+}
+
+static int get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
+ enum rdma_network_type net_type,
+ union ib_gid *sgid, union ib_gid *dgid)
+{
+ struct sockaddr_in src_in;
+ struct sockaddr_in dst_in;
+ __be32 src_saddr, dst_saddr;
+
+ if (!sgid || !dgid)
+ return -EINVAL;
+
+ if (net_type == RDMA_NETWORK_IPV4) {
+ memcpy(&src_in.sin_addr.s_addr,
+ &hdr->roce4grh.ip_src, 4);
+ memcpy(&dst_in.sin_addr.s_addr,
+ &hdr->roce4grh.ip_dst, 4);
+ src_saddr = src_in.sin_addr.s_addr;
+ dst_saddr = dst_in.sin_addr.s_addr;
+ ipv6_addr_set_v4mapped(src_saddr,
+ (struct in6_addr *)sgid);
+ ipv6_addr_set_v4mapped(dst_saddr,
+ (struct in6_addr *)dgid);
+ return 0;
+ } else if (net_type == RDMA_NETWORK_IPV6 ||
+ net_type == RDMA_NETWORK_IB) {
+ *dgid = hdr->ibgrh.dgid;
+ *sgid = hdr->ibgrh.sgid;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
+ const struct ib_wc *wc, const struct ib_grh *grh,
+ struct ib_ah_attr *ah_attr)
+{
+ u32 flow_class;
+ u16 gid_index;
+ int ret;
+ enum rdma_network_type net_type = RDMA_NETWORK_IB;
+ enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+ int hoplimit = 0xff;
+ union ib_gid dgid;
+ union ib_gid sgid;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ if (rdma_cap_eth_ah(device, port_num)) {
+ if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+ net_type = wc->network_hdr_type;
+ else
+ net_type = ib_get_net_type_by_grh(device, port_num, grh);
+ gid_type = ib_network_to_gid_type(net_type);
+ }
+ ret = get_gids_from_rdma_hdr((const union rdma_network_hdr *)grh, net_type,
+ &sgid, &dgid);
+ if (ret)
+ return ret;
+
+ if (rdma_protocol_roce(device, port_num)) {
+ int if_index = 0;
+ u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
+ wc->vlan_id : 0xffff;
+ struct net_device *idev;
+ struct net_device *resolved_dev;
+
+ if (!(wc->wc_flags & IB_WC_GRH))
+ return -EPROTOTYPE;
+
+ if (!device->get_netdev)
+ return -EOPNOTSUPP;
+
+ idev = device->get_netdev(device, port_num);
+ if (!idev)
+ return -ENODEV;
+
+ ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid,
+ ah_attr->dmac,
+ wc->wc_flags & IB_WC_WITH_VLAN ?
+ NULL : &vlan_id,
+ &if_index, &hoplimit);
+ if (ret) {
+ dev_put(idev);
+ return ret;
+ }
+
+ resolved_dev = dev_get_by_index(&init_net, if_index);
+ if (resolved_dev->if_flags & IFF_LOOPBACK) {
+ dev_put(resolved_dev);
+ resolved_dev = idev;
+ dev_hold(resolved_dev);
+ }
+ rcu_read_lock();
+ if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev,
+ resolved_dev))
+ ret = -EHOSTUNREACH;
+ rcu_read_unlock();
+ dev_put(idev);
+ dev_put(resolved_dev);
+ if (ret)
+ return ret;
+
+ ret = get_sgid_index_from_eth(device, port_num, vlan_id,
+ &dgid, gid_type, &gid_index);
+ if (ret)
+ return ret;
+ }
+
+ ah_attr->dlid = wc->slid;
+ ah_attr->sl = wc->sl;
+ ah_attr->src_path_bits = wc->dlid_path_bits;
+ ah_attr->port_num = port_num;
+
+ if (wc->wc_flags & IB_WC_GRH) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = sgid;
+
+ if (!rdma_cap_eth_ah(device, port_num)) {
+ if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
+ ret = ib_find_cached_gid_by_port(device, &dgid,
+ IB_GID_TYPE_IB,
+ port_num, NULL,
+ &gid_index);
+ if (ret)
+ return ret;
+ } else {
+ gid_index = 0;
+ }
+ }
+
+ ah_attr->grh.sgid_index = (u8) gid_index;
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+ ah_attr->grh.hop_limit = hoplimit;
+ ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
+ const struct ib_grh *grh, u8 port_num)
+{
+ struct ib_ah_attr ah_attr;
+ int ret;
+
+ ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return ib_create_ah(pd, &ah_attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
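+
+/*
+ * Usage sketch (illustrative; "pd", "wc", "grh" and "port_num" are assumed
+ * to come from the receive path): a UD responder builds an address handle
+ * straight from the completion and GRH, posts its reply, and destroys the
+ * AH once the send has completed.
+ *
+ *   struct ib_ah *ah;
+ *
+ *   ah = ib_create_ah_from_wc(pd, &wc, grh, port_num);
+ *   if (IS_ERR(ah))
+ *           return PTR_ERR(ah);
+ *
+ *   (post the reply WR using ah, wait for its completion, then:)
+ *   ib_destroy_ah(ah);
+ */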
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->modify_ah ?
+ ah->device->modify_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->query_ah ?
+ ah->device->query_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = ah->pd;
+ ret = ah->device->destroy_ah(ah);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr)
+{
+ struct ib_srq *srq;
+
+ if (!pd->device->create_srq)
+ return ERR_PTR(-ENOSYS);
+
+ srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+ if (!IS_ERR(srq)) {
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = NULL;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ srq->srq_type = srq_init_attr->srq_type;
+ if (srq->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+ srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq;
+ atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+ atomic_inc(&srq->ext.xrc.cq->usecnt);
+ }
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+ }
+
+ return srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask)
+{
+ return srq->device->modify_srq ?
+ srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr)
+{
+ return srq->device->query_srq ?
+ srq->device->query_srq(srq, srq_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+ struct ib_pd *pd;
+ enum ib_srq_type srq_type;
+ struct ib_xrcd *uninitialized_var(xrcd);
+ struct ib_cq *uninitialized_var(cq);
+ int ret;
+
+ if (atomic_read(&srq->usecnt))
+ return -EBUSY;
+
+ pd = srq->pd;
+ srq_type = srq->srq_type;
+ if (srq_type == IB_SRQT_XRC) {
+ xrcd = srq->ext.xrc.xrcd;
+ cq = srq->ext.xrc.cq;
+ }
+
+ ret = srq->device->destroy_srq(srq);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ if (srq_type == IB_SRQT_XRC) {
+ atomic_dec(&xrcd->usecnt);
+ atomic_dec(&cq->usecnt);
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
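+
+/*
+ * Usage sketch: creating a basic (non-XRC) SRQ. The handler name, context
+ * pointer and capability values are arbitrary examples.
+ *
+ *   struct ib_srq_init_attr srq_attr = {
+ *           .event_handler = my_srq_event_handler,
+ *           .srq_context = my_ctx,
+ *           .attr = { .max_wr = 256, .max_sge = 1 },
+ *           .srq_type = IB_SRQT_BASIC,
+ *   };
+ *   struct ib_srq *srq;
+ *
+ *   srq = ib_create_srq(pd, &srq_attr);
+ *   if (IS_ERR(srq))
+ *           return PTR_ERR(srq);
+ */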
+
+/* Queue pairs */
+
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct ib_qp *qp = context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+ list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+ if (event->element.qp->event_handler)
+ event->element.qp->event_handler(event, event->element.qp->qp_context);
+ spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+ void (*event_handler)(struct ib_event *, void *),
+ void *qp_context)
+{
+ struct ib_qp *qp;
+ unsigned long flags;
+
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->real_qp = real_qp;
+ atomic_inc(&real_qp->usecnt);
+ qp->device = real_qp->device;
+ qp->event_handler = event_handler;
+ qp->qp_context = qp_context;
+ qp->qp_num = real_qp->qp_num;
+ qp->qp_type = real_qp->qp_type;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_add(&qp->open_list, &real_qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+ struct ib_qp_open_attr *qp_open_attr)
+{
+ struct ib_qp *qp, *real_qp;
+
+ if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+ return ERR_PTR(-EINVAL);
+
+ qp = ERR_PTR(-EINVAL);
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
+ if (real_qp->qp_num == qp_open_attr->qp_num) {
+ qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+ qp_open_attr->qp_context);
+ break;
+ }
+ }
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+ return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
+static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *real_qp = qp;
+
+ qp->event_handler = __ib_shared_qp_event_handler;
+ qp->qp_context = qp;
+ qp->pd = NULL;
+ qp->send_cq = qp->recv_cq = NULL;
+ qp->srq = NULL;
+ qp->xrcd = qp_init_attr->xrcd;
+ atomic_inc(&qp_init_attr->xrcd->usecnt);
+ INIT_LIST_HEAD(&qp->open_list);
+
+ qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+ qp_init_attr->qp_context);
+ if (!IS_ERR(qp))
+ __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+ else
+ real_qp->device->destroy_qp(real_qp);
+ return qp;
+}
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
+ struct ib_qp *qp;
+
+ if (qp_init_attr->rwq_ind_tbl &&
+ (qp_init_attr->recv_cq ||
+ qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
+ qp_init_attr->cap.max_recv_sge))
+ return ERR_PTR(-EINVAL);
+
+ qp = device->create_qp(pd, qp_init_attr, NULL);
+ if (IS_ERR(qp))
+ return qp;
+
+ qp->device = device;
+ qp->real_qp = qp;
+ qp->uobject = NULL;
+ qp->qp_type = qp_init_attr->qp_type;
+ qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
+
+ atomic_set(&qp->usecnt, 0);
+ spin_lock_init(&qp->mr_lock);
+
+ if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
+ return ib_create_xrc_qp(qp, qp_init_attr);
+
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+ qp->recv_cq = NULL;
+ qp->srq = NULL;
+ } else {
+ qp->recv_cq = qp_init_attr->recv_cq;
+ if (qp_init_attr->recv_cq)
+ atomic_inc(&qp_init_attr->recv_cq->usecnt);
+ qp->srq = qp_init_attr->srq;
+ if (qp->srq)
+ atomic_inc(&qp_init_attr->srq->usecnt);
+ }
+
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->xrcd = NULL;
+
+ atomic_inc(&pd->usecnt);
+ if (qp_init_attr->send_cq)
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ if (qp_init_attr->rwq_ind_tbl)
+ atomic_inc(&qp->rwq_ind_tbl->usecnt);
+
+ /*
+ * Note: all hw drivers guarantee that max_send_sge is lower than
+ * the device RDMA WRITE SGE limit but not all hw drivers ensure that
+ * max_send_sge <= max_sge_rd.
+ */
+ qp->max_write_sge = qp_init_attr->cap.max_send_sge;
+ qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
+ device->attrs.max_sge_rd);
+
+ return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
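+
+/*
+ * Usage sketch: a typical RC QP request from a kernel consumer. The
+ * handler name and capability values are arbitrary examples.
+ *
+ *   struct ib_qp_init_attr init_attr = {
+ *           .event_handler = my_qp_event_handler,
+ *           .send_cq = send_cq,
+ *           .recv_cq = recv_cq,
+ *           .cap = {
+ *                   .max_send_wr = 128,
+ *                   .max_recv_wr = 128,
+ *                   .max_send_sge = 2,
+ *                   .max_recv_sge = 2,
+ *           },
+ *           .sq_sig_type = IB_SIGNAL_REQ_WR,
+ *           .qp_type = IB_QPT_RC,
+ *   };
+ *   struct ib_qp *qp;
+ *
+ *   qp = ib_create_qp(pd, &init_attr);
+ *   if (IS_ERR(qp))
+ *           return PTR_ERR(qp);
+ */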
+
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_PORT,
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_XRC_TGT] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ },
+ },
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT |
+ IB_QP_SQ_PSN),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask,
+ enum rdma_link_layer ll)
+{
+ enum ib_qp_attr_mask req_param, opt_param;
+
+ if (cur_state < 0 || cur_state > IB_QPS_ERR ||
+ next_state < 0 || next_state > IB_QPS_ERR)
+ return 0;
+
+ if (mask & IB_QP_CUR_STATE &&
+ cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+ return 0;
+
+ if (!qp_state_table[cur_state][next_state].valid)
+ return 0;
+
+ req_param = qp_state_table[cur_state][next_state].req_param[type];
+ opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+ if ((mask & req_param) != req_param)
+ return 0;
+
+ if (mask & ~(req_param | opt_param | IB_QP_STATE))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
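A minimal usage sketch, assuming the standard ib_verbs.h definitions: a consumer can validate an RC QP's RTR-to-RTS transition with ib_modify_qp_is_ok() before issuing it, with the required-attribute mask taken from the req_param entry in the table above and the link layer supplied by the caller.

#include <linux/errno.h>
#include <rdma/ib_verbs.h>

/*
 * Sketch: check an RTR -> RTS transition for an RC QP. The mask mirrors
 * the [IB_QPS_RTR][IB_QPS_RTS] req_param entry for IB_QPT_RC above;
 * 'll' is the port's link layer, assumed known to the caller.
 */
static int check_rc_rtr_to_rts(enum rdma_link_layer ll)
{
        if (!ib_modify_qp_is_ok(IB_QPS_RTR, IB_QPS_RTS, IB_QPT_RC,
                                IB_QP_STATE | IB_QP_TIMEOUT |
                                IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
                                IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC,
                                ll))
                return -EINVAL;
        return 0;
}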
+
+int ib_resolve_eth_dmac(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+ int ret = 0;
+
+ if (*qp_attr_mask & IB_QP_AV) {
+ if (qp_attr->ah_attr.port_num < rdma_start_port(qp->device) ||
+ qp_attr->ah_attr.port_num > rdma_end_port(qp->device))
+ return -EINVAL;
+
+ if (!rdma_cap_eth_ah(qp->device, qp_attr->ah_attr.port_num))
+ return 0;
+
+ if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
+ rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw,
+ qp_attr->ah_attr.dmac);
+ } else {
+ union ib_gid sgid;
+ struct ib_gid_attr sgid_attr;
+ int ifindex;
+ int hop_limit;
+
+ ret = ib_query_gid(qp->device,
+ qp_attr->ah_attr.port_num,
+ qp_attr->ah_attr.grh.sgid_index,
+ &sgid, &sgid_attr);
+
+ if (ret || !sgid_attr.ndev) {
+ if (!ret)
+ ret = -ENXIO;
+ goto out;
+ }
+
+ ifindex = sgid_attr.ndev->if_index;
+
+ ret = rdma_addr_find_l2_eth_by_grh(&sgid,
+ &qp_attr->ah_attr.grh.dgid,
+ qp_attr->ah_attr.dmac,
+ NULL, &ifindex, &hop_limit);
+
+ dev_put(sgid_attr.ndev);
+
+ qp_attr->ah_attr.grh.hop_limit = hop_limit;
+ }
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ib_resolve_eth_dmac);
+
+
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ int ret;
+
+ ret = ib_resolve_eth_dmac(qp, qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
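As a hedged illustration of the ib_modify_qp() entry point, the sketch below drives an RC QP from RTR to RTS; the ib_qp_attr field names (timeout, retry_cnt, rnr_retry, sq_psn, max_rd_atomic) are assumed from the usual ib_verbs.h layout and the numeric values are placeholders.

#include <rdma/ib_verbs.h>

/* Sketch: move an RC QP from RTR to RTS; values are placeholders. */
static int demo_rc_qp_to_rts(struct ib_qp *qp, u32 sq_psn)
{
        struct ib_qp_attr attr = { .qp_state = IB_QPS_RTS };

        attr.timeout = 14;              /* local ACK timeout = 4.096us * 2^14 */
        attr.retry_cnt = 7;
        attr.rnr_retry = 7;             /* "infinite" RNR retry */
        attr.sq_psn = sq_psn;
        attr.max_rd_atomic = 1;

        return ib_modify_qp(qp, &attr,
                            IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
                            IB_QP_RNR_RETRY | IB_QP_SQ_PSN |
                            IB_QP_MAX_QP_RD_ATOMIC);
}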
+
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ return qp->device->query_qp ?
+ qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_close_qp(struct ib_qp *qp)
+{
+ struct ib_qp *real_qp;
+ unsigned long flags;
+
+ real_qp = qp->real_qp;
+ if (real_qp == qp)
+ return -EINVAL;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_del(&qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ atomic_dec(&real_qp->usecnt);
+ kfree(qp);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+ struct ib_xrcd *xrcd;
+ struct ib_qp *real_qp;
+ int ret;
+
+ real_qp = qp->real_qp;
+ xrcd = real_qp->xrcd;
+
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ ib_close_qp(qp);
+ if (atomic_read(&real_qp->usecnt) == 0)
+ list_del(&real_qp->xrcd_list);
+ else
+ real_qp = NULL;
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+
+ if (real_qp) {
+ ret = ib_destroy_qp(real_qp);
+ if (!ret)
+ atomic_dec(&xrcd->usecnt);
+ else
+ __ib_insert_xrcd_qp(xrcd, real_qp);
+ }
+
+ return 0;
+}
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+ struct ib_pd *pd;
+ struct ib_cq *scq, *rcq;
+ struct ib_srq *srq;
+ struct ib_rwq_ind_table *ind_tbl;
+ int ret;
+
+ if (atomic_read(&qp->usecnt))
+ return -EBUSY;
+
+ if (qp->real_qp != qp)
+ return __ib_destroy_shared_qp(qp);
+
+ pd = qp->pd;
+ scq = qp->send_cq;
+ rcq = qp->recv_cq;
+ srq = qp->srq;
+ ind_tbl = qp->rwq_ind_tbl;
+
+ ret = qp->device->destroy_qp(qp);
+ if (!ret) {
+ if (pd)
+ atomic_dec(&pd->usecnt);
+ if (scq)
+ atomic_dec(&scq->usecnt);
+ if (rcq)
+ atomic_dec(&rcq->usecnt);
+ if (srq)
+ atomic_dec(&srq->usecnt);
+ if (ind_tbl)
+ atomic_dec(&ind_tbl->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context,
+ const struct ib_cq_init_attr *cq_attr)
+{
+ struct ib_cq *cq;
+
+ cq = device->create_cq(device, cq_attr, NULL, NULL);
+
+ if (!IS_ERR(cq)) {
+ cq->device = device;
+ cq->uobject = NULL;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ atomic_set(&cq->usecnt, 0);
+ }
+
+ return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
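A short sketch of the allocation path, assuming the conventional ib_cq_init_attr fields (cqe, comp_vector) from ib_verbs.h:

#include <rdma/ib_verbs.h>

/* Sketch: allocate a 256-entry CQ on completion vector 0. */
static struct ib_cq *demo_create_cq(struct ib_device *dev,
                                    ib_comp_handler comp, void *ctx)
{
        struct ib_cq_init_attr attr = { .cqe = 256, .comp_vector = 0 };

        /* Returns an ERR_PTR() on failure, a usable CQ otherwise. */
        return ib_create_cq(dev, comp, NULL, ctx, &attr);
}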
+
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ return cq->device->modify_cq ?
+ cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+ if (atomic_read(&cq->usecnt))
+ return -EBUSY;
+
+ return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+ return cq->device->resize_cq ?
+ cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd = mr->pd;
+ int ret;
+
+ ret = mr->device->dereg_mr(mr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd: protection domain associated with the region
+ * @mr_type: memory region type
+ * @max_num_sg: maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->alloc_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ mr->need_inval = false;
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr);
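A hedged sketch of the allocate/deregister cycle described above, using only the ib_alloc_mr()/ib_dereg_mr() calls defined in this file:

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* Sketch: allocate an MR good for up to 16 SG entries, then release it. */
static int demo_mr_cycle(struct ib_pd *pd)
{
        struct ib_mr *mr;

        mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
        if (IS_ERR(mr))
                return PTR_ERR(mr);     /* e.g. -ENOSYS without alloc_mr */

        /* ... ib_map_mr_sg() and an IB_WR_REG_MR work request go here ... */

        return ib_dereg_mr(mr);
}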
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *fmr;
+
+ if (!pd->device->alloc_fmr)
+ return ERR_PTR(-ENOSYS);
+
+ fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ if (!IS_ERR(fmr)) {
+ fmr->device = pd->device;
+ fmr->pd = pd;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+
+ if (list_empty(fmr_list))
+ return 0;
+
+ fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+ return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = fmr->pd;
+ ret = fmr->device->dealloc_fmr(fmr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->attach_mcast)
+ return -ENOSYS;
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return -EINVAL;
+
+ ret = qp->device->attach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_inc(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
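For illustration, a thin wrapper around ib_attach_mcast(); the multicast GID/LID are assumed to come from a prior SA join, and the checks above reject non-multicast GIDs and non-UD QPs:

#include <rdma/ib_verbs.h>

/* Sketch: attach a UD QP to a multicast group obtained from an SA join. */
static int demo_join_mcast(struct ib_qp *qp, union ib_gid *mgid, u16 mlid)
{
        /* Rejected with -EINVAL unless mgid is multicast and qp is UD. */
        return ib_attach_mcast(qp, mgid, mlid);
}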
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->detach_mcast)
+ return -ENOSYS;
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return -EINVAL;
+
+ ret = qp->device->detach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_dec(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+ struct ib_xrcd *xrcd;
+
+ if (!device->alloc_xrcd)
+ return ERR_PTR(-ENOSYS);
+
+ xrcd = device->alloc_xrcd(device, NULL, NULL);
+ if (!IS_ERR(xrcd)) {
+ xrcd->device = device;
+ xrcd->inode = NULL;
+ atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+ }
+
+ return xrcd;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ struct ib_qp *qp;
+ int ret;
+
+ if (atomic_read(&xrcd->usecnt))
+ return -EBUSY;
+
+ while (!list_empty(&xrcd->tgt_qp_list)) {
+ qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
+ ret = ib_destroy_qp(qp);
+ if (ret)
+ return ret;
+ }
+
+ return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+/**
+ * ib_create_wq - Creates a WQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the WQ.
+ * @wq_init_attr: A list of initial attributes required to create the
+ * WQ. If WQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created WQ.
+ *
+ * wq_init_attr->max_wr and wq_init_attr->max_sge determine
+ * the requested size of the WQ, and are set to the actual values allocated
+ * on return.
+ * If ib_create_wq() succeeds, then max_wr and max_sge will always be
+ * at least as large as the requested values.
+ */
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *wq_attr)
+{
+ struct ib_wq *wq;
+
+ if (!pd->device->create_wq)
+ return ERR_PTR(-ENOSYS);
+
+ wq = pd->device->create_wq(pd, wq_attr, NULL);
+ if (!IS_ERR(wq)) {
+ wq->event_handler = wq_attr->event_handler;
+ wq->wq_context = wq_attr->wq_context;
+ wq->wq_type = wq_attr->wq_type;
+ wq->cq = wq_attr->cq;
+ wq->device = pd->device;
+ wq->pd = pd;
+ wq->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&wq_attr->cq->usecnt);
+ atomic_set(&wq->usecnt, 0);
+ }
+ return wq;
+}
+EXPORT_SYMBOL(ib_create_wq);
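A minimal sketch of WQ creation, assuming IB_WQT_RQ is the receive-queue WQ type declared in ib_verbs.h; sizes are placeholders and may be rounded up by the driver as noted above:

#include <rdma/ib_verbs.h>

/* Sketch: create a receive WQ feeding completions into an existing CQ. */
static struct ib_wq *demo_create_wq(struct ib_pd *pd, struct ib_cq *cq)
{
        struct ib_wq_init_attr attr = { };

        attr.wq_type = IB_WQT_RQ;       /* assumed receive-queue WQ type */
        attr.max_wr = 128;              /* may be rounded up on return */
        attr.max_sge = 1;
        attr.cq = cq;

        return ib_create_wq(pd, &attr);
}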
+
+/**
+ * ib_destroy_wq - Destroys the specified WQ.
+ * @wq: The WQ to destroy.
+ */
+int ib_destroy_wq(struct ib_wq *wq)
+{
+ int err;
+ struct ib_cq *cq = wq->cq;
+ struct ib_pd *pd = wq->pd;
+
+ if (atomic_read(&wq->usecnt))
+ return -EBUSY;
+
+ err = wq->device->destroy_wq(wq);
+ if (!err) {
+ atomic_dec(&pd->usecnt);
+ atomic_dec(&cq->usecnt);
+ }
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_wq);
+
+/**
+ * ib_modify_wq - Modifies the specified WQ.
+ * @wq: The WQ to modify.
+ * @wq_attr: On input, specifies the WQ attributes to modify.
+ * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ
+ * are being modified.
+ * On output, the current values of selected WQ attributes are returned.
+ */
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+ u32 wq_attr_mask)
+{
+ int err;
+
+ if (!wq->device->modify_wq)
+ return -ENOSYS;
+
+ err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL);
+ return err;
+}
+EXPORT_SYMBOL(ib_modify_wq);
+
+/*
+ * ib_create_rwq_ind_table - Creates an RQ Indirection Table.
+ * @device: The device on which to create the rwq indirection table.
+ * @ib_rwq_ind_table_init_attr: A list of initial attributes required to
+ * create the Indirection Table.
+ *
+ * Note: The lifetime of ib_rwq_ind_table_init_attr->ind_tbl must not be
+ * shorter than that of the created ib_rwq_ind_table object, and the caller
+ * is responsible for its memory allocation and freeing.
+ */
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+ struct ib_rwq_ind_table_init_attr *init_attr)
+{
+ struct ib_rwq_ind_table *rwq_ind_table;
+ int i;
+ u32 table_size;
+
+ if (!device->create_rwq_ind_table)
+ return ERR_PTR(-ENOSYS);
+
+ table_size = (1 << init_attr->log_ind_tbl_size);
+ rwq_ind_table = device->create_rwq_ind_table(device,
+ init_attr, NULL);
+ if (IS_ERR(rwq_ind_table))
+ return rwq_ind_table;
+
+ rwq_ind_table->ind_tbl = init_attr->ind_tbl;
+ rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size;
+ rwq_ind_table->device = device;
+ rwq_ind_table->uobject = NULL;
+ atomic_set(&rwq_ind_table->usecnt, 0);
+
+ for (i = 0; i < table_size; i++)
+ atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt);
+
+ return rwq_ind_table;
+}
+EXPORT_SYMBOL(ib_create_rwq_ind_table);
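A short sketch of building an indirection table over caller-owned WQs, using only the init_attr fields referenced above (log_ind_tbl_size, ind_tbl):

#include <rdma/ib_verbs.h>

/* Sketch: a 4-entry RQ indirection table; wqs[] stays owned by the caller
 * and must outlive the returned table, per the note above. */
static struct ib_rwq_ind_table *demo_create_ind_table(struct ib_device *dev,
                                                      struct ib_wq **wqs)
{
        struct ib_rwq_ind_table_init_attr attr = { };

        attr.log_ind_tbl_size = 2;      /* 1 << 2 == 4 WQ entries in wqs[] */
        attr.ind_tbl = wqs;

        return ib_create_rwq_ind_table(dev, &attr);
}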
+
+/*
+ * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table.
+ * @wq_ind_table: The Indirection Table to destroy.
+*/
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
+{
+ int err, i;
+ u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size);
+ struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl;
+
+ if (atomic_read(&rwq_ind_table->usecnt))
+ return -EBUSY;
+
+ err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table);
+ if (!err) {
+ for (i = 0; i < table_size; i++)
+ atomic_dec(&ind_tbl[i]->usecnt);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
+
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr,
+ int domain)
+{
+ struct ib_flow *flow_id;
+ if (!qp->device->create_flow)
+ return ERR_PTR(-ENOSYS);
+
+ flow_id = qp->device->create_flow(qp, flow_attr, domain);
+ if (!IS_ERR(flow_id))
+ atomic_inc(&qp->usecnt);
+ return flow_id;
+}
+EXPORT_SYMBOL(ib_create_flow);
+
+int ib_destroy_flow(struct ib_flow *flow_id)
+{
+ int err;
+ struct ib_qp *qp = flow_id->qp;
+
+ err = qp->device->destroy_flow(flow_id);
+ if (!err)
+ atomic_dec(&qp->usecnt);
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_flow);
+
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+ struct ib_mr_status *mr_status)
+{
+ return mr->device->check_mr_status ?
+ mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_check_mr_status);
+
+int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
+ int state)
+{
+ if (!device->set_vf_link_state)
+ return -ENOSYS;
+
+ return device->set_vf_link_state(device, vf, port, state);
+}
+EXPORT_SYMBOL(ib_set_vf_link_state);
+
+int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_info *info)
+{
+ if (!device->get_vf_config)
+ return -ENOSYS;
+
+ return device->get_vf_config(device, vf, port, info);
+}
+EXPORT_SYMBOL(ib_get_vf_config);
+
+int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_stats *stats)
+{
+ if (!device->get_vf_stats)
+ return -ENOSYS;
+
+ return device->get_vf_stats(device, vf, port, stats);
+}
+EXPORT_SYMBOL(ib_get_vf_stats);
+
+int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
+ int type)
+{
+ if (!device->set_vf_guid)
+ return -ENOSYS;
+
+ return device->set_vf_guid(device, vf, port, guid, type);
+}
+EXPORT_SYMBOL(ib_set_vf_guid);
+
+/**
+ * ib_map_mr_sg() - Map the largest prefix of a DMA-mapped SG list
+ * and set it on the memory region.
+ * @mr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
+ * @page_size: page vector desired page size
+ *
+ * Constraints:
+ * - The first sg element is allowed to have an offset.
+ * - Each sg element must either be aligned to page_size or virtually
+ * contiguous to the previous element. In case an sg element has a
+ * non-contiguous offset, the mapping prefix will not include it.
+ * - The last sg element is allowed to have length less than page_size.
+ * - If the total byte length of the sg_nents entries exceeds the MR's
+ *   max_num_sg * page_size, then only max_num_sg entries will be mapped.
+ * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these
+ * constraints holds and the page_size argument is ignored.
+ *
+ * Returns the number of sg elements that were mapped to the memory region.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size)
+{
+ if (unlikely(!mr->device->map_mr_sg))
+ return -ENOSYS;
+
+ mr->page_size = page_size;
+
+ return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg);
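A hedged usage sketch matching the signature above: map a DMA-mapped scatterlist at PAGE_SIZE granularity and treat a short return as a gap in the list.

#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

/* Sketch: map a DMA-mapped scatterlist into the MR at PAGE_SIZE granularity. */
static int demo_map_mr(struct ib_mr *mr, struct scatterlist *sg, int sg_nents)
{
        unsigned int sg_offset = 0;
        int n;

        n = ib_map_mr_sg(mr, sg, sg_nents, &sg_offset, PAGE_SIZE);
        if (n < 0)
                return n;               /* -ENOSYS without a map_mr_sg hook */
        if (n < sg_nents)
                return -EINVAL;         /* a gap cut the mapped prefix short */

        /* The MR can now be registered with an IB_WR_REG_MR work request. */
        return 0;
}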
+
+/**
+ * ib_sg_to_pages() - Convert the largest prefix of a sg list
+ * to a page vector
+ * @mr: memory region
+ * @sgl: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset_p: IN: start offset in bytes into sg
+ * OUT: offset in bytes for element n of the sg of the first
+ * byte that has not been processed where n is the return
+ * value of this function.
+ * @set_page: driver page assignment function pointer
+ *
+ * Core service helper for drivers to convert the largest
+ * prefix of the given sg list to a page vector. The sg list
+ * prefix converted is the prefix that meets the requirements
+ * of ib_map_mr_sg.
+ *
+ * Returns the number of sg elements that were assigned to
+ * a page vector.
+ */
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+ unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
+{
+ struct scatterlist *sg;
+ u64 last_end_dma_addr = 0;
+ unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+ unsigned int last_page_off = 0;
+ u64 page_mask = ~((u64)mr->page_size - 1);
+ int i, ret;
+
+ if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
+ return -EINVAL;
+
+ mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
+ mr->length = 0;
+
+ for_each_sg(sgl, sg, sg_nents, i) {
+ u64 dma_addr = sg_dma_address(sg) + sg_offset;
+ u64 prev_addr = dma_addr;
+ unsigned int dma_len = sg_dma_len(sg) - sg_offset;
+ u64 end_dma_addr = dma_addr + dma_len;
+ u64 page_addr = dma_addr & page_mask;
+
+ /*
+ * For the second and later elements, check whether either the
+ * end of element i-1 or the start of element i is not aligned
+ * on a page boundary.
+ */
+ if (i && (last_page_off != 0 || page_addr != dma_addr)) {
+ /* Stop mapping if there is a gap. */
+ if (last_end_dma_addr != dma_addr)
+ break;
+
+ /*
+ * Coalesce this element with the last. If it is small
+ * enough just update mr->length. Otherwise start
+ * mapping from the next page.
+ */
+ goto next_page;
+ }
+
+ do {
+ ret = set_page(mr, page_addr);
+ if (unlikely(ret < 0)) {
+ sg_offset = prev_addr - sg_dma_address(sg);
+ mr->length += prev_addr - dma_addr;
+ if (sg_offset_p)
+ *sg_offset_p = sg_offset;
+ return i || sg_offset ? i : ret;
+ }
+ prev_addr = page_addr;
+next_page:
+ page_addr += mr->page_size;
+ } while (page_addr < end_dma_addr);
+
+ mr->length += dma_len;
+ last_end_dma_addr = end_dma_addr;
+ last_page_off = end_dma_addr & ~page_mask;
+
+ sg_offset = 0;
+ }
+
+ if (sg_offset_p)
+ *sg_offset_p = 0;
+ return i;
+}
+EXPORT_SYMBOL(ib_sg_to_pages);
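ib_sg_to_pages() is the helper drivers typically build their map_mr_sg hook on; the sketch below uses a hypothetical demo_mr wrapper (pages, npages, max_pages are illustration-only fields) to show the set_page callback contract.

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

/* Hypothetical driver-private MR wrapper, for illustration only. */
struct demo_mr {
        struct ib_mr    ibmr;
        u64             *pages;         /* page address vector */
        u32             npages;
        u32             max_pages;
};

/* set_page callback: store one page address, or stop when the vector is full. */
static int demo_set_page(struct ib_mr *ibmr, u64 addr)
{
        struct demo_mr *mr = container_of(ibmr, struct demo_mr, ibmr);

        if (mr->npages == mr->max_pages)
                return -ENOMEM;
        mr->pages[mr->npages++] = addr;
        return 0;
}

/* A driver's map_mr_sg hook is then little more than: */
static int demo_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
                          int sg_nents, unsigned int *sg_offset)
{
        struct demo_mr *mr = container_of(ibmr, struct demo_mr, ibmr);

        mr->npages = 0;
        return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, demo_set_page);
}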
+
+struct ib_drain_cqe {
+ struct ib_cqe cqe;
+ struct completion done;
+};
+
+static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
+ cqe);
+
+ complete(&cqe->done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for the SQ.
+ */
+static void __ib_drain_sq(struct ib_qp *qp)
+{
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_drain_cqe sdrain;
+ struct ib_send_wr swr = {}, *bad_swr;
+ int ret;
+
+ if (qp->send_cq->poll_ctx == IB_POLL_DIRECT) {
+ WARN_ONCE(qp->send_cq->poll_ctx == IB_POLL_DIRECT,
+ "IB_POLL_DIRECT poll_ctx not supported for drain\n");
+ return;
+ }
+
+ swr.wr_cqe = &sdrain.cqe;
+ sdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&sdrain.done);
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ ret = ib_post_send(qp, &swr, &bad_swr);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ wait_for_completion(&sdrain.done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for the RQ.
+ */
+static void __ib_drain_rq(struct ib_qp *qp)
+{
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_drain_cqe rdrain;
+ struct ib_recv_wr rwr = {}, *bad_rwr;
+ int ret;
+
+ if (qp->recv_cq->poll_ctx == IB_POLL_DIRECT) {
+ WARN_ONCE(qp->recv_cq->poll_ctx == IB_POLL_DIRECT,
+ "IB_POLL_DIRECT poll_ctx not supported for drain\n");
+ return;
+ }
+
+ rwr.wr_cqe = &rdrain.cqe;
+ rdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&rdrain.done);
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ ret = ib_post_recv(qp, &rwr, &bad_rwr);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ wait_for_completion(&rdrain.done);
+}
+
+/**
+ * ib_drain_sq() - Block until all SQ CQEs have been consumed by the
+ * application.
+ * @qp: queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that. Otherwise call the generic drain function
+ * __ib_drain_sq().
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ and SQ for the drain work request and
+ * completion.
+ *
+ * allocate the CQ using ib_alloc_cq(); the CQ poll context cannot be
+ * IB_POLL_DIRECT.
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_sq(struct ib_qp *qp)
+{
+ if (qp->device->drain_sq)
+ qp->device->drain_sq(qp);
+ else
+ __ib_drain_sq(qp);
+}
+EXPORT_SYMBOL(ib_drain_sq);
+
+/**
+ * ib_drain_rq() - Block until all RQ CQEs have been consumed by the
+ * application.
+ * @qp: queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that. Otherwise call the generic drain function
+ * __ib_drain_rq().
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ and RQ for the drain work request and
+ * completion.
+ *
+ * allocate the CQ using ib_alloc_cq(); the CQ poll context cannot be
+ * IB_POLL_DIRECT.
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_rq(struct ib_qp *qp)
+{
+ if (qp->device->drain_rq)
+ qp->device->drain_rq(qp);
+ else
+ __ib_drain_rq(qp);
+}
+EXPORT_SYMBOL(ib_drain_rq);
+
+/**
+ * ib_drain_qp() - Block until all CQEs have been consumed by the
+ * application on both the RQ and SQ.
+ * @qp: queue pair to drain
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ(s), SQ, and RQ for drain work requests
+ * and completions.
+ *
+ * allocate the CQs using ib_alloc_cq(); the CQ poll context cannot be
+ * IB_POLL_DIRECT.
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_qp(struct ib_qp *qp)
+{
+ ib_drain_sq(qp);
+ if (!qp->srq)
+ ib_drain_rq(qp);
+}
+EXPORT_SYMBOL(ib_drain_qp);
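A minimal teardown sketch combining the drain helpers above with ib_destroy_qp(), under the stated assumptions (CQs from ib_alloc_cq(), no IB_POLL_DIRECT, no concurrent posters):

#include <rdma/ib_verbs.h>

/* Sketch: quiesce, drain, destroy. The caller has already stopped all
 * contexts that post WRs to this QP. */
static void demo_teardown_qp(struct ib_qp *qp)
{
        ib_drain_qp(qp);        /* waits for the SQ (and RQ, if no SRQ) */
        ib_destroy_qp(qp);
}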
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c
index a40d9dc8bbe91..8e55bb0f8bf7f 100644
--- a/sys/ofed/drivers/infiniband/core/iwcm.c
+++ b/sys/ofed/drivers/infiniband/core/iwcm.c
@@ -5,7 +5,6 @@
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
- * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -36,8 +35,6 @@
* SOFTWARE.
*
*/
-#include "opt_inet.h"
-
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/idr.h>
@@ -49,13 +46,10 @@
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/string.h>
-#include <netinet/tcp.h>
-#include <sys/mutex.h>
-#include <rdma/rdma_cm.h>
#include <rdma/iw_cm.h>
#include <rdma/ib_addr.h>
+#include <rdma/iw_portmap.h>
#include "iwcm.h"
@@ -71,84 +65,8 @@ struct iwcm_work {
struct iw_cm_event event;
struct list_head free_list;
};
-struct iwcm_listen_work {
- struct work_struct work;
- struct iw_cm_id *cm_id;
-};
-
-static LIST_HEAD(listen_port_list);
-
-static DEFINE_MUTEX(listen_port_mutex);
-
-struct listen_port_info {
- struct list_head list;
- uint16_t port_num;
- uint32_t refcnt;
-};
-
-static int32_t
-add_port_to_listenlist(uint16_t port)
-{
- struct listen_port_info *port_info;
- int err = 0;
-
- mutex_lock(&listen_port_mutex);
-
- list_for_each_entry(port_info, &listen_port_list, list)
- if (port_info->port_num == port)
- goto found_port;
-
- port_info = kmalloc(sizeof(*port_info), GFP_KERNEL);
- if (!port_info) {
- err = -ENOMEM;
- mutex_unlock(&listen_port_mutex);
- goto out;
- }
- port_info->port_num = port;
- port_info->refcnt = 0;
-
- list_add(&port_info->list, &listen_port_list);
-
-found_port:
- ++(port_info->refcnt);
- mutex_unlock(&listen_port_mutex);
- return port_info->refcnt;
-out:
- return err;
-}
-
-static int32_t
-rem_port_from_listenlist(uint16_t port)
-{
- struct listen_port_info *port_info;
- int ret, found_port = 0;
-
- mutex_lock(&listen_port_mutex);
-
- list_for_each_entry(port_info, &listen_port_list, list)
- if (port_info->port_num == port) {
- found_port = 1;
- break;
- }
-
- if (found_port) {
- --(port_info->refcnt);
- ret = port_info->refcnt;
- if (port_info->refcnt == 0) {
- /* Remove this entry from the list as there are no
- * more listeners for this port_num.
- */
- list_del(&port_info->list);
- kfree(port_info);
- }
- } else {
- ret = -EINVAL;
- }
- mutex_unlock(&listen_port_mutex);
- return ret;
-
-}
+static unsigned int default_backlog = 256;
/*
* The following services provide a mechanism for pre-allocating iwcm_work
@@ -241,15 +159,14 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
/*
* Release a reference on cm_id. If the last reference is being
- * released, enable the waiting thread (in iw_destroy_cm_id) to
- * get woken up, and return 1 if a thread is already waiting.
+ * released, free the cm_id and return 1.
*/
static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
{
BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
if (atomic_dec_and_test(&cm_id_priv->refcount)) {
BUG_ON(!list_empty(&cm_id_priv->work_list));
- complete(&cm_id_priv->destroy_comp);
+ free_cm_id(cm_id_priv);
return 1;
}
@@ -266,25 +183,15 @@ static void add_ref(struct iw_cm_id *cm_id)
static void rem_ref(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
- int cb_destroy;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
- /*
- * Test bit before deref in case the cm_id gets freed on another
- * thread.
- */
- cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- if (iwcm_deref_id(cm_id_priv) && cb_destroy) {
- BUG_ON(!list_empty(&cm_id_priv->work_list));
- free_cm_id(cm_id_priv);
- }
+ (void)iwcm_deref_id(cm_id_priv);
}
static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
- struct socket *so,
iw_cm_handler cm_handler,
void *context)
{
@@ -301,7 +208,6 @@ struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
cm_id_priv->id.event_handler = cm_event_handler;
cm_id_priv->id.add_ref = add_ref;
cm_id_priv->id.rem_ref = rem_ref;
- cm_id_priv->id.so = so;
spin_lock_init(&cm_id_priv->lock);
atomic_set(&cm_id_priv->refcount, 1);
init_waitqueue_head(&cm_id_priv->connect_wait);
@@ -411,123 +317,6 @@ int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
}
EXPORT_SYMBOL(iw_cm_disconnect);
-static struct socket *
-dequeue_socket(struct socket *head)
-{
- struct socket *so;
- struct sockaddr_in *remote;
- int error;
-
- SOLISTEN_LOCK(head);
- error = solisten_dequeue(head, &so, SOCK_NONBLOCK);
- if (error == EWOULDBLOCK)
- return (NULL);
- remote = NULL;
- soaccept(so, (struct sockaddr **)&remote);
-
- free(remote, M_SONAME);
- return so;
-}
-
-static void
-iw_so_event_handler(struct work_struct *_work)
-{
-#ifdef INET
- struct iwcm_listen_work *work = container_of(_work,
- struct iwcm_listen_work, work);
- struct iw_cm_id *listen_cm_id = work->cm_id;
- struct iwcm_id_private *cm_id_priv;
- struct iw_cm_id *real_cm_id;
- struct sockaddr_in *local;
- struct socket *so;
-
- cm_id_priv = container_of(listen_cm_id, struct iwcm_id_private, id);
-
- if (cm_id_priv->state != IW_CM_STATE_LISTEN) {
- kfree(work);
- return;
- }
-
- /* Dequeue & process all new 'so' connection requests for this cmid */
- while ((so = dequeue_socket(work->cm_id->so)) != NULL) {
- if (rdma_cma_any_addr((struct sockaddr *)
- &listen_cm_id->local_addr)) {
- in_getsockaddr(so, (struct sockaddr **)&local);
- if (rdma_find_cmid_laddr(local, ARPHRD_ETHER,
- (void **) &real_cm_id)) {
- free(local, M_SONAME);
- goto err;
- }
- free(local, M_SONAME);
-
- real_cm_id->device->iwcm->newconn(real_cm_id, so);
- } else {
- listen_cm_id->device->iwcm->newconn(listen_cm_id, so);
- }
- }
-err:
- kfree(work);
-#endif
- return;
-}
-
-static int
-iw_so_upcall(struct socket *parent_so, void *arg, int waitflag)
-{
- struct iwcm_listen_work *work;
- struct iw_cm_id *cm_id = arg;
-
- /* check whether iw_so_event_handler() already dequeued this 'so' */
- if (TAILQ_EMPTY(&parent_so->sol_comp))
- return SU_OK;
- work = kzalloc(sizeof(*work), waitflag);
- if (!work)
- return -ENOMEM;
- work->cm_id = cm_id;
-
- INIT_WORK(&work->work, iw_so_event_handler);
- queue_work(iwcm_wq, &work->work);
-
- return SU_OK;
-}
-
-static int
-iw_create_listen(struct iw_cm_id *cm_id, int backlog)
-{
- struct sockopt sopt;
- struct socket *so = cm_id->so;
- int on = 1;
- int rc;
-
- rc = -solisten(cm_id->so, backlog, curthread);
- if (rc != 0)
- return (rc);
- SOLISTEN_LOCK(so);
- solisten_upcall_set(so, iw_so_upcall, cm_id);
- so->so_state |= SS_NBIO;
- SOLISTEN_UNLOCK(so);
- sopt.sopt_dir = SOPT_SET;
- sopt.sopt_level = IPPROTO_TCP;
- sopt.sopt_name = TCP_NODELAY;
- sopt.sopt_val = (caddr_t)&on;
- sopt.sopt_valsize = sizeof(on);
- sopt.sopt_td = NULL;
- sosetopt(so, &sopt);
- return (0);
-}
-
-static int
-iw_destroy_listen(struct iw_cm_id *cm_id)
-{
- struct socket *so = cm_id->so;
-
- SOLISTEN_LOCK(so);
- solisten_upcall_set(so, NULL, NULL);
- SOLISTEN_UNLOCK(so);
- return (0);
-}
-
-
/*
* CM_ID <-- DESTROYING
*
@@ -538,7 +327,6 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
unsigned long flags;
- int ret = 0, refcnt;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
/*
@@ -548,23 +336,19 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
wait_event(cm_id_priv->connect_wait,
!test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+ /*
+ * Since we're deleting the cm_id, drop any events that
+ * might arrive before the last dereference.
+ */
+ set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
+
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id_priv->state) {
case IW_CM_STATE_LISTEN:
cm_id_priv->state = IW_CM_STATE_DESTROYING;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) {
- refcnt =
- rem_port_from_listenlist(cm_id->local_addr.sin_port);
-
- if (refcnt == 0)
- ret = iw_destroy_listen(cm_id);
-
- cm_id->device->iwcm->destroy_listen_ep(cm_id);
- } else {
- ret = iw_destroy_listen(cm_id);
- cm_id->device->iwcm->destroy_listen_ep(cm_id);
- }
+ /* destroy the listening endpoint */
+ cm_id->device->iwcm->destroy_listen(cm_id);
spin_lock_irqsave(&cm_id_priv->lock, flags);
break;
case IW_CM_STATE_ESTABLISHED:
@@ -616,18 +400,28 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id)
struct iwcm_id_private *cm_id_priv;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
- BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
-
destroy_cm_id(cm_id);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
- wait_for_completion(&cm_id_priv->destroy_comp);
-
- if (cm_id->so)
- sock_release(cm_id->so);
+/**
+ * iw_cm_map - Use portmapper to map the ports
+ * @cm_id: connection manager pointer
+ * @active: Indicates the active side when true
+ * returns nonzero for error only if iwpm_create_mapinfo() fails
+ *
+ * Tries to add a mapping for a port using the Portmapper. If
+ * successful in mapping the IP/Port it will check the remote
+ * mapped IP address for a wildcard IP address and replace the
+ * zero IP address with the remote_addr.
+ */
+static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
+{
+ cm_id->m_local_addr = cm_id->local_addr;
+ cm_id->m_remote_addr = cm_id->remote_addr;
- free_cm_id(cm_id_priv);
+ return 0;
}
-EXPORT_SYMBOL(iw_destroy_cm_id);
/*
* CM_ID <-- LISTEN
@@ -639,10 +433,13 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
{
struct iwcm_id_private *cm_id_priv;
unsigned long flags;
- int ret, refcnt;
+ int ret;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ if (!backlog)
+ backlog = default_backlog;
+
ret = alloc_work_entries(cm_id_priv, backlog);
if (ret)
return ret;
@@ -652,33 +449,11 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
case IW_CM_STATE_IDLE:
cm_id_priv->state = IW_CM_STATE_LISTEN;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-
- if (rdma_cma_any_addr((struct sockaddr *)&cm_id->local_addr)) {
- refcnt =
- add_port_to_listenlist(cm_id->local_addr.sin_port);
-
- if (refcnt == 1) {
- ret = iw_create_listen(cm_id, backlog);
- } else if (refcnt <= 0) {
- ret = -EINVAL;
- } else {
- /* if refcnt > 1, a socket listener created
- * already. And we need not create socket
- * listener on other rdma devices/listen cm_id's
- * due to TOE. That is when a socket listener is
- * created with INADDR_ANY all registered TOE
- * devices will get a call to start
- * hardware listeners.
- */
- }
- } else {
- ret = iw_create_listen(cm_id, backlog);
- }
+ ret = iw_cm_map(cm_id, false);
if (!ret)
- cm_id->device->iwcm->create_listen_ep(cm_id, backlog);
- else
+ ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+ if (ret)
cm_id_priv->state = IW_CM_STATE_IDLE;
-
spin_lock_irqsave(&cm_id_priv->lock, flags);
break;
default:
@@ -806,39 +581,37 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
spin_lock_irqsave(&cm_id_priv->lock, flags);
if (cm_id_priv->state != IW_CM_STATE_IDLE) {
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
- wake_up_all(&cm_id_priv->connect_wait);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
/* Get the ib_qp given the QPN */
qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
if (!qp) {
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
- wake_up_all(&cm_id_priv->connect_wait);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
cm_id->device->iwcm->add_ref(qp);
cm_id_priv->qp = qp;
cm_id_priv->state = IW_CM_STATE_CONN_SENT;
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- ret = cm_id->device->iwcm->connect(cm_id, iw_param);
- if (ret) {
- spin_lock_irqsave(&cm_id_priv->lock, flags);
- if (cm_id_priv->qp) {
- cm_id->device->iwcm->rem_ref(qp);
- cm_id_priv->qp = NULL;
- }
- spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
- cm_id_priv->state = IW_CM_STATE_IDLE;
- clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
- wake_up_all(&cm_id_priv->connect_wait);
- }
+ ret = iw_cm_map(cm_id, true);
+ if (!ret)
+ ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+ if (!ret)
+ return 0; /* success */
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+err:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
return ret;
}
EXPORT_SYMBOL(iw_cm_connect);
@@ -873,7 +646,6 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
BUG_ON(iw_event->status);
cm_id = iw_create_cm_id(listen_id_priv->id.device,
- iw_event->so,
listen_id_priv->id.cm_handler,
listen_id_priv->id.context);
/* If the cm_id could not be created, ignore the request */
@@ -881,9 +653,10 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
goto out;
cm_id->provider_data = iw_event->provider_data;
- cm_id->local_addr = iw_event->local_addr;
+ cm_id->m_local_addr = iw_event->local_addr;
+ cm_id->m_remote_addr = iw_event->remote_addr;
+ cm_id->local_addr = listen_id_priv->id.local_addr;
cm_id->remote_addr = iw_event->remote_addr;
-
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
cm_id_priv->state = IW_CM_STATE_CONN_RECV;
@@ -911,10 +684,7 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
ret = cm_id->cm_handler(cm_id, iw_event);
if (ret) {
iw_cm_reject(cm_id, NULL, 0);
- set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- destroy_cm_id(cm_id);
- if (atomic_read(&cm_id_priv->refcount)==0)
- free_cm_id(cm_id_priv);
+ iw_destroy_cm_id(cm_id);
}
out:
@@ -978,8 +748,10 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
if (iw_event->status == 0) {
- cm_id_priv->id.local_addr = iw_event->local_addr;
- cm_id_priv->id.remote_addr = iw_event->remote_addr;
+ cm_id_priv->id.m_local_addr = iw_event->local_addr;
+ cm_id_priv->id.m_remote_addr = iw_event->remote_addr;
+ iw_event->local_addr = cm_id_priv->id.local_addr;
+ iw_event->remote_addr = cm_id_priv->id.remote_addr;
cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
} else {
/* REJECTED or RESET */
@@ -1100,7 +872,6 @@ static void cm_work_handler(struct work_struct *_work)
unsigned long flags;
int empty;
int ret = 0;
- int destroy_id;
spin_lock_irqsave(&cm_id_priv->lock, flags);
empty = list_empty(&cm_id_priv->work_list);
@@ -1113,20 +884,14 @@ static void cm_work_handler(struct work_struct *_work)
put_work(work);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- ret = process_event(cm_id_priv, &levent);
- if (ret) {
- set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- destroy_cm_id(&cm_id_priv->id);
- }
- BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
- destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- if (iwcm_deref_id(cm_id_priv)) {
- if (destroy_id) {
- BUG_ON(!list_empty(&cm_id_priv->work_list));
- free_cm_id(cm_id_priv);
- }
+ if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+ ret = process_event(cm_id_priv, &levent);
+ if (ret)
+ destroy_cm_id(&cm_id_priv->id);
+ } else
+ pr_debug("dropping event %d\n", levent.event);
+ if (iwcm_deref_id(cm_id_priv))
return;
- }
if (empty)
return;
spin_lock_irqsave(&cm_id_priv->lock, flags);
@@ -1269,7 +1034,7 @@ EXPORT_SYMBOL(iw_cm_init_qp_attr);
static int __init iw_cm_init(void)
{
- iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
+ iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM);
if (!iwcm_wq)
return -ENOMEM;
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.h b/sys/ofed/drivers/infiniband/core/iwcm.h
index 3f6cc82564c8b..82c2cd1b0a804 100644
--- a/sys/ofed/drivers/infiniband/core/iwcm.h
+++ b/sys/ofed/drivers/infiniband/core/iwcm.h
@@ -56,7 +56,7 @@ struct iwcm_id_private {
struct list_head work_free_list;
};
-#define IWCM_F_CALLBACK_DESTROY 1
+#define IWCM_F_DROP_EVENTS 1
#define IWCM_F_CONNECT_WAIT 2
#endif /* IWCM_H */
diff --git a/sys/ofed/drivers/infiniband/core/iwpm_util.h b/sys/ofed/drivers/infiniband/core/iwpm_util.h
new file mode 100644
index 0000000000000..5553b8ce34f11
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/iwpm_util.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWPM_UTIL_H
+#define _IWPM_UTIL_H
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <linux/errno.h>
+#include <linux/rwsem.h>
+
+#include <rdma/iw_portmap.h>
+
+#define IWPM_PID_UNDEFINED -1
+#define IWPM_PID_UNAVAILABLE -2
+
+#define IWPM_REG_UNDEF 0x01
+#define IWPM_REG_VALID 0x02
+#define IWPM_REG_INCOMPL 0x04
+
+/**
+ * iwpm_compare_sockaddr - Compare two sockaddr storage structs
+ *
+ * Returns 0 if they are holding the same ip/tcp address info,
+ * otherwise returns 1
+ */
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr);
+
+/**
+ * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port
+ * @sockaddr: Socket address to print
+ * @msg: Message to print
+ */
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+#endif
diff --git a/sys/ofed/drivers/infiniband/core/mad.c b/sys/ofed/drivers/infiniband/core/mad.c
index a5b11134d38fc..97439a38afa54 100644
--- a/sys/ofed/drivers/infiniband/core/mad.c
+++ b/sys/ofed/drivers/infiniband/core/mad.c
@@ -3,6 +3,7 @@
* Copyright (c) 2005 Intel Corporation. All rights reserved.
* Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
* Copyright (c) 2009 HNR Consulting. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,22 +36,21 @@
*/
#define LINUXKPI_PARAM_PREFIX ibcore_
+#define KBUILD_MODNAME "ibcore"
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/string.h>
#include <rdma/ib_cache.h>
#include "mad_priv.h"
#include "mad_rmpp.h"
#include "smi.h"
+#include "opa_smi.h"
#include "agent.h"
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("kernel IB MAD API");
-MODULE_AUTHOR("Hal Rosenstock");
-MODULE_AUTHOR("Sean Hefty");
+#include "core_priv.h"
static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
@@ -60,29 +60,9 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
-static struct kmem_cache *ib_mad_cache;
-
static struct list_head ib_mad_port_list;
static u32 ib_mad_client_id = 0;
-
-/*
- * Timeout FIFO (tf) param
- */
-enum {
- /* min time between 2 consecutive activations of tf workqueue */
- MIN_BETWEEN_ACTIVATIONS_MS = 5
-};
-
-/*
- * SA congestion control params
- */
-enum {
- MAX_OUTSTANDING_SA_MADS = 10,
- MIN_TIME_FOR_SA_MAD_SEND_MS = 20,
- MAX_SA_MADS = 10000
-};
-
/* Port list lock */
static DEFINE_SPINLOCK(ib_mad_port_list_lock);
@@ -92,7 +72,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method,
static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
static struct ib_mad_agent_private *find_mad_agent(
struct ib_mad_port_private *port_priv,
- struct ib_mad *mad);
+ const struct ib_mad_hdr *mad);
static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_private *mad);
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
@@ -103,512 +83,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
u8 mgmt_class);
static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
struct ib_mad_agent_private *agent_priv);
-static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr,
- u32 timeout_ms, u32 retries_left);
-
-
-/*
- * Timeout FIFO functions - implements FIFO with timeout mechanism
- */
-
-static void activate_timeout_handler_task(unsigned long data)
-{
- struct to_fifo *tf;
-
- tf = (struct to_fifo *)data;
- del_timer(&tf->timer);
- queue_work(tf->workq, &tf->work);
-}
-
-static unsigned long adjusted_time(unsigned long last, unsigned long next)
-{
- unsigned long min_next;
-
- min_next = last + msecs_to_jiffies(MIN_BETWEEN_ACTIVATIONS_MS);
- if (time_after(min_next, next))
- return min_next;
-
- return next;
-}
-
-static void notify_failure(struct ib_mad_send_wr_private *mad_send_wr,
- enum ib_wc_status status)
-{
- struct ib_mad_send_wc mad_send_wc;
- struct ib_mad_agent_private *mad_agent_priv;
-
- mad_send_wc.status = status;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- mad_agent_priv = mad_send_wr->mad_agent_priv;
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc);
-}
-
-static inline struct sa_cc_data *
-get_cc_obj(struct ib_mad_send_wr_private *mad_send_wr)
-{
- return &mad_send_wr->mad_agent_priv->qp_info->port_priv->sa_cc;
-}
-
-static inline struct ib_mad_send_wr_private *tfe_to_mad(struct tf_entry *tfe)
-{
- return container_of(tfe, struct ib_mad_send_wr_private, tf_list);
-}
-
-static void timeout_handler_task(struct work_struct *work)
-{
- struct tf_entry *tmp1, *tmp2;
- struct list_head *list_item, exp_lst;
- unsigned long flags, curr_time;
- int lst_empty;
- struct to_fifo *tf;
-
- tf = container_of(work, struct to_fifo, work);
- do {
- INIT_LIST_HEAD(&exp_lst);
-
- spin_lock_irqsave(&tf->lists_lock, flags);
- curr_time = jiffies;
- list_for_each(list_item, &tf->to_head) {
- tmp1 = list_entry(list_item, struct tf_entry, to_list);
- if (time_before(curr_time, tmp1->exp_time))
- break;
- list_del(&tmp1->fifo_list);
- tf->num_items--;
- }
-
- /* cut list up to and including list_item->prev */
- list_cut_position(&exp_lst, &tf->to_head, list_item->prev);
- spin_unlock_irqrestore(&tf->lists_lock, flags);
-
- lst_empty = list_empty(&exp_lst);
- list_for_each_entry_safe(tmp1, tmp2, &exp_lst, to_list) {
- list_del(&tmp1->to_list);
- if (tmp1->canceled) {
- tmp1->canceled = 0;
- notify_failure(tfe_to_mad(tmp1), IB_WC_WR_FLUSH_ERR);
- } else {
- notify_failure(tfe_to_mad(tmp1), IB_WC_RESP_TIMEOUT_ERR);
- }
- }
- } while (!lst_empty);
-
- spin_lock_irqsave(&tf->lists_lock, flags);
- if (!list_empty(&tf->to_head)) {
- tmp1 = list_entry(tf->to_head.next, struct tf_entry, to_list);
- mod_timer(&tf->timer, adjusted_time(curr_time, tmp1->exp_time));
- }
- spin_unlock_irqrestore(&tf->lists_lock, flags);
-}
-
-/**
- * tf_create - creates new timeout-fifo object
- * @fifo_size: Maximum fifo size
- *
- * Allocate and initialize new timeout-fifo object
- */
-static struct to_fifo *tf_create(u32 fifo_size)
-{
- struct to_fifo *tf;
-
- tf = kzalloc(sizeof(*tf), GFP_KERNEL);
- if (tf) {
- tf->workq = create_singlethread_workqueue("to_fifo");
- if (!tf->workq) {
- kfree(tf);
- return NULL;
- }
- spin_lock_init(&tf->lists_lock);
- INIT_LIST_HEAD(&tf->to_head);
- INIT_LIST_HEAD(&tf->fifo_head);
- init_timer(&tf->timer);
- INIT_WORK(&tf->work, timeout_handler_task);
- tf->timer.data = (unsigned long) tf;
- tf->timer.function = activate_timeout_handler_task;
- tf->timer.expires = jiffies;
- tf->fifo_size = fifo_size;
- tf->stop_enqueue = 0;
- tf->num_items = 0;
- }
-
- return tf;
-}
-
-/**
- * tf_enqueue - enqueue item to timeout-fifo object
- * @tf:timeout-fifo object
- * @item: item to enqueue.
- * @timeout_ms: item expiration time in ms.
- *
- * Enqueue item to fifo and modify expiration timer when required.
- *
- * Returns 0 on success and negative on failure.
- */
-static int tf_enqueue(struct to_fifo *tf, struct tf_entry *item, u32 timeout_ms)
-{
- struct tf_entry *tmp;
- struct list_head *list_item;
- unsigned long flags;
-
- item->exp_time = jiffies + msecs_to_jiffies(timeout_ms);
-
- spin_lock_irqsave(&tf->lists_lock, flags);
- if (tf->num_items >= tf->fifo_size || tf->stop_enqueue) {
- spin_unlock_irqrestore(&tf->lists_lock, flags);
- return -EBUSY;
- }
-
- /* Insert item to timeout list */
- list_for_each_prev(list_item, &tf->to_head) {
- tmp = list_entry(list_item, struct tf_entry, to_list);
- if (time_after(item->exp_time, tmp->exp_time))
- break;
- }
-
- list_add(&item->to_list, list_item);
-
- /* Insert item to fifo list */
- list_add_tail(&item->fifo_list, &tf->fifo_head);
-
- tf->num_items++;
-
- /* modify expiration timer if required */
- if (list_item == &tf->to_head)
- mod_timer(&tf->timer, item->exp_time);
-
- spin_unlock_irqrestore(&tf->lists_lock, flags);
-
- return 0;
-}
-
-/**
- * tf_dequeue - dequeue item from timeout-fifo object
- * @tf:timeout-fifo object
- * @time_left_ms: returns the time left for expiration in ms.
- *
- * Dequeue item from fifo and modify expiration timer when required.
- *
- * Returns pointer to tf_entry on success and NULL on failure.
- */
-static struct tf_entry *tf_dequeue(struct to_fifo *tf, u32 *time_left_ms)
-{
- unsigned long flags;
- unsigned long time_left;
- struct tf_entry *tmp, *tmp1;
- bool found = false;
-
- spin_lock_irqsave(&tf->lists_lock, flags);
- if (list_empty(&tf->fifo_head)) {
- spin_unlock_irqrestore(&tf->lists_lock, flags);
- return NULL;
- }
-
- list_for_each_entry(tmp, &tf->fifo_head, fifo_list) {
- if (!tmp->canceled) {
- found = true;
- break;
- }
- }
-
- if (!found) {
- spin_unlock_irqrestore(&tf->lists_lock, flags);
- return NULL;
- }
-
- /* modify timer in case enqueued item is the next to expire */
- if (tf->to_head.next == &tmp->to_list) {
- if (list_is_last(&tmp->to_list, &tf->to_head)) {
- del_timer(&tf->timer);
- } else {
- tmp1 = list_entry(tmp->to_list.next, struct tf_entry, to_list);
- mod_timer(&tf->timer, tmp1->exp_time);
- }
- }
- list_del(&tmp->fifo_list);
- list_del(&tmp->to_list);
- tf->num_items--;
- spin_unlock_irqrestore(&tf->lists_lock, flags);
-
- time_left = tmp->exp_time - jiffies;
- if ((long) time_left <= 0)
- time_left = 0;
- *time_left_ms = jiffies_to_msecs(time_left);
-
- return tmp;
-}
-
-static void tf_stop_enqueue(struct to_fifo *tf)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&tf->lists_lock, flags);
- tf->stop_enqueue = 1;
- spin_unlock_irqrestore(&tf->lists_lock, flags);
-}
-
-/**
- * tf_free - free empty timeout-fifo object
- * @tf:timeout-fifo object
- *
- */
-static void tf_free(struct to_fifo *tf)
-{
- del_timer_sync(&tf->timer);
- flush_workqueue(tf->workq);
- destroy_workqueue(tf->workq);
-
- kfree(tf);
-}
-
-/**
- * tf_free_agent - free MADs related to specific MAD agent from timeout-fifo
- * @tf:timeout-fifo object
- * @mad_agent_priv: MAD agent.
- *
- */
-static void tf_free_agent(struct to_fifo *tf, struct ib_mad_agent_private *mad_agent_priv)
-{
- unsigned long flags;
- struct tf_entry *tmp, *tmp1;
- struct list_head tmp_head;
-
- INIT_LIST_HEAD(&tmp_head);
- spin_lock_irqsave(&tf->lists_lock, flags);
- list_for_each_entry_safe(tmp, tmp1, &tf->fifo_head, fifo_list) {
- if (tfe_to_mad(tmp)->mad_agent_priv == mad_agent_priv) {
- list_del(&tmp->to_list);
- list_move(&tmp->fifo_list, &tmp_head);
- tf->num_items--;
- }
- }
- spin_unlock_irqrestore(&tf->lists_lock, flags);
-
- list_for_each_entry_safe(tmp, tmp1, &tmp_head, fifo_list) {
- list_del(&tmp->fifo_list);
- notify_failure(tfe_to_mad(tmp), IB_WC_WR_FLUSH_ERR);
- }
-}
-
-/**
- * tf_modify_item - to modify expiration time for specific item
- * @tf:timeout-fifo object
- * @mad_agent_priv: MAD agent.
- * @send_buf: the MAD to modify in queue
- * @timeout_ms: new timeout to set.
- *
- * Returns 0 if item found on list and -ENXIO if not.
- *
- * Note: The send_buf may point on MAD that is already released.
- * Therefore we can't use this struct before finding it in the list
- */
-static int tf_modify_item(struct to_fifo *tf,
- struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_send_buf *send_buf, u32 timeout_ms)
-{
- struct tf_entry *tmp, *item;
- struct list_head *list_item;
- unsigned long flags;
- int found = 0;
-
- spin_lock_irqsave(&tf->lists_lock, flags);
- list_for_each_entry(item, &tf->fifo_head, fifo_list) {
- if (tfe_to_mad(item)->mad_agent_priv == mad_agent_priv &&
- &tfe_to_mad(item)->send_buf == send_buf) {
- found = 1;
- break;
- }
- }
-
- if (!found) {
- spin_unlock_irqrestore(&tf->lists_lock, flags);
- return -ENXIO;
- }
-
- item->exp_time = jiffies + msecs_to_jiffies(timeout_ms);
-
- if (timeout_ms) {
- list_del(&item->to_list);
- list_for_each_prev(list_item, &tf->to_head) {
- tmp = list_entry(list_item, struct tf_entry, to_list);
- if (time_after(item->exp_time, tmp->exp_time))
- break;
- }
- list_add(&item->to_list, list_item);
-
- /* modify expiration timer if required */
- if (list_item == &tf->to_head)
- mod_timer(&tf->timer, item->exp_time);
- } else {
- /*
- * when item canceled (timeout_ms == 0) move item to
- * head of timeout list and to the tail of fifo list
- */
- item->canceled = 1;
- list_move(&item->to_list, &tf->to_head);
- list_move_tail(&item->fifo_list, &tf->fifo_head);
- mod_timer(&tf->timer, item->exp_time);
- }
- spin_unlock_irqrestore(&tf->lists_lock, flags);
-
- return 0;
-}
-
-/*
- * SA congestion control functions
- */
-
-/*
- * Defines which MAD is under congestion control.
- */
-static int is_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr)
-{
- struct ib_mad_hdr *mad;
-
- mad = (struct ib_mad_hdr *)mad_send_wr->send_buf.mad;
-
- return ((mad_send_wr->send_buf.timeout_ms) &&
- (mad->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) &&
- ((mad->method == IB_MGMT_METHOD_GET) ||
- (mad->method == IB_MGMT_METHOD_SET)));
-}
-
-/*
- * Notify that SA congestion controlled MAD is done.
- * to allow dequeuing SA MAD from congestion control queue.
- */
-static void sa_cc_mad_done(struct sa_cc_data *cc_obj)
-{
- unsigned long flags;
- struct tf_entry *tfe;
- struct ib_mad_send_wr_private *mad_send_wr;
- u32 time_left_ms, timeout_ms, retries;
- int ret;
-
- do {
- spin_lock_irqsave(&cc_obj->lock, flags);
- tfe = tf_dequeue(cc_obj->tf, &time_left_ms);
- if (!tfe) {
- if (cc_obj->outstanding > 0)
- cc_obj->outstanding--;
- spin_unlock_irqrestore(&cc_obj->lock, flags);
- break;
- }
- spin_unlock_irqrestore(&cc_obj->lock, flags);
- mad_send_wr = tfe_to_mad(tfe);
- time_left_ms += MIN_TIME_FOR_SA_MAD_SEND_MS;
- if (time_left_ms > mad_send_wr->send_buf.timeout_ms) {
- retries = time_left_ms / mad_send_wr->send_buf.timeout_ms - 1;
- timeout_ms = mad_send_wr->send_buf.timeout_ms;
- } else {
- retries = 0;
- timeout_ms = time_left_ms;
- }
- ret = send_sa_cc_mad(mad_send_wr, timeout_ms, retries);
- if (ret) {
- if (ret == -ENOMEM)
- notify_failure(mad_send_wr, IB_WC_GENERAL_ERR);
- else
- notify_failure(mad_send_wr, IB_WC_LOC_QP_OP_ERR);
- }
- } while (ret);
-}
-
-/*
- * Send SA MAD under congestion control.
- */
-static int sa_cc_mad_send(struct ib_mad_send_wr_private *mad_send_wr)
-{
- unsigned long flags;
- int ret;
- struct sa_cc_data *cc_obj;
-
- cc_obj = get_cc_obj(mad_send_wr);
- spin_lock_irqsave(&cc_obj->lock, flags);
- if (cc_obj->outstanding < MAX_OUTSTANDING_SA_MADS) {
- cc_obj->outstanding++;
- spin_unlock_irqrestore(&cc_obj->lock, flags);
- ret = send_sa_cc_mad(mad_send_wr, mad_send_wr->send_buf.timeout_ms,
- mad_send_wr->retries_left);
- if (ret)
- sa_cc_mad_done(cc_obj);
-
- } else {
- int qtime = (mad_send_wr->send_buf.timeout_ms *
- (mad_send_wr->retries_left + 1))
- - MIN_TIME_FOR_SA_MAD_SEND_MS;
-
- if (qtime < 0)
- qtime = 0;
- ret = tf_enqueue(cc_obj->tf, &mad_send_wr->tf_list, (u32)qtime);
-
- spin_unlock_irqrestore(&cc_obj->lock, flags);
- }
-
- return ret;
-}
-
-/*
- * Initialize SA congestion control.
- */
-static int sa_cc_init(struct sa_cc_data *cc_obj)
-{
- spin_lock_init(&cc_obj->lock);
- cc_obj->outstanding = 0;
- cc_obj->tf = tf_create(MAX_SA_MADS);
- if (!cc_obj->tf)
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Cancel SA MADs from congestion control queue.
- */
-static void cancel_sa_cc_mads(struct ib_mad_agent_private *mad_agent_priv)
-{
- tf_free_agent(mad_agent_priv->qp_info->port_priv->sa_cc.tf,
- mad_agent_priv);
-}
-
-/*
- * Modify timeout of SA MAD on congestion control queue.
- */
-static int modify_sa_cc_mad(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_send_buf *send_buf, u32 timeout_ms)
-{
- int ret;
- int qtime = 0;
-
- if (timeout_ms > MIN_TIME_FOR_SA_MAD_SEND_MS)
- qtime = timeout_ms - MIN_TIME_FOR_SA_MAD_SEND_MS;
-
- ret = tf_modify_item(mad_agent_priv->qp_info->port_priv->sa_cc.tf,
- mad_agent_priv, send_buf, (u32)qtime);
- return ret;
-}
-
-static void sa_cc_destroy(struct sa_cc_data *cc_obj)
-{
- struct ib_mad_send_wr_private *mad_send_wr;
- struct tf_entry *tfe;
- struct ib_mad_send_wc mad_send_wc;
- struct ib_mad_agent_private *mad_agent_priv;
- u32 time_left_ms;
-
- mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
- mad_send_wc.vendor_err = 0;
-
- tf_stop_enqueue(cc_obj->tf);
- tfe = tf_dequeue(cc_obj->tf, &time_left_ms);
- while (tfe) {
- mad_send_wr = tfe_to_mad(tfe);
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- mad_agent_priv = mad_send_wr->mad_agent_priv;
- mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
- &mad_send_wc);
- tfe = tf_dequeue(cc_obj->tf, &time_left_ms);
- }
- tf_free(cc_obj->tf);
-}
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc);
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc);
/*
* Returns a ib_mad_port_private structure or NULL for a device/port
@@ -704,12 +181,12 @@ static int is_vendor_method_in_use(
return 0;
}
-int ib_response_mad(struct ib_mad *mad)
+int ib_response_mad(const struct ib_mad_hdr *hdr)
{
- return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) ||
- (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
- ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) &&
- (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP)));
+ return ((hdr->method & IB_MGMT_METHOD_RESP) ||
+ (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+ ((hdr->mgmt_class == IB_MGMT_CLASS_BM) &&
+ (hdr->attr_mod & IB_BM_ATTR_MOD_RESP)));
}
EXPORT_SYMBOL(ib_response_mad);
@@ -723,7 +200,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
u8 rmpp_version,
ib_mad_send_handler send_handler,
ib_mad_recv_handler recv_handler,
- void *context)
+ void *context,
+ u32 registration_flags)
{
struct ib_mad_port_private *port_priv;
struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
@@ -739,68 +217,109 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
/* Validate parameters */
qpn = get_spl_qp_index(qp_type);
- if (qpn == -1)
+ if (qpn == -1) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid QP Type %d\n",
+ qp_type);
goto error1;
+ }
- if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION)
+ if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid RMPP Version %u\n",
+ rmpp_version);
goto error1;
+ }
/* Validate MAD registration request if supplied */
if (mad_reg_req) {
- if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION)
+ if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid Class Version %u\n",
+ mad_reg_req->mgmt_class_version);
goto error1;
- if (!recv_handler)
+ }
+ if (!recv_handler) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: no recv_handler\n");
goto error1;
+ }
if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
/*
* IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
* one in this range currently allowed
*/
if (mad_reg_req->mgmt_class !=
- IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
+ mad_reg_req->mgmt_class);
goto error1;
+ }
} else if (mad_reg_req->mgmt_class == 0) {
/*
* Class 0 is reserved in IBA and is used for
* aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
*/
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid Mgmt Class 0\n");
goto error1;
} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
/*
* If class is in "new" vendor range,
* ensure supplied OUI is not zero
*/
- if (!is_vendor_oui(mad_reg_req->oui))
+ if (!is_vendor_oui(mad_reg_req->oui)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: No OUI specified for class 0x%x\n",
+ mad_reg_req->mgmt_class);
goto error1;
+ }
}
/* Make sure class supplied is consistent with RMPP */
if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
- if (rmpp_version)
+ if (rmpp_version) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
+ mad_reg_req->mgmt_class);
goto error1;
+ }
}
+
/* Make sure class supplied is consistent with QP type */
if (qp_type == IB_QPT_SMI) {
if ((mad_reg_req->mgmt_class !=
IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
(mad_reg_req->mgmt_class !=
- IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
+ mad_reg_req->mgmt_class);
goto error1;
+ }
} else {
if ((mad_reg_req->mgmt_class ==
IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
(mad_reg_req->mgmt_class ==
- IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
+ mad_reg_req->mgmt_class);
goto error1;
+ }
}
} else {
/* No registration request supplied */
if (!send_handler)
goto error1;
+ if (registration_flags & IB_MAD_USER_RMPP)
+ goto error1;
}
/* Validate device and port */
port_priv = ib_get_mad_port(device, port_num);
if (!port_priv) {
+ dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n");
ret = ERR_PTR(-ENODEV);
goto error1;
}
@@ -808,6 +327,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
/* Verify the QP requested is supported. For example, Ethernet devices
* will not have QP0 */
if (!port_priv->qp_info[qpn].qp) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: QP %d not supported\n", qpn);
ret = ERR_PTR(-EPROTONOSUPPORT);
goto error1;
}
@@ -819,13 +340,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
goto error1;
}
- mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(mad_agent_priv->agent.mr)) {
- ret = ERR_PTR(-ENOMEM);
- goto error2;
- }
-
if (mad_reg_req) {
reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
if (!reg_req) {
@@ -844,6 +358,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
mad_agent_priv->agent.context = context;
mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
mad_agent_priv->agent.port_num = port_num;
+ mad_agent_priv->agent.flags = registration_flags;
spin_lock_init(&mad_agent_priv->lock);
INIT_LIST_HEAD(&mad_agent_priv->send_list);
INIT_LIST_HEAD(&mad_agent_priv->wait_list);
@@ -909,8 +424,6 @@ error4:
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
kfree(reg_req);
error3:
- ib_dereg_mr(mad_agent_priv->agent.mr);
-error2:
kfree(mad_agent_priv);
error1:
return ret;
@@ -1056,7 +569,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
*/
cancel_mads(mad_agent_priv);
port_priv = mad_agent_priv->qp_info->port_priv;
- cancel_delayed_work_sync(&mad_agent_priv->timed_work);
+ cancel_delayed_work(&mad_agent_priv->timed_work);
spin_lock_irqsave(&port_priv->reg_lock, flags);
remove_mad_reg_req(mad_agent_priv);
@@ -1070,7 +583,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
wait_for_completion(&mad_agent_priv->comp);
kfree(mad_agent_priv->reg_req);
- ib_dereg_mr(mad_agent_priv->agent.mr);
kfree(mad_agent_priv);
}
@@ -1099,7 +611,6 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_snoop_private *mad_snoop_priv;
- if (!IS_ERR(mad_agent)) {
/* If the TID is zero, the agent can only snoop. */
if (mad_agent->hi_tid) {
mad_agent_priv = container_of(mad_agent,
@@ -1112,8 +623,6 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
agent);
unregister_mad_snoop(mad_snoop_priv);
}
- }
-
return 0;
}
EXPORT_SYMBOL(ib_unregister_mad_agent);
@@ -1174,7 +683,7 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info,
atomic_inc(&mad_snoop_priv->refcount);
spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
- mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+ mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL,
mad_recv_wc);
deref_snoop_agent(mad_snoop_priv);
spin_lock_irqsave(&qp_info->snoop_lock, flags);
@@ -1182,12 +691,11 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info,
spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
}
-static void build_smp_wc(struct ib_qp *qp,
- u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
- struct ib_wc *wc)
+static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid,
+ u16 pkey_index, u8 port_num, struct ib_wc *wc)
{
memset(wc, 0, sizeof *wc);
- wc->wr_id = wr_id;
+ wc->wr_cqe = cqe;
wc->status = IB_WC_SUCCESS;
wc->opcode = IB_WC_RECV;
wc->pkey_index = pkey_index;
@@ -1200,6 +708,32 @@ static void build_smp_wc(struct ib_qp *qp,
wc->port_num = port_num;
}
+static size_t mad_priv_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_mad_private) + mp->mad_size;
+}
+
+static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags)
+{
+ size_t size = sizeof(struct ib_mad_private) + mad_size;
+ struct ib_mad_private *ret = kzalloc(size, flags);
+
+ if (ret)
+ ret->mad_size = mad_size;
+
+ return ret;
+}
+
+static size_t port_mad_size(const struct ib_mad_port_private *port_priv)
+{
+ return rdma_max_mad_size(port_priv->device, port_priv->port_num);
+}
+
+static size_t mad_priv_dma_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_grh) + mp->mad_size;
+}
+
/*
* Return 0 if SMP is to be sent
* Return 1 if SMP was consumed locally (whether or not solicited)
@@ -1210,6 +744,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
{
int ret = 0;
struct ib_smp *smp = mad_send_wr->send_buf.mad;
+ struct opa_smp *opa_smp = (struct opa_smp *)smp;
unsigned long flags;
struct ib_mad_local_private *local;
struct ib_mad_private *mad_priv;
@@ -1218,11 +753,16 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
struct ib_device *device = mad_agent_priv->agent.device;
u8 port_num;
struct ib_wc mad_wc;
- struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH &&
+ struct ib_ud_wr *send_wr = &mad_send_wr->send_wr;
+ size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
+ u16 out_mad_pkey_index = 0;
+ u16 drslid;
+ bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
+ if (rdma_cap_ib_switch(device) &&
smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
- port_num = send_wr->wr.ud.port_num;
+ port_num = send_wr->port_num;
else
port_num = mad_agent_priv->agent.port_num;
@@ -1232,50 +772,86 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
* If we are at the start of the LID routed part, don't update the
* hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
*/
- if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) !=
- IB_LID_PERMISSIVE)
- goto out;
- if (smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
- IB_SMI_DISCARD) {
- ret = -EINVAL;
- printk(KERN_ERR PFX "Invalid directed route\n");
- goto out;
- }
+ if (opa && smp->class_version == OPA_SMP_CLASS_VERSION) {
+ u32 opa_drslid;
+
+ if ((opa_get_smp_direction(opa_smp)
+ ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
+ OPA_LID_PERMISSIVE &&
+ opa_smi_handle_dr_smp_send(opa_smp,
+ rdma_cap_ib_switch(device),
+ port_num) == IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid directed route\n");
+ goto out;
+ }
+ opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
+ if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) &&
+ opa_drslid & 0xffff0000) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n",
+ opa_drslid);
+ goto out;
+ }
+ drslid = (u16)(opa_drslid & 0x0000ffff);
- /* Check to post send on QP or process locally */
- if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
- smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
- goto out;
+ /* Check to post send on QP or process locally */
+ if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD &&
+ opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
+ goto out;
+ } else {
+ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+ IB_LID_PERMISSIVE &&
+ smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
+ IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "Invalid directed route\n");
+ goto out;
+ }
+ drslid = be16_to_cpu(smp->dr_slid);
+
+ /* Check to post send on QP or process locally */
+ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+ smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+ goto out;
+ }
local = kmalloc(sizeof *local, GFP_ATOMIC);
if (!local) {
ret = -ENOMEM;
- printk(KERN_ERR PFX "No memory for ib_mad_local_private\n");
+ dev_err(&device->dev, "No memory for ib_mad_local_private\n");
goto out;
}
local->mad_priv = NULL;
local->recv_mad_agent = NULL;
- mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+ mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC);
if (!mad_priv) {
ret = -ENOMEM;
- printk(KERN_ERR PFX "No memory for local response MAD\n");
+ dev_err(&device->dev, "No memory for local response MAD\n");
kfree(local);
goto out;
}
build_smp_wc(mad_agent_priv->agent.qp,
- send_wr->wr_id, be16_to_cpu(smp->dr_slid),
- send_wr->wr.ud.pkey_index,
- send_wr->wr.ud.port_num, &mad_wc);
+ send_wr->wr.wr_cqe, drslid,
+ send_wr->pkey_index,
+ send_wr->port_num, &mad_wc);
+
+ if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) {
+ mad_wc.byte_len = mad_send_wr->send_buf.hdr_len
+ + mad_send_wr->send_buf.data_len
+ + sizeof(struct ib_grh);
+ }
/* No GRH for DR SMP */
ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
- (struct ib_mad *)smp,
- (struct ib_mad *)&mad_priv->mad);
+ (const struct ib_mad_hdr *)smp, mad_size,
+ (struct ib_mad_hdr *)mad_priv->mad,
+ &mad_size, &out_mad_pkey_index);
switch (ret)
{
case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
- if (ib_response_mad(&mad_priv->mad.mad) &&
+ if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) &&
mad_agent_priv->agent.recv_handler) {
local->mad_priv = mad_priv;
local->recv_mad_agent = mad_agent_priv;
@@ -1285,39 +861,43 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
*/
atomic_inc(&mad_agent_priv->refcount);
} else
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
break;
case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
break;
case IB_MAD_RESULT_SUCCESS:
/* Treat like an incoming receive MAD */
port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
mad_agent_priv->agent.port_num);
if (port_priv) {
- memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+ memcpy(mad_priv->mad, smp, mad_priv->mad_size);
recv_mad_agent = find_mad_agent(port_priv,
- &mad_priv->mad.mad);
+ (const struct ib_mad_hdr *)mad_priv->mad);
}
if (!port_priv || !recv_mad_agent) {
/*
* No receiving agent so drop packet and
* generate send completion.
*/
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
break;
}
local->mad_priv = mad_priv;
local->recv_mad_agent = recv_mad_agent;
break;
default:
- kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(mad_priv);
kfree(local);
ret = -EINVAL;
goto out;
}
local->mad_send_wr = mad_send_wr;
+ if (opa) {
+ local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index;
+ local->return_wc_byte_len = mad_size;
+ }
/* Reference MAD agent until send side of local completion handled */
atomic_inc(&mad_agent_priv->refcount);
/* Queue local completion to local list */
@@ -1331,11 +911,11 @@ out:
return ret;
}
-static int get_pad_size(int hdr_len, int data_len)
+static int get_pad_size(int hdr_len, int data_len, size_t mad_size)
{
int seg_size, pad;
- seg_size = sizeof(struct ib_mad) - hdr_len;
+ seg_size = mad_size - hdr_len;
if (data_len && seg_size) {
pad = seg_size - data_len % seg_size;
return pad == seg_size ? 0 : pad;
@@ -1354,14 +934,15 @@ static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
}
static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
- gfp_t gfp_mask)
+ size_t mad_size, gfp_t gfp_mask)
{
struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
struct ib_rmpp_segment *seg = NULL;
int left, seg_size, pad;
- send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+ send_buf->seg_size = mad_size - send_buf->hdr_len;
+ send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR;
seg_size = send_buf->seg_size;
pad = send_wr->pad;
@@ -1369,9 +950,9 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
if (!seg) {
- printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem "
- "alloc failed for len %zd, gfp %#x\n",
- sizeof (*seg) + seg_size, gfp_mask);
+ dev_err(&send_buf->mad_agent->device->dev,
+ "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n",
+ sizeof (*seg) + seg_size, gfp_mask);
free_send_rmpp_list(send_wr);
return -ENOMEM;
}
@@ -1394,33 +975,52 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
return 0;
}
+int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent)
+{
+ return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
+}
+EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
+
struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
u32 remote_qpn, u16 pkey_index,
int rmpp_active,
int hdr_len, int data_len,
- gfp_t gfp_mask)
+ gfp_t gfp_mask,
+ u8 base_version)
{
struct ib_mad_agent_private *mad_agent_priv;
struct ib_mad_send_wr_private *mad_send_wr;
int pad, message_size, ret, size;
void *buf;
+ size_t mad_size;
+ bool opa;
mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
agent);
- pad = get_pad_size(hdr_len, data_len);
+
+ opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num);
+
+ if (opa && base_version == OPA_MGMT_BASE_VERSION)
+ mad_size = sizeof(struct opa_mad);
+ else
+ mad_size = sizeof(struct ib_mad);
+
+ pad = get_pad_size(hdr_len, data_len, mad_size);
message_size = hdr_len + data_len + pad;
- if ((!mad_agent->rmpp_version &&
- (rmpp_active || message_size > sizeof(struct ib_mad))) ||
- (!rmpp_active && message_size > sizeof(struct ib_mad)))
- return ERR_PTR(-EINVAL);
+ if (ib_mad_kernel_rmpp_agent(mad_agent)) {
+ if (!rmpp_active && message_size > mad_size)
+ return ERR_PTR(-EINVAL);
+ } else
+ if (rmpp_active || message_size > mad_size)
+ return ERR_PTR(-EINVAL);
- size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+ size = rmpp_active ? hdr_len : mad_size;
buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
if (!buf)
return ERR_PTR(-ENOMEM);
- mad_send_wr = buf + size;
+ mad_send_wr = (struct ib_mad_send_wr_private *)((char *)buf + size);
INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
mad_send_wr->send_buf.mad = buf;
mad_send_wr->send_buf.hdr_len = hdr_len;
@@ -1429,21 +1029,30 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
mad_send_wr->mad_agent_priv = mad_agent_priv;
mad_send_wr->sg_list[0].length = hdr_len;
- mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
- mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
- mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
-
- mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
- mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
- mad_send_wr->send_wr.num_sge = 2;
- mad_send_wr->send_wr.opcode = IB_WR_SEND;
- mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
- mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
- mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
- mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+ mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ /* OPA MADs don't have to be the full 2048 bytes */
+ if (opa && base_version == OPA_MGMT_BASE_VERSION &&
+ data_len < mad_size - hdr_len)
+ mad_send_wr->sg_list[1].length = data_len;
+ else
+ mad_send_wr->sg_list[1].length = mad_size - hdr_len;
+
+ mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
+ mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list;
+ mad_send_wr->send_wr.wr.num_sge = 2;
+ mad_send_wr->send_wr.wr.opcode = IB_WR_SEND;
+ mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED;
+ mad_send_wr->send_wr.remote_qpn = remote_qpn;
+ mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY;
+ mad_send_wr->send_wr.pkey_index = pkey_index;
if (rmpp_active) {
- ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+ ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask);
if (ret) {
kfree(buf);
return ERR_PTR(ret);
@@ -1513,7 +1122,7 @@ static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
return ib_get_rmpp_segment(&mad_send_wr->send_buf,
mad_send_wr->seg_num);
else
- return mad_send_wr->send_buf.mad +
+ return (char *)mad_send_wr->send_buf.mad +
mad_send_wr->send_buf.hdr_len;
}
@@ -1545,8 +1154,9 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
/* Set WR ID to find mad_send_wr upon completion */
qp_info = mad_send_wr->mad_agent_priv->qp_info;
- mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
mad_agent = mad_send_wr->send_buf.mad_agent;
sge = mad_send_wr->sg_list;
@@ -1557,22 +1167,23 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr)))
return -ENOMEM;
+ mad_send_wr->header_mapping = sge[0].addr;
+
sge[1].addr = ib_dma_map_single(mad_agent->device,
ib_get_payload(mad_send_wr),
sge[1].length,
DMA_TO_DEVICE);
-
if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) {
- ret = -ENOMEM;
- goto dma1_err;
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ return -ENOMEM;
}
-
- mad_send_wr->header_mapping = sge[0].addr;
mad_send_wr->payload_mapping = sge[1].addr;
spin_lock_irqsave(&qp_info->send_queue.lock, flags);
if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
- ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+ ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,
&bad_send_wr);
list = &qp_info->send_queue.list;
} else {
@@ -1585,51 +1196,14 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
list_add_tail(&mad_send_wr->mad_list.list, list);
}
spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
-
- if (!ret)
- return 0;
-
+ if (ret) {
ib_dma_unmap_single(mad_agent->device,
mad_send_wr->header_mapping,
- sge[1].length, DMA_TO_DEVICE);
-dma1_err:
+ sge[0].length, DMA_TO_DEVICE);
ib_dma_unmap_single(mad_agent->device,
mad_send_wr->payload_mapping,
- sge[0].length, DMA_TO_DEVICE);
- return ret;
-}
-
-/*
- * Send SA MAD that passed congestion control
- */
-static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr,
- u32 timeout_ms, u32 retries_left)
-{
- int ret;
- unsigned long flags;
- struct ib_mad_agent_private *mad_agent_priv;
-
- mad_agent_priv = mad_send_wr->mad_agent_priv;
- mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
- mad_send_wr->retries_left = retries_left;
- mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
-
- /* Reference MAD agent until send completes */
- atomic_inc(&mad_agent_priv->refcount);
- spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_add_tail(&mad_send_wr->agent_list,
- &mad_agent_priv->send_list);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-
- ret = ib_send_mad(mad_send_wr);
- if (ret < 0) {
- /* Fail send request */
- spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_del(&mad_send_wr->agent_list);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- atomic_dec(&mad_agent_priv->refcount);
+ sge[1].length, DMA_TO_DEVICE);
}
-
return ret;
}
@@ -1674,7 +1248,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
* request associated with the completion
*/
next_send_buf = send_buf->next;
- mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+ mad_send_wr->send_wr.ah = send_buf->ah;
if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
@@ -1696,33 +1270,26 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
mad_send_wr->status = IB_WC_SUCCESS;
- if (is_sa_cc_mad(mad_send_wr)) {
- mad_send_wr->is_sa_cc_mad = 1;
- ret = sa_cc_mad_send(mad_send_wr);
- if (ret < 0)
- goto error;
- } else {
- /* Reference MAD agent until send completes */
- atomic_inc(&mad_agent_priv->refcount);
- spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_add_tail(&mad_send_wr->agent_list,
- &mad_agent_priv->send_list);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ /* Reference MAD agent until send completes */
+ atomic_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- if (mad_agent_priv->agent.rmpp_version) {
- ret = ib_send_rmpp_mad(mad_send_wr);
- if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
- ret = ib_send_mad(mad_send_wr);
- } else
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_send_rmpp_mad(mad_send_wr);
+ if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
ret = ib_send_mad(mad_send_wr);
- if (ret < 0) {
- /* Fail send request */
- spin_lock_irqsave(&mad_agent_priv->lock, flags);
- list_del(&mad_send_wr->agent_list);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- atomic_dec(&mad_agent_priv->refcount);
- goto error;
- }
+ } else
+ ret = ib_send_mad(mad_send_wr);
+ if (ret < 0) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ goto error;
}
}
return 0;
@@ -1756,7 +1323,7 @@ void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
recv_wc);
priv = container_of(mad_priv_hdr, struct ib_mad_private,
header);
- kmem_cache_free(ib_mad_cache, priv);
+ kfree(priv);
}
}
EXPORT_SYMBOL(ib_free_recv_mad);
@@ -1774,7 +1341,8 @@ EXPORT_SYMBOL(ib_redirect_mad_qp);
int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
struct ib_wc *wc)
{
- printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n");
+ dev_err(&mad_agent->device->dev,
+ "ib_process_mad_wc() not implemented yet\n");
return 0;
}
EXPORT_SYMBOL(ib_process_mad_wc);
@@ -1786,7 +1354,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method,
for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
if ((*method)->agent[i]) {
- printk(KERN_ERR PFX "Method %d already in use\n", i);
+ pr_err("Method %d already in use\n", i);
return -EINVAL;
}
}
@@ -1798,8 +1366,7 @@ static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
/* Allocate management method table */
*method = kzalloc(sizeof **method, GFP_ATOMIC);
if (!*method) {
- printk(KERN_ERR PFX "No memory for "
- "ib_mad_mgmt_method_table\n");
+ pr_err("No memory for ib_mad_mgmt_method_table\n");
return -ENOMEM;
}
@@ -1843,7 +1410,7 @@ static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
}
static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
- char *oui)
+ const char *oui)
{
int i;
@@ -1894,8 +1461,8 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
/* Allocate management class table for "new" class version */
*class = kzalloc(sizeof **class, GFP_ATOMIC);
if (!*class) {
- printk(KERN_ERR PFX "No memory for "
- "ib_mad_mgmt_class_table\n");
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_class_table\n");
ret = -ENOMEM;
goto error1;
}
@@ -1961,8 +1528,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
/* Allocate mgmt vendor class table for "new" class version */
vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
if (!vendor) {
- printk(KERN_ERR PFX "No memory for "
- "ib_mad_mgmt_vendor_class_table\n");
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_vendor_class_table\n");
goto error1;
}
@@ -1972,8 +1539,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
/* Allocate table for this management vendor class */
vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
if (!vendor_class) {
- printk(KERN_ERR PFX "No memory for "
- "ib_mad_mgmt_vendor_class\n");
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_vendor_class\n");
goto error2;
}
@@ -2004,7 +1571,7 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
goto check_in_use;
}
}
- printk(KERN_ERR PFX "All OUI slots in use\n");
+ dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n");
goto error3;
check_in_use:
@@ -2074,9 +1641,9 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
/* Now, check to see if there are any methods still in use */
if (!check_method_table(method)) {
/* If not, release management method table */
- kfree(method);
- class->method_table[mgmt_class] = NULL;
- /* Any management classes left ? */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left ? */
if (!check_class_table(class)) {
/* If not, release management class table */
kfree(class);
@@ -2141,13 +1708,13 @@ out:
static struct ib_mad_agent_private *
find_mad_agent(struct ib_mad_port_private *port_priv,
- struct ib_mad *mad)
+ const struct ib_mad_hdr *mad_hdr)
{
struct ib_mad_agent_private *mad_agent = NULL;
unsigned long flags;
spin_lock_irqsave(&port_priv->reg_lock, flags);
- if (ib_response_mad(mad)) {
+ if (ib_response_mad(mad_hdr)) {
u32 hi_tid;
struct ib_mad_agent_private *entry;
@@ -2155,7 +1722,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
* Routing is based on high 32 bits of transaction ID
* of MAD.
*/
- hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
if (entry->agent.hi_tid == hi_tid) {
mad_agent = entry;
@@ -2167,45 +1734,45 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
struct ib_mad_mgmt_method_table *method;
struct ib_mad_mgmt_vendor_class_table *vendor;
struct ib_mad_mgmt_vendor_class *vendor_class;
- struct ib_vendor_mad *vendor_mad;
+ const struct ib_vendor_mad *vendor_mad;
int index;
/*
* Routing is based on version, class, and method
* For "newer" vendor MADs, also based on OUI
*/
- if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+ if (mad_hdr->class_version >= MAX_MGMT_VERSION)
goto out;
- if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+ if (!is_vendor_class(mad_hdr->mgmt_class)) {
class = port_priv->version[
- mad->mad_hdr.class_version].class;
+ mad_hdr->class_version].class;
if (!class)
goto out;
- if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >=
+ if (convert_mgmt_class(mad_hdr->mgmt_class) >=
IB_MGMT_MAX_METHODS)
goto out;
method = class->method_table[convert_mgmt_class(
- mad->mad_hdr.mgmt_class)];
+ mad_hdr->mgmt_class)];
if (method)
- mad_agent = method->agent[mad->mad_hdr.method &
+ mad_agent = method->agent[mad_hdr->method &
~IB_MGMT_METHOD_RESP];
} else {
vendor = port_priv->version[
- mad->mad_hdr.class_version].vendor;
+ mad_hdr->class_version].vendor;
if (!vendor)
goto out;
vendor_class = vendor->vendor_class[vendor_class_index(
- mad->mad_hdr.mgmt_class)];
+ mad_hdr->mgmt_class)];
if (!vendor_class)
goto out;
/* Find matching OUI */
- vendor_mad = (struct ib_vendor_mad *)mad;
+ vendor_mad = (const struct ib_vendor_mad *)mad_hdr;
index = find_vendor_oui(vendor_class, vendor_mad->oui);
if (index == -1)
goto out;
method = vendor_class->method_table[index];
if (method) {
- mad_agent = method->agent[mad->mad_hdr.method &
+ mad_agent = method->agent[mad_hdr->method &
~IB_MGMT_METHOD_RESP];
}
}
@@ -2215,9 +1782,9 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
if (mad_agent->agent.recv_handler)
atomic_inc(&mad_agent->refcount);
else {
- printk(KERN_NOTICE PFX "No receive handler for client "
- "%p on port %d\n",
- &mad_agent->agent, port_priv->port_num);
+ dev_notice(&port_priv->device->dev,
+ "No receive handler for client %p on port %d\n",
+ &mad_agent->agent, port_priv->port_num);
mad_agent = NULL;
}
}
@@ -2227,23 +1794,32 @@ out:
return mad_agent;
}
-static int validate_mad(struct ib_mad *mad, u32 qp_num)
+static int validate_mad(const struct ib_mad_hdr *mad_hdr,
+ const struct ib_mad_qp_info *qp_info,
+ bool opa)
{
int valid = 0;
+ u32 qp_num = qp_info->qp->qp_num;
/* Make sure MAD base version is understood */
- if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
- printk(KERN_ERR PFX "MAD received with unsupported base "
- "version %d\n", mad->mad_hdr.base_version);
+ if (mad_hdr->base_version != IB_MGMT_BASE_VERSION &&
+ (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) {
+ pr_err("MAD received with unsupported base version %d %s\n",
+ mad_hdr->base_version, opa ? "(opa)" : "");
goto out;
}
/* Filter SMI packets sent to other than QP0 */
- if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
- (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
if (qp_num == 0)
valid = 1;
} else {
+ /* CM attributes other than ClassPortInfo only use Send method */
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) &&
+ (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) &&
+ (mad_hdr->method != IB_MGMT_METHOD_SEND))
+ goto out;
/* Filter GSI packets sent to QP0 */
if (qp_num != 0)
valid = 1;
@@ -2253,28 +1829,29 @@ out:
return valid;
}
-static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_hdr *mad_hdr)
+static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_hdr *mad_hdr)
{
- struct ib_rmpp_mad *rmpp_mad;
+ const struct ib_rmpp_mad *rmpp_mad;
- rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
+ rmpp_mad = (const struct ib_rmpp_mad *)mad_hdr;
return !mad_agent_priv->agent.rmpp_version ||
+ !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) ||
!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
IB_MGMT_RMPP_FLAG_ACTIVE) ||
(rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
}
-static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
- struct ib_mad_recv_wc *rwc)
+static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc)
{
- return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class ==
+ return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class ==
rwc->recv_buf.mad->mad_hdr.mgmt_class;
}
-static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_send_wr_private *wr,
- struct ib_mad_recv_wc *rwc )
+static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc )
{
struct ib_ah_attr attr;
u8 send_resp, rcv_resp;
@@ -2283,8 +1860,8 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
u8 port_num = mad_agent_priv->agent.port_num;
u8 lmc;
- send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
- rcv_resp = ib_response_mad(rwc->recv_buf.mad);
+ send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad);
+ rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr);
if (send_resp == rcv_resp)
/* both requests, or both responses. GIDs different */
@@ -2309,7 +1886,7 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
((1 << lmc) - 1)));
} else {
if (ib_get_cached_gid(device, port_num,
- attr.grh.sgid_index, &sgid))
+ attr.grh.sgid_index, &sgid, NULL))
return 0;
return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
16);
@@ -2329,22 +1906,22 @@ static inline int is_direct(u8 class)
}
struct ib_mad_send_wr_private*
-ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_recv_wc *wc)
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *wc)
{
struct ib_mad_send_wr_private *wr;
- struct ib_mad *mad;
+ const struct ib_mad_hdr *mad_hdr;
- mad = (struct ib_mad *)wc->recv_buf.mad;
+ mad_hdr = &wc->recv_buf.mad->mad_hdr;
list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
- if ((wr->tid == mad->mad_hdr.tid) &&
+ if ((wr->tid == mad_hdr->tid) &&
rcv_has_same_class(wr, wc) &&
/*
* Don't check GID for direct routed MADs.
* These might have permissive LIDs.
*/
- (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ (is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
}
@@ -2354,15 +1931,15 @@ ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
* been notified that the send has completed
*/
list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
- if (is_data_mad(mad_agent_priv, wr->send_buf.mad) &&
- wr->tid == mad->mad_hdr.tid &&
+ if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+ wr->tid == mad_hdr->tid &&
wr->timeout &&
rcv_has_same_class(wr, wc) &&
/*
* Don't check GID for direct routed MADs.
* These might have permissive LIDs.
*/
- (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ (is_direct(mad_hdr->mgmt_class) ||
rcv_has_same_gid(mad_agent_priv, wr, wc)))
/* Verify request has not been canceled */
return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
@@ -2387,7 +1964,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
- if (mad_agent_priv->agent.rmpp_version) {
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
mad_recv_wc);
if (!mad_recv_wc) {
@@ -2397,168 +1974,321 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
}
/* Complete corresponding request */
- if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+ if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) {
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
if (!mad_send_wr) {
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- ib_free_recv_mad(mad_recv_wc);
- deref_mad_agent(mad_agent_priv);
- return;
- }
- ib_mark_mad_done(mad_send_wr);
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
-
- /* Defined behavior is to complete response before request */
- mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
- mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
- mad_recv_wc);
- atomic_dec(&mad_agent_priv->refcount);
+ if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
+ && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
+ & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ /* user rmpp is in effect
+ * and this is an active RMPP MAD
+ */
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent, NULL,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ } else {
+ /* not user rmpp, revert to normal behavior and
+ * drop the mad */
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ } else {
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- mad_send_wc.status = IB_WC_SUCCESS;
- mad_send_wc.vendor_err = 0;
- mad_send_wc.send_buf = &mad_send_wr->send_buf;
- ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ /* Defined behavior is to complete response before request */
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent,
+ &mad_send_wr->send_buf,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ }
} else {
- mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
mad_recv_wc);
deref_mad_agent(mad_agent_priv);
}
}
-static bool generate_unmatched_resp(struct ib_mad_private *recv,
- struct ib_mad_private *response)
+static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv,
+ const struct ib_mad_qp_info *qp_info,
+ const struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ enum smi_forward_action retsmi;
+ struct ib_smp *smp = (struct ib_smp *)recv->mad;
+
+ if (smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ response->mad_size,
+ false);
+
+ return IB_SMI_DISCARD;
+ }
+ return IB_SMI_HANDLE;
+}
+
+static bool generate_unmatched_resp(const struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ size_t *resp_len, bool opa)
{
- if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET ||
- recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) {
- memcpy(response, recv, sizeof *response);
+ const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad;
+ struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad;
+
+ if (recv_hdr->method == IB_MGMT_METHOD_GET ||
+ recv_hdr->method == IB_MGMT_METHOD_SET) {
+ memcpy(response, recv, mad_priv_size(response));
response->header.recv_wc.wc = &response->header.wc;
- response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
response->header.recv_wc.recv_buf.grh = &response->grh;
- response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
- response->mad.mad.mad_hdr.status =
- cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
- if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
- response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION;
+ resp_hdr->method = IB_MGMT_METHOD_GET_RESP;
+ resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+ if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ resp_hdr->status |= IB_SMP_DIRECTION;
+
+ if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) {
+ if (recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ *resp_len = opa_get_smp_header_size(
+ (const struct opa_smp *)recv->mad);
+ else
+ *resp_len = sizeof(struct ib_mad_hdr);
+ }
return true;
} else {
return false;
}
}
-static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
- struct ib_wc *wc)
+
+static enum smi_action
+handle_opa_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
{
+ enum smi_forward_action retsmi;
+ struct opa_smp *smp = (struct opa_smp *)recv->mad;
+
+ if (opa_smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = opa_smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (opa_smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (opa_smi_check_local_smp(smp, port_priv->device) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.opa_mad =
+ (struct opa_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ opa_smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ recv->header.wc.byte_len,
+ true);
+
+ return IB_SMI_DISCARD;
+ }
+
+ return IB_SMI_HANDLE;
+}
+
+static enum smi_action
+handle_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ int port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ bool opa)
+{
+ struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad;
+
+ if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION &&
+ mad_hdr->class_version == OPA_SMI_CLASS_VERSION)
+ return handle_opa_smi(port_priv, qp_info, wc, port_num, recv,
+ response);
+
+ return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response);
+}
+
+static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
struct ib_mad_qp_info *qp_info;
struct ib_mad_private_header *mad_priv_hdr;
struct ib_mad_private *recv, *response = NULL;
- struct ib_mad_list_head *mad_list;
struct ib_mad_agent_private *mad_agent;
int port_num;
int ret = IB_MAD_RESULT_SUCCESS;
+ size_t mad_size;
+ u16 resp_mad_pkey_index = 0;
+ bool opa;
+
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ /*
+ * Receive errors indicate that the QP has entered the error
+ * state - error handling/shutdown code will cleanup
+ */
+ return;
+ }
- mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
qp_info = mad_list->mad_queue->qp_info;
dequeue_mad(mad_list);
+ opa = rdma_cap_opa_mad(qp_info->port_priv->device,
+ qp_info->port_priv->port_num);
+
mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
mad_list);
recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
ib_dma_unmap_single(port_priv->device,
recv->header.mapping,
- sizeof(struct ib_mad_private) -
- sizeof(struct ib_mad_private_header),
+ mad_priv_dma_size(recv),
DMA_FROM_DEVICE);
/* Setup MAD receive work completion from "normal" work completion */
recv->header.wc = *wc;
recv->header.recv_wc.wc = &recv->header.wc;
- recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
- recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+
+ if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) {
+ recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
+ recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad;
recv->header.recv_wc.recv_buf.grh = &recv->grh;
if (atomic_read(&qp_info->snoop_count))
snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
/* Validate MAD */
- if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
goto out;
- response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ mad_size = recv->mad_size;
+ response = alloc_mad_private(mad_size, GFP_KERNEL);
if (!response) {
- printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory "
- "for response buffer\n");
+ dev_err(&port_priv->device->dev,
+ "%s: no memory for response buffer\n", __func__);
goto out;
}
- if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+ if (rdma_cap_ib_switch(port_priv->device))
port_num = wc->port_num;
else
port_num = port_priv->port_num;
- if (recv->mad.mad.mad_hdr.mgmt_class ==
+ if (((struct ib_mad_hdr *)recv->mad)->mgmt_class ==
IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
- enum smi_forward_action retsmi;
-
- if (smi_handle_dr_smp_recv(&recv->mad.smp,
- port_priv->device->node_type,
- port_num,
- port_priv->device->phys_port_cnt) ==
- IB_SMI_DISCARD)
+ if (handle_smi(port_priv, qp_info, wc, port_num, recv,
+ response, opa)
+ == IB_SMI_DISCARD)
goto out;
-
- retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
- if (retsmi == IB_SMI_LOCAL)
- goto local;
-
- if (retsmi == IB_SMI_SEND) { /* don't forward */
- if (smi_handle_dr_smp_send(&recv->mad.smp,
- port_priv->device->node_type,
- port_num) == IB_SMI_DISCARD)
- goto out;
-
- if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
- goto out;
- } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
- /* forward case for switches */
- memcpy(response, recv, sizeof(*response));
- response->header.recv_wc.wc = &response->header.wc;
- response->header.recv_wc.recv_buf.mad = &response->mad.mad;
- response->header.recv_wc.recv_buf.grh = &response->grh;
-
- agent_send_response(&response->mad.mad,
- &response->grh, wc,
- port_priv->device,
- smi_get_fwd_port(&recv->mad.smp),
- qp_info->qp->qp_num);
-
- goto out;
- }
}
-local:
/* Give driver "right of first refusal" on incoming MAD */
if (port_priv->device->process_mad) {
ret = port_priv->device->process_mad(port_priv->device, 0,
port_priv->port_num,
wc, &recv->grh,
- &recv->mad.mad,
- &response->mad.mad);
+ (const struct ib_mad_hdr *)recv->mad,
+ recv->mad_size,
+ (struct ib_mad_hdr *)response->mad,
+ &mad_size, &resp_mad_pkey_index);
+
+ if (opa)
+ wc->pkey_index = resp_mad_pkey_index;
+
if (ret & IB_MAD_RESULT_SUCCESS) {
if (ret & IB_MAD_RESULT_CONSUMED)
goto out;
if (ret & IB_MAD_RESULT_REPLY) {
- agent_send_response(&response->mad.mad,
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
&recv->grh, wc,
port_priv->device,
port_num,
- qp_info->qp->qp_num);
+ qp_info->qp->qp_num,
+ mad_size, opa);
goto out;
}
}
}
- mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+ mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);
if (mad_agent) {
ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
/*
@@ -2567,17 +2297,17 @@ local:
*/
recv = NULL;
} else if ((ret & IB_MAD_RESULT_SUCCESS) &&
- generate_unmatched_resp(recv, response)) {
- agent_send_response(&response->mad.mad, &recv->grh, wc,
- port_priv->device, port_num, qp_info->qp->qp_num);
+ generate_unmatched_resp(recv, response, &mad_size, opa)) {
+ agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc,
+ port_priv->device, port_num,
+ qp_info->qp->qp_num, mad_size, opa);
}
out:
/* Post another receive request for this QP */
if (response) {
ib_mad_post_receive_mads(qp_info, response);
- if (recv)
- kmem_cache_free(ib_mad_cache, recv);
+ kfree(recv);
} else
ib_mad_post_receive_mads(qp_info, recv);
}
@@ -2658,7 +2388,7 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
mad_agent_priv = mad_send_wr->mad_agent_priv;
spin_lock_irqsave(&mad_agent_priv->lock, flags);
- if (mad_agent_priv->agent.rmpp_version) {
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
if (ret == IB_RMPP_RESULT_CONSUMED)
goto done;
@@ -2688,12 +2418,9 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
mad_send_wc->status = mad_send_wr->status;
if (ret == IB_RMPP_RESULT_INTERNAL)
ib_rmpp_send_handler(mad_send_wc);
- else {
- if (mad_send_wr->is_sa_cc_mad)
- sa_cc_mad_done(get_cc_obj(mad_send_wr));
+ else
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
mad_send_wc);
- }
/* Release reference on agent taken when sending */
deref_mad_agent(mad_agent_priv);
@@ -2702,11 +2429,12 @@ done:
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
}
-static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
- struct ib_wc *wc)
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
- struct ib_mad_list_head *mad_list;
struct ib_mad_qp_info *qp_info;
struct ib_mad_queue *send_queue;
struct ib_send_wr *bad_send_wr;
@@ -2714,7 +2442,14 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
unsigned long flags;
int ret;
- mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (!ib_mad_send_error(port_priv, wc))
+ return;
+ }
+
mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
mad_list);
send_queue = mad_list->mad_queue;
@@ -2751,10 +2486,11 @@ retry:
ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
if (queued_send_wr) {
- ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,
&bad_send_wr);
if (ret) {
- printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+ dev_err(&port_priv->device->dev,
+ "ib_post_send failed: %d\n", ret);
mad_send_wr = queued_send_wr;
wc->status = IB_WC_LOC_QP_OP_ERR;
goto retry;
@@ -2778,24 +2514,15 @@ static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
}
-static void mad_error_handler(struct ib_mad_port_private *port_priv,
- struct ib_wc *wc)
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
{
- struct ib_mad_list_head *mad_list;
- struct ib_mad_qp_info *qp_info;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info;
struct ib_mad_send_wr_private *mad_send_wr;
int ret;
- /* Determine if failure was a send or receive */
- mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
- qp_info = mad_list->mad_queue->qp_info;
- if (mad_list->mad_queue == &qp_info->recv_queue)
- /*
- * Receive errors indicate that the QP has entered the error
- * state - error handling/shutdown code will cleanup
- */
- return;
-
/*
* Send errors will transition the QP to SQE - move
* QP to RTS and repost flushed work requests
@@ -2808,12 +2535,11 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
struct ib_send_wr *bad_send_wr;
mad_send_wr->retry = 0;
- ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+ ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
&bad_send_wr);
- if (ret)
- ib_mad_send_done_handler(port_priv, wc);
- } else
- ib_mad_send_done_handler(port_priv, wc);
+ if (!ret)
+ return false;
+ }
} else {
struct ib_qp_attr *attr;
@@ -2826,42 +2552,15 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv,
IB_QP_STATE | IB_QP_CUR_STATE);
kfree(attr);
if (ret)
- printk(KERN_ERR PFX "mad_error_handler - "
- "ib_modify_qp to RTS : %d\n", ret);
+ dev_err(&port_priv->device->dev,
+ "%s - ib_modify_qp to RTS: %d\n",
+ __func__, ret);
else
mark_sends_for_retry(qp_info);
}
- ib_mad_send_done_handler(port_priv, wc);
}
-}
-/*
- * IB MAD completion callback
- */
-static void ib_mad_completion_handler(struct work_struct *work)
-{
- struct ib_mad_port_private *port_priv;
- struct ib_wc wc;
-
- port_priv = container_of(work, struct ib_mad_port_private, work);
- ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
-
- while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
- if (wc.status == IB_WC_SUCCESS) {
- switch (wc.opcode) {
- case IB_WC_SEND:
- ib_mad_send_done_handler(port_priv, &wc);
- break;
- case IB_WC_RECV:
- ib_mad_recv_done_handler(port_priv, &wc);
- break;
- default:
- BUG_ON(1);
- break;
- }
- } else
- mad_error_handler(port_priv, &wc);
- }
+ return true;
}
static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
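Note on the hunks above: they drop the wc->wr_id casts and the polled completion work item in favour of per-WR completion callbacks, where each posted work request carries a struct ib_cqe whose done() handler the CQ layer invokes directly. A minimal sketch of that pattern, assuming the ib_verbs definitions this update relies on (struct ib_cqe, wr_cqe, container_of, ib_post_send); the structure and function names below are illustrative, not the driver's own:

    #include <rdma/ib_verbs.h>

    struct sketch_mad_list {
        struct list_head list;
        struct ib_cqe cqe;              /* embeds the completion callback */
    };

    static void sketch_send_done(struct ib_cq *cq, struct ib_wc *wc)
    {
        /* recover the posting context from the completed WR */
        struct sketch_mad_list *ml =
            container_of(wc->wr_cqe, struct sketch_mad_list, cqe);

        if (wc->status != IB_WC_SUCCESS)
            return;                     /* error handling elided */
        (void)ml;                       /* normal completion work goes here */
    }

    static int sketch_post(struct ib_qp *qp, struct ib_ud_wr *ud_wr,
                           struct sketch_mad_list *ml)
    {
        struct ib_send_wr *bad_wr;

        ml->cqe.done = sketch_send_done;  /* invoked by the CQ's poller */
        ud_wr->wr.wr_cqe = &ml->cqe;      /* replaces the old wr_id cast */
        return ib_post_send(qp, &ud_wr->wr, &bad_wr);
    }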
@@ -2873,7 +2572,6 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
INIT_LIST_HEAD(&cancel_list);
- cancel_sa_cc_mads(mad_agent_priv);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
&mad_agent_priv->send_list, agent_list) {
@@ -2895,8 +2593,6 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
&cancel_list, agent_list) {
mad_send_wc.send_buf = &mad_send_wr->send_buf;
list_del(&mad_send_wr->agent_list);
- if (mad_send_wr->is_sa_cc_mad)
- sa_cc_mad_done(get_cc_obj(mad_send_wr));
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
&mad_send_wc);
atomic_dec(&mad_agent_priv->refcount);
@@ -2917,7 +2613,8 @@ find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
agent_list) {
- if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) &&
+ if (is_rmpp_data_mad(mad_agent_priv,
+ mad_send_wr->send_buf.mad) &&
&mad_send_wr->send_buf == send_buf)
return mad_send_wr;
}
@@ -2936,13 +2633,7 @@ int ib_modify_mad(struct ib_mad_agent *mad_agent,
agent);
spin_lock_irqsave(&mad_agent_priv->lock, flags);
mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
- if (!mad_send_wr) {
- spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
- if (modify_sa_cc_mad(mad_agent_priv, send_buf, timeout_ms))
- return -EINVAL;
- return 0;
- }
- if (mad_send_wr->status != IB_WC_SUCCESS) {
+ if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
return -EINVAL;
}
@@ -2980,10 +2671,14 @@ static void local_completions(struct work_struct *work)
int free_mad;
struct ib_wc wc;
struct ib_mad_send_wc mad_send_wc;
+ bool opa;
mad_agent_priv =
container_of(work, struct ib_mad_agent_private, local_work);
+ opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
spin_lock_irqsave(&mad_agent_priv->lock, flags);
while (!list_empty(&mad_agent_priv->local_list)) {
local = list_entry(mad_agent_priv->local_list.next,
@@ -2993,9 +2688,11 @@ static void local_completions(struct work_struct *work)
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
free_mad = 0;
if (local->mad_priv) {
+ u8 base_version;
recv_mad_agent = local->recv_mad_agent;
if (!recv_mad_agent) {
- printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+ dev_err(&mad_agent_priv->agent.device->dev,
+ "No receive MAD agent for local completion\n");
free_mad = 1;
goto local_send_completion;
}
@@ -3005,25 +2702,35 @@ static void local_completions(struct work_struct *work)
* before request
*/
build_smp_wc(recv_mad_agent->agent.qp,
- (unsigned long) local->mad_send_wr,
+ local->mad_send_wr->send_wr.wr.wr_cqe,
be16_to_cpu(IB_LID_PERMISSIVE),
- 0, recv_mad_agent->agent.port_num, &wc);
+ local->mad_send_wr->send_wr.pkey_index,
+ recv_mad_agent->agent.port_num, &wc);
local->mad_priv->header.recv_wc.wc = &wc;
- local->mad_priv->header.recv_wc.mad_len =
- sizeof(struct ib_mad);
+
+ base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version;
+ if (opa && base_version == OPA_MGMT_BASE_VERSION) {
+ local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len;
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
&local->mad_priv->header.recv_wc.rmpp_list);
local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
local->mad_priv->header.recv_wc.recv_buf.mad =
- &local->mad_priv->mad.mad;
+ (struct ib_mad *)local->mad_priv->mad;
if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
snoop_recv(recv_mad_agent->qp_info,
&local->mad_priv->header.recv_wc,
IB_MAD_SNOOP_RECVS);
recv_mad_agent->agent.recv_handler(
&recv_mad_agent->agent,
+ &local->mad_send_wr->send_buf,
&local->mad_priv->header.recv_wc);
spin_lock_irqsave(&recv_mad_agent->lock, flags);
atomic_dec(&recv_mad_agent->refcount);
@@ -3045,7 +2752,7 @@ local_send_completion:
spin_lock_irqsave(&mad_agent_priv->lock, flags);
atomic_dec(&mad_agent_priv->refcount);
if (free_mad)
- kmem_cache_free(ib_mad_cache, local->mad_priv);
+ kfree(local->mad_priv);
kfree(local);
}
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
@@ -3063,7 +2770,7 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
- if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
+ if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
ret = ib_retry_rmpp(mad_send_wr);
switch (ret) {
case IB_RMPP_RESULT_UNHANDLED:
@@ -3126,8 +2833,6 @@ static void timeout_sends(struct work_struct *work)
else
mad_send_wc.status = mad_send_wr->status;
mad_send_wc.send_buf = &mad_send_wr->send_buf;
- if (mad_send_wr->is_sa_cc_mad)
- sa_cc_mad_done(get_cc_obj(mad_send_wr));
mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
&mad_send_wc);
@@ -3137,17 +2842,6 @@ static void timeout_sends(struct work_struct *work)
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
}
-static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
-{
- struct ib_mad_port_private *port_priv = cq->cq_context;
- unsigned long flags;
-
- spin_lock_irqsave(&ib_mad_port_list_lock, flags);
- if (!list_empty(&port_priv->port_list))
- queue_work(port_priv->wq, &port_priv->work);
- spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
-}
-
/*
* Allocate receive MADs and post receive WRs for them
*/
@@ -3162,8 +2856,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
/* Initialize common scatter list fields */
- sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
- sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
/* Initialize common receive WR fields */
recv_wr.next = NULL;
@@ -3176,29 +2869,29 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
mad_priv = mad;
mad = NULL;
} else {
- mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv),
+ GFP_ATOMIC);
if (!mad_priv) {
- printk(KERN_ERR PFX "No memory for receive buffer\n");
+ dev_err(&qp_info->port_priv->device->dev,
+ "No memory for receive buffer\n");
ret = -ENOMEM;
break;
}
}
+ sg_list.length = mad_priv_dma_size(mad_priv);
sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
&mad_priv->grh,
- sizeof *mad_priv -
- sizeof mad_priv->header,
+ mad_priv_dma_size(mad_priv),
DMA_FROM_DEVICE);
if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
sg_list.addr))) {
ret = -ENOMEM;
- kmem_cache_free(ib_mad_cache, mad_priv);
- printk(KERN_ERR PFX "ib_dma_map_single failed\n");
break;
}
-
mad_priv->header.mapping = sg_list.addr;
- recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
mad_priv->header.mad_list.mad_queue = recv_queue;
+ mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
+ recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
/* Post receive WR */
spin_lock_irqsave(&recv_queue->lock, flags);
@@ -3213,11 +2906,11 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
spin_unlock_irqrestore(&recv_queue->lock, flags);
ib_dma_unmap_single(qp_info->port_priv->device,
mad_priv->header.mapping,
- sizeof *mad_priv -
- sizeof mad_priv->header,
+ mad_priv_dma_size(mad_priv),
DMA_FROM_DEVICE);
- kmem_cache_free(ib_mad_cache, mad_priv);
- printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
+ kfree(mad_priv);
+ dev_err(&qp_info->port_priv->device->dev,
+ "ib_post_recv failed: %d\n", ret);
break;
}
} while (post);
@@ -3252,10 +2945,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
ib_dma_unmap_single(qp_info->port_priv->device,
recv->header.mapping,
- sizeof(struct ib_mad_private) -
- sizeof(struct ib_mad_private_header),
+ mad_priv_dma_size(recv),
DMA_FROM_DEVICE);
- kmem_cache_free(ib_mad_cache, recv);
+ kfree(recv);
}
qp_info->recv_queue.count = 0;
@@ -3269,16 +2961,17 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
int ret, i;
struct ib_qp_attr *attr;
struct ib_qp *qp;
- u16 pkey_index = 0;
+ u16 pkey_index;
attr = kmalloc(sizeof *attr, GFP_KERNEL);
if (!attr) {
- printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");
+ dev_err(&port_priv->device->dev,
+ "Couldn't kmalloc ib_qp_attr\n");
return -ENOMEM;
}
ret = ib_find_pkey(port_priv->device, port_priv->port_num,
- 0xFFFF, &pkey_index);
+ IB_DEFAULT_PKEY_FULL, &pkey_index);
if (ret)
pkey_index = 0;
@@ -3297,16 +2990,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
ret = ib_modify_qp(qp, attr, IB_QP_STATE |
IB_QP_PKEY_INDEX | IB_QP_QKEY);
if (ret) {
- printk(KERN_ERR PFX "Couldn't change QP%d state to "
- "INIT: %d\n", i, ret);
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to INIT: %d\n",
+ i, ret);
goto out;
}
attr->qp_state = IB_QPS_RTR;
ret = ib_modify_qp(qp, attr, IB_QP_STATE);
if (ret) {
- printk(KERN_ERR PFX "Couldn't change QP%d state to "
- "RTR: %d\n", i, ret);
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTR: %d\n",
+ i, ret);
goto out;
}
@@ -3314,16 +3009,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
attr->sq_psn = IB_MAD_SEND_Q_PSN;
ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
if (ret) {
- printk(KERN_ERR PFX "Couldn't change QP%d state to "
- "RTS: %d\n", i, ret);
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTS: %d\n",
+ i, ret);
goto out;
}
}
ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
if (ret) {
- printk(KERN_ERR PFX "Failed to request completion "
- "notification: %d\n", ret);
+ dev_err(&port_priv->device->dev,
+ "Failed to request completion notification: %d\n",
+ ret);
goto out;
}
@@ -3333,7 +3030,8 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
if (ret) {
- printk(KERN_ERR PFX "Couldn't post receive WRs\n");
+ dev_err(&port_priv->device->dev,
+ "Couldn't post receive WRs\n");
goto out;
}
}
@@ -3347,7 +3045,8 @@ static void qp_event_handler(struct ib_event *event, void *qp_context)
struct ib_mad_qp_info *qp_info = qp_context;
/* It's worse than that! He's dead, Jim! */
- printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+ dev_err(&qp_info->port_priv->device->dev,
+ "Fatal error (%d) on MAD QP (%d)\n",
event->event, qp_info->qp->qp_num);
}
@@ -3393,8 +3092,9 @@ static int create_mad_qp(struct ib_mad_qp_info *qp_info,
qp_init_attr.event_handler = qp_event_handler;
qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
if (IS_ERR(qp_info->qp)) {
- printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
- get_spl_qp_index(qp_type));
+ dev_err(&qp_info->port_priv->device->dev,
+ "Couldn't create ib_mad QP%d\n",
+ get_spl_qp_index(qp_type));
ret = PTR_ERR(qp_info->qp);
goto error;
}
@@ -3429,10 +3129,17 @@ static int ib_mad_port_open(struct ib_device *device,
char name[sizeof "ib_mad123"];
int has_smi;
+ if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
+ return -EFAULT;
+
+ if (WARN_ON(rdma_cap_opa_mad(device, port_num) &&
+ rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE))
+ return -EFAULT;
+
/* Create new device info */
port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
if (!port_priv) {
- printk(KERN_ERR PFX "No memory for ib_mad_port_private\n");
+ dev_err(&device->dev, "No memory for ib_mad_port_private\n");
return -ENOMEM;
}
@@ -3444,33 +3151,25 @@ static int ib_mad_port_open(struct ib_device *device,
init_mad_qp(port_priv, &port_priv->qp_info[1]);
cq_size = mad_sendq_size + mad_recvq_size;
- has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND;
+ has_smi = rdma_cap_ib_smi(device, port_num);
if (has_smi)
cq_size *= 2;
- port_priv->cq = ib_create_cq(port_priv->device,
- ib_mad_thread_completion_handler,
- NULL, port_priv, cq_size, 0);
+ port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(port_priv->cq)) {
- printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n");
+ dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
ret = PTR_ERR(port_priv->cq);
goto error3;
}
- port_priv->pd = ib_alloc_pd(device);
+ port_priv->pd = ib_alloc_pd(device, 0);
if (IS_ERR(port_priv->pd)) {
- printk(KERN_ERR PFX "Couldn't create ib_mad PD\n");
+ dev_err(&device->dev, "Couldn't create ib_mad PD\n");
ret = PTR_ERR(port_priv->pd);
goto error4;
}
- port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(port_priv->mr)) {
- printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n");
- ret = PTR_ERR(port_priv->mr);
- goto error5;
- }
-
if (has_smi) {
ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
if (ret)
@@ -3481,16 +3180,11 @@ static int ib_mad_port_open(struct ib_device *device,
goto error7;
snprintf(name, sizeof name, "ib_mad%d", port_num);
- port_priv->wq = create_singlethread_workqueue(name);
+ port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
if (!port_priv->wq) {
ret = -ENOMEM;
goto error8;
}
- INIT_WORK(&port_priv->work, ib_mad_completion_handler);
-
- if (sa_cc_init(&port_priv->sa_cc))
- goto error9;
-
spin_lock_irqsave(&ib_mad_port_list_lock, flags);
list_add_tail(&port_priv->port_list, &ib_mad_port_list);
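The hunk above also swaps ib_create_cq() plus the private completion handler for ib_alloc_cq() with a workqueue poll context, and drops the dedicated DMA MR in favour of the PD's local_dma_lkey. A trimmed sketch of that resource setup, assuming the newer ib_alloc_cq()/ib_alloc_pd() signatures imported by this change; the error labels and the surrounding port structure are omitted:

    static int sketch_port_resources(struct ib_device *dev, void *cq_context,
                                     int cq_size, struct ib_cq **cq,
                                     struct ib_pd **pd, u32 *lkey)
    {
        *cq = ib_alloc_cq(dev, cq_context, cq_size, 0, IB_POLL_WORKQUEUE);
        if (IS_ERR(*cq))
            return PTR_ERR(*cq);

        *pd = ib_alloc_pd(dev, 0);
        if (IS_ERR(*pd)) {
            ib_free_cq(*cq);
            return PTR_ERR(*pd);
        }

        /* receive SGEs use the PD's built-in lkey, no ib_get_dma_mr() needed */
        *lkey = (*pd)->local_dma_lkey;
        return 0;
    }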
@@ -3498,30 +3192,26 @@ static int ib_mad_port_open(struct ib_device *device,
ret = ib_mad_port_start(port_priv);
if (ret) {
- printk(KERN_ERR PFX "Couldn't start port\n");
- goto error10;
+ dev_err(&device->dev, "Couldn't start port\n");
+ goto error9;
}
return 0;
-error10:
+error9:
spin_lock_irqsave(&ib_mad_port_list_lock, flags);
list_del_init(&port_priv->port_list);
spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
destroy_workqueue(port_priv->wq);
-error9:
- sa_cc_destroy(&port_priv->sa_cc);
error8:
destroy_mad_qp(&port_priv->qp_info[1]);
error7:
destroy_mad_qp(&port_priv->qp_info[0]);
error6:
- ib_dereg_mr(port_priv->mr);
-error5:
ib_dealloc_pd(port_priv->pd);
error4:
- ib_destroy_cq(port_priv->cq);
+ ib_free_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
cleanup_recv_queue(&port_priv->qp_info[0]);
error3:
@@ -3544,19 +3234,17 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
port_priv = __ib_get_mad_port(device, port_num);
if (port_priv == NULL) {
spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
- printk(KERN_ERR PFX "Port %d not found\n", port_num);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
return -ENODEV;
}
list_del_init(&port_priv->port_list);
spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
destroy_workqueue(port_priv->wq);
- sa_cc_destroy(&port_priv->sa_cc);
destroy_mad_qp(&port_priv->qp_info[1]);
destroy_mad_qp(&port_priv->qp_info[0]);
- ib_dereg_mr(port_priv->mr);
ib_dealloc_pd(port_priv->pd);
- ib_destroy_cq(port_priv->cq);
+ ib_free_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
cleanup_recv_queue(&port_priv->qp_info[0]);
/* XXX: Handle deallocation of MAD registration tables */
@@ -3568,29 +3256,21 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
static void ib_mad_init_device(struct ib_device *device)
{
- int start, end, i;
+ int start, i;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
+ start = rdma_start_port(device);
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- start = 0;
- end = 0;
- } else {
- start = 1;
- end = device->phys_port_cnt;
- }
+ for (i = start; i <= rdma_end_port(device); i++) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
- for (i = start; i <= end; i++) {
if (ib_mad_port_open(device, i)) {
- printk(KERN_ERR PFX "Couldn't open %s port %d\n",
- device->name, i);
+ dev_err(&device->dev, "Couldn't open port %d\n", i);
goto error;
}
if (ib_agent_port_open(device, i)) {
- printk(KERN_ERR PFX "Couldn't open %s port %d "
- "for agents\n",
- device->name, i);
+ dev_err(&device->dev,
+ "Couldn't open port %d for agents\n", i);
goto error_agent;
}
}
@@ -3598,46 +3278,34 @@ static void ib_mad_init_device(struct ib_device *device)
error_agent:
if (ib_mad_port_close(device, i))
- printk(KERN_ERR PFX "Couldn't close %s port %d\n",
- device->name, i);
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
error:
- i--;
+ while (--i >= start) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
- while (i >= start) {
if (ib_agent_port_close(device, i))
- printk(KERN_ERR PFX "Couldn't close %s port %d "
- "for agents\n",
- device->name, i);
+ dev_err(&device->dev,
+ "Couldn't close port %d for agents\n", i);
if (ib_mad_port_close(device, i))
- printk(KERN_ERR PFX "Couldn't close %s port %d\n",
- device->name, i);
- i--;
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
}
}
-static void ib_mad_remove_device(struct ib_device *device)
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
{
- int i, num_ports, cur_port;
+ int i;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
+ for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- num_ports = 1;
- cur_port = 0;
- } else {
- num_ports = device->phys_port_cnt;
- cur_port = 1;
- }
- for (i = 0; i < num_ports; i++, cur_port++) {
- if (ib_agent_port_close(device, cur_port))
- printk(KERN_ERR PFX "Couldn't close %s port %d "
- "for agents\n",
- device->name, cur_port);
- if (ib_mad_port_close(device, cur_port))
- printk(KERN_ERR PFX "Couldn't close %s port %d\n",
- device->name, cur_port);
+ if (ib_agent_port_close(device, i))
+ dev_err(&device->dev,
+ "Couldn't close port %d for agents\n", i);
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
}
}
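Both the add and remove paths above now walk ports the same way: iterate from rdma_start_port() to rdma_end_port() and skip any port without MAD support, instead of special-casing switch nodes. A compact sketch of that idiom; the callback is a stand-in for the per-port open/close work:

    static void sketch_for_each_mad_port(struct ib_device *device,
                                         void (*fn)(struct ib_device *, u8))
    {
        u8 p;

        for (p = rdma_start_port(device); p <= rdma_end_port(device); p++) {
            if (!rdma_cap_ib_mad(device, p))
                continue;               /* port has no MAD capability */
            fn(device, p);
        }
    }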
@@ -3647,48 +3315,25 @@ static struct ib_client mad_client = {
.remove = ib_mad_remove_device
};
-static int __init ib_mad_init_module(void)
+int ib_mad_init(void)
{
- int ret;
-
mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
- ib_mad_cache = kmem_cache_create("ib_mad",
- sizeof(struct ib_mad_private),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!ib_mad_cache) {
- printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
- ret = -ENOMEM;
- goto error1;
- }
-
INIT_LIST_HEAD(&ib_mad_port_list);
if (ib_register_client(&mad_client)) {
- printk(KERN_ERR PFX "Couldn't register ib_mad client\n");
- ret = -EINVAL;
- goto error2;
+ pr_err("Couldn't register ib_mad client\n");
+ return -EINVAL;
}
return 0;
-
-error2:
- kmem_cache_destroy(ib_mad_cache);
-error1:
- return ret;
}
-static void __exit ib_mad_cleanup_module(void)
+void ib_mad_cleanup(void)
{
ib_unregister_client(&mad_client);
- kmem_cache_destroy(ib_mad_cache);
}
-
-module_init(ib_mad_init_module);
-module_exit(ib_mad_cleanup_module);
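Several hunks above call alloc_mad_private(), port_mad_size() and mad_priv_dma_size(), helpers defined in a part of mad.c this excerpt does not show. The bodies below are assumptions reconstructed from the call sites and from the flexible-array ib_mad_private introduced in the mad_priv.h hunk that follows; treat them as a sketch rather than the file's exact code:

    static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags)
    {
        struct ib_mad_private *mp;

        /* header + GRH + variable-size MAD payload in one allocation */
        mp = kzalloc(sizeof(*mp) + mad_size, flags);
        if (mp)
            mp->mad_size = mad_size;
        return mp;
    }

    static size_t port_mad_size(const struct ib_mad_port_private *port_priv)
    {
        /* the device's maximum MAD size (larger on OPA-capable ports) */
        return rdma_max_mad_size(port_priv->device, port_priv->port_num);
    }

    static size_t mad_priv_dma_size(const struct ib_mad_private *mp)
    {
        /* only the GRH and MAD payload get DMA-mapped, not the header */
        return sizeof(struct ib_grh) + mp->mad_size;
    }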
diff --git a/sys/ofed/drivers/infiniband/core/mad_priv.h b/sys/ofed/drivers/infiniband/core/mad_priv.h
index e2cd0ac0a5148..28669f6419e1f 100644
--- a/sys/ofed/drivers/infiniband/core/mad_priv.h
+++ b/sys/ofed/drivers/infiniband/core/mad_priv.h
@@ -41,9 +41,7 @@
#include <linux/workqueue.h>
#include <rdma/ib_mad.h>
#include <rdma/ib_smi.h>
-
-
-#define PFX "ib_mad: "
+#include <rdma/opa_smi.h>
#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
@@ -59,13 +57,14 @@
/* Registration table sizes */
#define MAX_MGMT_CLASS 80
-#define MAX_MGMT_VERSION 8
+#define MAX_MGMT_VERSION 0x83
#define MAX_MGMT_OUI 8
#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
struct ib_mad_list_head {
struct list_head list;
+ struct ib_cqe cqe;
struct ib_mad_queue *mad_queue;
};
@@ -78,12 +77,9 @@ struct ib_mad_private_header {
struct ib_mad_private {
struct ib_mad_private_header header;
+ size_t mad_size;
struct ib_grh grh;
- union {
- struct ib_mad mad;
- struct ib_rmpp_mad rmpp_mad;
- struct ib_smp smp;
- } mad;
+ u8 mad[0];
} __attribute__ ((packed));
struct ib_rmpp_segment {
@@ -121,14 +117,6 @@ struct ib_mad_snoop_private {
struct completion comp;
};
-/* Structure for timeout-fifo entry */
-struct tf_entry {
- unsigned long exp_time; /* entry expiration time */
- struct list_head fifo_list; /* to keep entries in fifo order */
- struct list_head to_list; /* to keep entries in timeout order */
- int canceled; /* indicates whether entry is canceled */
-};
-
struct ib_mad_send_wr_private {
struct ib_mad_list_head mad_list;
struct list_head agent_list;
@@ -136,7 +124,7 @@ struct ib_mad_send_wr_private {
struct ib_mad_send_buf send_buf;
u64 header_mapping;
u64 payload_mapping;
- struct ib_send_wr send_wr;
+ struct ib_ud_wr send_wr;
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
unsigned long timeout;
@@ -154,10 +142,6 @@ struct ib_mad_send_wr_private {
int seg_num;
int newwin;
int pad;
-
- /* SA congestion controlled MAD */
- int is_sa_cc_mad;
- struct tf_entry tf_list;
};
struct ib_mad_local_private {
@@ -165,6 +149,7 @@ struct ib_mad_local_private {
struct ib_mad_private *mad_priv;
struct ib_mad_agent_private *recv_mad_agent;
struct ib_mad_send_wr_private *mad_send_wr;
+ size_t return_wc_byte_len;
};
struct ib_mad_mgmt_method_table {
@@ -209,47 +194,25 @@ struct ib_mad_qp_info {
atomic_t snoop_count;
};
-struct to_fifo {
- struct list_head to_head;
- struct list_head fifo_head;
- spinlock_t lists_lock;
- struct timer_list timer;
- struct work_struct work;
- u32 fifo_size;
- u32 num_items;
- int stop_enqueue;
- struct workqueue_struct *workq;
-};
-
-/* SA congestion control data */
-struct sa_cc_data {
- spinlock_t lock;
- unsigned long outstanding;
- struct to_fifo *tf;
-};
-
struct ib_mad_port_private {
struct list_head port_list;
struct ib_device *device;
int port_num;
struct ib_cq *cq;
struct ib_pd *pd;
- struct ib_mr *mr;
spinlock_t reg_lock;
struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
struct list_head agent_list;
struct workqueue_struct *wq;
- struct work_struct work;
struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
- struct sa_cc_data sa_cc;
};
int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
struct ib_mad_send_wr_private *
-ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
- struct ib_mad_recv_wc *mad_recv_wc);
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *mad_recv_wc);
void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
struct ib_mad_send_wc *mad_send_wc);
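The send_wr type change above is what makes the earlier mad.c hunks post &mad_send_wr->send_wr.wr: the UD work request wraps the generic ib_send_wr as its first member and carries the UD-only fields alongside it. A trimmed sketch of that layout; the field set is reduced and the names are illustrative, the real ib_ud_wr in the imported headers has a few more members:

    struct sketch_ud_wr {
        struct ib_send_wr wr;   /* generic part handed to ib_post_send() */
        struct ib_ah *ah;       /* UD address handle */
        u32 remote_qpn;
        u32 remote_qkey;
        u16 pkey_index;         /* also reused for local completions above */
        u8 port_num;
    };

    static inline struct sketch_ud_wr *sketch_to_ud_wr(struct ib_send_wr *wr)
    {
        /* recover the UD wrapper from the embedded base WR */
        return container_of(wr, struct sketch_ud_wr, wr);
    }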
diff --git a/sys/ofed/drivers/infiniband/core/mad_rmpp.c b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
index f37878c9c06eb..382941b46e43a 100644
--- a/sys/ofed/drivers/infiniband/core/mad_rmpp.c
+++ b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2005 Intel Inc. All rights reserved.
* Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -67,6 +68,7 @@ struct mad_rmpp_recv {
u8 mgmt_class;
u8 class_version;
u8 method;
+ u8 base_version;
};
static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
@@ -139,7 +141,8 @@ static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
recv_wc->wc->pkey_index, 1, hdr_len,
- 0, GFP_KERNEL);
+ 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(msg))
return;
@@ -165,7 +168,8 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
recv_wc->wc->pkey_index, 1,
- hdr_len, 0, GFP_KERNEL);
+ hdr_len, 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(msg))
ib_destroy_ah(ah);
else {
@@ -316,6 +320,7 @@ create_rmpp_recv(struct ib_mad_agent_private *agent,
rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
rmpp_recv->class_version = mad_hdr->class_version;
rmpp_recv->method = mad_hdr->method;
+ rmpp_recv->base_version = mad_hdr->base_version;
return rmpp_recv;
error: kfree(rmpp_recv);
@@ -431,14 +436,23 @@ static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
{
struct ib_rmpp_mad *rmpp_mad;
int hdr_size, data_size, pad;
+ bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device,
+ rmpp_recv->agent->qp_info->port_priv->port_num);
rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
- data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
- pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
- if (pad > IB_MGMT_RMPP_DATA || pad < 0)
- pad = 0;
+ if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) {
+ data_size = sizeof(struct opa_rmpp_mad) - hdr_size;
+ pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > OPA_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ } else {
+ data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+ pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ }
return hdr_size + rmpp_recv->seg_num * data_size - pad;
}
@@ -570,13 +584,14 @@ static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
if (mad_send_wr->seg_num == 1) {
rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
- paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
- mad_send_wr->pad;
+ paylen = (mad_send_wr->send_buf.seg_count *
+ mad_send_wr->send_buf.seg_rmpp_size) -
+ mad_send_wr->pad;
}
if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
- paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+ paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad;
}
rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
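The RMPP arithmetic changed above boils down to taking the per-segment payload capacity from send_buf.seg_rmpp_size (the IB or OPA capacity, depending on the MAD's base version) instead of the hard-coded IB value. A minimal sketch of how the PayloadLength fields derive from it; the helper names are illustrative:

    /* first segment advertises the whole transfer */
    static u32 sketch_first_seg_paylen(int seg_count, int seg_rmpp_size, int pad)
    {
        return seg_count * seg_rmpp_size - pad;
    }

    /* last segment covers only its own data */
    static u32 sketch_last_seg_paylen(int seg_rmpp_size, int pad)
    {
        return seg_rmpp_size - pad;
    }

    /* receive side: reassembled length after seg_num segments */
    static int sketch_recv_mad_len(int hdr_size, int seg_num, int data_size,
                                   int pad)
    {
        return hdr_size + seg_num * data_size - pad;
    }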
diff --git a/sys/ofed/drivers/infiniband/core/multicast.c b/sys/ofed/drivers/infiniband/core/multicast.c
index 4df22eea1dad2..1fa90d427d697 100644
--- a/sys/ofed/drivers/infiniband/core/multicast.c
+++ b/sys/ofed/drivers/infiniband/core/multicast.c
@@ -36,29 +36,16 @@
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/interrupt.h>
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/random.h>
-#include <linux/moduleparam.h>
#include <linux/rbtree.h>
#include <rdma/ib_cache.h>
#include "sa.h"
-static int mcast_leave_retries = 3;
-
-/*static const struct kernel_param_ops retry_ops = {
- .set = param_set_int,
- .get = param_get_int,
-};
-
-module_param_cb(mcast_leave_retries, &retry_ops, &mcast_leave_retries, 0644);
-MODULE_PARM_DESC(mcast_leave_retries, "Number of retries for multicast leave "
- "requests before giving up (default: 3)");
-*/
static void mcast_add_one(struct ib_device *device);
-static void mcast_remove_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
static struct ib_client mcast_client = {
.name = "ib_multicast",
@@ -117,11 +104,10 @@ struct mcast_group {
struct list_head pending_list;
struct list_head active_list;
struct mcast_member *last_join;
- int members[3];
+ int members[NUM_JOIN_MEMBERSHIP_TYPES];
atomic_t refcount;
enum mcast_group_state state;
struct ib_sa_query *query;
- int query_id;
u16 pkey_index;
u8 leave_state;
int retries;
@@ -235,8 +221,9 @@ static void queue_join(struct mcast_member *member)
}
/*
- * A multicast group has three types of members: full member, non member, and
- * send only member. We need to keep track of the number of members of each
+ * A multicast group has four types of members: full member, non member,
+ * sendonly non member and sendonly full member.
+ * We need to keep track of the number of members of each
 * type based on their join state. Adjust the number of members that belong to
* the specified join states.
*/
@@ -244,7 +231,7 @@ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
{
int i;
- for (i = 0; i < 3; i++, join_state >>= 1)
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1)
if (join_state & 0x1)
group->members[i] += inc;
}
@@ -260,7 +247,7 @@ static u8 get_leave_state(struct mcast_group *group)
u8 leave_state = 0;
int i;
- for (i = 0; i < 3; i++)
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++)
if (!group->members[i])
leave_state |= (0x1 << i);
@@ -308,8 +295,8 @@ static int cmp_rec(struct ib_sa_mcmember_rec *src,
if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
return -EINVAL;
if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
- IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
- src->mtu, dst->mtu))
+ IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+ src->mtu, dst->mtu))
return -EINVAL;
if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
src->traffic_class != dst->traffic_class)
@@ -317,14 +304,14 @@ static int cmp_rec(struct ib_sa_mcmember_rec *src,
if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
return -EINVAL;
if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
- IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
- src->rate, dst->rate))
+ IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+ src->rate, dst->rate))
return -EINVAL;
if (check_selector(comp_mask,
- IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
- IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
- dst->packet_life_time_selector,
- src->packet_life_time, dst->packet_life_time))
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ dst->packet_life_time_selector,
+ src->packet_life_time, dst->packet_life_time))
return -EINVAL;
if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
return -EINVAL;
@@ -354,11 +341,7 @@ static int send_join(struct mcast_group *group, struct mcast_member *member)
member->multicast.comp_mask,
3000, GFP_KERNEL, join_handler, group,
&group->query);
- if (ret >= 0) {
- group->query_id = ret;
- ret = 0;
- }
- return ret;
+ return (ret > 0) ? 0 : ret;
}
static int send_leave(struct mcast_group *group, u8 leave_state)
@@ -378,11 +361,7 @@ static int send_leave(struct mcast_group *group, u8 leave_state)
IB_SA_MCMEMBER_REC_JOIN_STATE,
3000, GFP_KERNEL, leave_handler,
group, &group->query);
- if (ret >= 0) {
- group->query_id = ret;
- ret = 0;
- }
- return ret;
+ return (ret > 0) ? 0 : ret;
}
static void join_group(struct mcast_group *group, struct mcast_member *member,
@@ -540,17 +519,22 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
if (status)
process_join_error(group, status);
else {
+ int mgids_changed, is_mgid0;
ib_find_pkey(group->port->dev->device, group->port->port_num,
be16_to_cpu(rec->pkey), &pkey_index);
spin_lock_irq(&group->port->lock);
- group->rec = *rec;
if (group->state == MCAST_BUSY &&
group->pkey_index == MCAST_INVALID_PKEY_INDEX)
group->pkey_index = pkey_index;
- if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
+ mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
+ sizeof(group->rec.mgid));
+ group->rec = *rec;
+ if (mgids_changed) {
rb_erase(&group->node, &group->port->table);
- mcast_insert(group->port, group, 1);
+ is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+ sizeof(mgid0));
+ mcast_insert(group->port, group, is_mgid0);
}
spin_unlock_irq(&group->port->lock);
}
@@ -565,12 +549,8 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
if (status && group->retries > 0 &&
!send_leave(group, group->leave_state))
group->retries--;
- else {
- if (status && group->retries <= 0)
- printk(KERN_WARNING "reached max retry count. "
- "status=%d. Giving up\n", status);
+ else
mcast_work_handler(&group->work);
- }
}
static struct mcast_group *acquire_group(struct mcast_port *port,
@@ -593,7 +573,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port,
if (!group)
return NULL;
- group->retries = mcast_leave_retries;
+ group->retries = 3;
group->port = port;
group->rec.mgid = *mgid;
group->pkey_index = MCAST_INVALID_PKEY_INDEX;
@@ -737,13 +717,27 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
struct ib_sa_mcmember_rec *rec,
+ struct net_device *ndev,
+ enum ib_gid_type gid_type,
struct ib_ah_attr *ah_attr)
{
int ret;
u16 gid_index;
u8 p;
- ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
+ if (rdma_protocol_roce(device, port_num)) {
+ ret = ib_find_cached_gid_by_port(device, &rec->port_gid,
+ gid_type, port_num,
+ ndev,
+ &gid_index);
+ } else if (rdma_protocol_ib(device, port_num)) {
+ ret = ib_find_cached_gid(device, &rec->port_gid,
+ IB_GID_TYPE_IB, NULL, &p,
+ &gid_index);
+ } else {
+ ret = -EINVAL;
+ }
+
if (ret)
return ret;
@@ -794,8 +788,7 @@ static void mcast_event_handler(struct ib_event_handler *handler,
int index;
dev = container_of(handler, struct mcast_device, event_handler);
- if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
- IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_mcast(dev->device, event->element.port_num))
return;
index = event->element.port_num - dev->start_port;
@@ -803,6 +796,7 @@ static void mcast_event_handler(struct ib_event_handler *handler,
switch (event->event) {
case IB_EVENT_PORT_ERR:
case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
case IB_EVENT_CLIENT_REREGISTER:
mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
break;
@@ -821,24 +815,16 @@ static void mcast_add_one(struct ib_device *device)
int i;
int count = 0;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
GFP_KERNEL);
if (!dev)
return;
- if (device->node_type == RDMA_NODE_IB_SWITCH)
- dev->start_port = dev->end_port = 0;
- else {
- dev->start_port = 1;
- dev->end_port = device->phys_port_cnt;
- }
+ dev->start_port = rdma_start_port(device);
+ dev->end_port = rdma_end_port(device);
for (i = 0; i <= dev->end_port - dev->start_port; i++) {
- if (rdma_port_get_link_layer(device, dev->start_port + i) !=
- IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_mcast(device, dev->start_port + i))
continue;
port = &dev->port[i];
port->dev = dev;
@@ -862,13 +848,12 @@ static void mcast_add_one(struct ib_device *device)
ib_register_event_handler(&dev->event_handler);
}
-static void mcast_remove_one(struct ib_device *device)
+static void mcast_remove_one(struct ib_device *device, void *client_data)
{
- struct mcast_device *dev;
+ struct mcast_device *dev = client_data;
struct mcast_port *port;
int i;
- dev = ib_get_client_data(device, &mcast_client);
if (!dev)
return;
@@ -876,8 +861,7 @@ static void mcast_remove_one(struct ib_device *device)
flush_workqueue(mcast_wq);
for (i = 0; i <= dev->end_port - dev->start_port; i++) {
- if (rdma_port_get_link_layer(device, dev->start_port + i) ==
- IB_LINK_LAYER_INFINIBAND) {
+ if (rdma_cap_ib_mcast(device, dev->start_port + i)) {
port = &dev->port[i];
deref_port(port);
wait_for_completion(&port->comp);
@@ -891,7 +875,7 @@ int mcast_init(void)
{
int ret;
- mcast_wq = create_singlethread_workqueue("ib_mcast");
+ mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM);
if (!mcast_wq)
return -ENOMEM;
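For reference on the fourth membership type the multicast hunks start counting: join_state is a bit mask with one counter per bit, which is why the arrays above grow from 3 to NUM_JOIN_MEMBERSHIP_TYPES entries. The enum below names the bits as this change appears to assume them; the identifiers are illustrative, not taken from the headers:

    enum sketch_join_state_bits {
        SKETCH_FULL_MEMBER          = 1 << 0,
        SKETCH_NON_MEMBER           = 1 << 1,
        SKETCH_SENDONLY_NON_MEMBER  = 1 << 2,
        SKETCH_SENDONLY_FULL_MEMBER = 1 << 3,   /* the newly tracked type */
    };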
diff --git a/sys/ofed/drivers/infiniband/core/opa_smi.h b/sys/ofed/drivers/infiniband/core/opa_smi.h
new file mode 100644
index 0000000000000..3bfab3505a291
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/opa_smi.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __OPA_SMI_H_
+#define __OPA_SMI_H_
+
+#include <rdma/ib_smi.h>
+#include <rdma/opa_smi.h>
+
+#include "smi.h"
+
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ int port_num, int phys_port_cnt);
+int opa_smi_get_fwd_port(struct opa_smp *smp);
+extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp);
+extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return (device->process_mad &&
+ !opa_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return (device->process_mad &&
+ opa_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+#endif /* __OPA_SMI_H_ */
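A small usage sketch for the helpers added above, showing how a receive path might decide whether an OPA directed-route SMP terminates at the local SMA/SM; the smp and device pointers are assumed to come from the MAD receive handling shown earlier:

    static bool sketch_opa_smp_is_local(struct opa_smp *smp,
                                        struct ib_device *device)
    {
        return opa_smi_check_local_smp(smp, device) == IB_SMI_HANDLE ||
               opa_smi_check_local_returning_smp(smp, device) == IB_SMI_HANDLE;
    }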
diff --git a/sys/ofed/drivers/infiniband/core/packer.c b/sys/ofed/drivers/infiniband/core/packer.c
index 9f42595a165cd..31c566be7732d 100644
--- a/sys/ofed/drivers/infiniband/core/packer.c
+++ b/sys/ofed/drivers/infiniband/core/packer.c
@@ -31,7 +31,6 @@
* SOFTWARE.
*/
-#include <linux/module.h>
#include <linux/string.h>
#include <rdma/ib_pack.h>
@@ -39,12 +38,12 @@
static u64 value_read(int offset, int size, void *structure)
{
switch (size) {
- case 1: return *(u8 *) (structure + offset);
- case 2: return be16_to_cpup((__be16 *) (structure + offset));
- case 4: return be32_to_cpup((__be32 *) (structure + offset));
- case 8: return be64_to_cpup((__be64 *) (structure + offset));
+ case 1: return *(u8 *) ((char *)structure + offset);
+ case 2: return be16_to_cpup((__be16 *) ((char *)structure + offset));
+ case 4: return be32_to_cpup((__be32 *) ((char *)structure + offset));
+ case 8: return be64_to_cpup((__be64 *) ((char *)structure + offset));
default:
- printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ pr_warn("Field size %d bits not handled\n", size * 8);
return 0;
}
}
@@ -104,18 +103,17 @@ void ib_pack(const struct ib_field *desc,
} else {
if (desc[i].offset_bits % 8 ||
desc[i].size_bits % 8) {
- printk(KERN_WARNING "Structure field %s of size %d "
- "bits is not byte-aligned\n",
- desc[i].field_name, desc[i].size_bits);
+ pr_warn("Structure field %s of size %d bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
}
if (desc[i].struct_size_bytes)
- memcpy(buf + desc[i].offset_words * 4 +
+ memcpy((char *)buf + desc[i].offset_words * 4 +
desc[i].offset_bits / 8,
- structure + desc[i].struct_offset_bytes,
+ (char *)structure + desc[i].struct_offset_bytes,
desc[i].size_bits / 8);
else
- memset(buf + desc[i].offset_words * 4 +
+ memset((char *)buf + desc[i].offset_words * 4 +
desc[i].offset_bits / 8,
0,
desc[i].size_bits / 8);
@@ -127,12 +125,12 @@ EXPORT_SYMBOL(ib_pack);
static void value_write(int offset, int size, u64 val, void *structure)
{
switch (size * 8) {
- case 8: *( u8 *) (structure + offset) = val; break;
- case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
- case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
- case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+ case 8: *( u8 *) ((char *)structure + offset) = val; break;
+ case 16: *(__be16 *) ((char *)structure + offset) = cpu_to_be16(val); break;
+ case 32: *(__be32 *) ((char *)structure + offset) = cpu_to_be32(val); break;
+ case 64: *(__be64 *) ((char *)structure + offset) = cpu_to_be64(val); break;
default:
- printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ pr_warn("Field size %d bits not handled\n", size * 8);
}
}
@@ -188,13 +186,12 @@ void ib_unpack(const struct ib_field *desc,
} else {
if (desc[i].offset_bits % 8 ||
desc[i].size_bits % 8) {
- printk(KERN_WARNING "Structure field %s of size %d "
- "bits is not byte-aligned\n",
- desc[i].field_name, desc[i].size_bits);
+ pr_warn("Structure field %s of size %d bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
}
- memcpy(structure + desc[i].struct_offset_bytes,
- buf + desc[i].offset_words * 4 +
+ memcpy((char *)structure + desc[i].struct_offset_bytes,
+ (char *)buf + desc[i].offset_words * 4 +
desc[i].offset_bits / 8,
desc[i].size_bits / 8);
}
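The casts added throughout packer.c exist because the offsets are byte offsets and arithmetic on void * is a GNU extension rather than standard C; stepping through a char * keeps the byte math explicit and portable. A one-function illustration of the idiom, with a hypothetical helper name:

    static __be32 sketch_read_be32(const void *structure, int byte_offset)
    {
        /* advance in bytes explicitly, then reinterpret that location */
        return *(const __be32 *)((const char *)structure + byte_offset);
    }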
diff --git a/sys/ofed/drivers/infiniband/core/peer_mem.c b/sys/ofed/drivers/infiniband/core/peer_mem.c
deleted file mode 100644
index cd716a461ecc1..0000000000000
--- a/sys/ofed/drivers/infiniband/core/peer_mem.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * Copyright (c) 2013, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_peer_mem.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_umem.h>
-
-static DEFINE_MUTEX(peer_memory_mutex);
-static LIST_HEAD(peer_memory_list);
-
-static int num_registered_peers;
-
-/* This code uses the sysfs which is not supporeted by the FreeBSD.
- * * Will be added in future to the sysctl */
-
-#if 0
-static struct kobject *peers_kobj;
-static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj);
-static ssize_t version_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
-
- if (ib_peer_client) {
- sprintf(buf, "%s\n", ib_peer_client->peer_mem->version);
- return strlen(buf);
- }
- /* not found - nothing is return */
- return 0;
-}
-
-static ssize_t num_alloc_mrs_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
-
- if (ib_peer_client) {
- sprintf(buf, "%lu\n", ib_peer_client->stats.num_alloc_mrs);
- return strlen(buf);
- }
- /* not found - nothing is return */
- return 0;
-}
-
-static ssize_t num_reg_pages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
-
- if (ib_peer_client) {
- sprintf(buf, "%lu\n", ib_peer_client->stats.num_reg_pages);
- return strlen(buf);
- }
- /* not found - nothing is return */
- return 0;
-}
-
-static ssize_t num_dereg_pages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
-
- if (ib_peer_client) {
- sprintf(buf, "%lu\n", ib_peer_client->stats.num_dereg_pages);
- return strlen(buf);
- }
- /* not found - nothing is return */
- return 0;
-}
-
-static ssize_t num_free_callbacks_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct ib_peer_memory_client *ib_peer_client = get_peer_by_kobj(kobj);
-
- if (ib_peer_client) {
- sprintf(buf, "%lu\n", ib_peer_client->stats.num_free_callbacks);
- return strlen(buf);
- }
- /* not found - nothing is return */
- return 0;
-}
-
-static struct kobj_attribute version_attr = __ATTR_RO(version);
-static struct kobj_attribute num_alloc_mrs = __ATTR_RO(num_alloc_mrs);
-static struct kobj_attribute num_reg_pages = __ATTR_RO(num_reg_pages);
-static struct kobj_attribute num_dereg_pages = __ATTR_RO(num_dereg_pages);
-static struct kobj_attribute num_free_callbacks = __ATTR_RO(num_free_callbacks);
-
-static struct attribute *peer_mem_attrs[] = {
- &version_attr.attr,
- &num_alloc_mrs.attr,
- &num_reg_pages.attr,
- &num_dereg_pages.attr,
- &num_free_callbacks.attr,
- NULL,
-};
-#endif
-
-#if 0
-static void destroy_peer_sysfs(struct ib_peer_memory_client *ib_peer_client)
-{
- kobject_put(ib_peer_client->kobj);
- if (!num_registered_peers)
- kobject_put(peers_kobj);
-
- return;
-}
-
-/* This code uses the sysfs which is not supporeted by the FreeBSD.
- * Will be added in future to the sysctl */
-
-static int create_peer_sysfs(struct ib_peer_memory_client *ib_peer_client)
-{
- int ret;
-
- if (!num_registered_peers) {
- /* creating under /sys/kernel/mm */
- peers_kobj = kobject_create_and_add("memory_peers", mm_kobj);
- if (!peers_kobj)
- return -ENOMEM;
- }
-
- ib_peer_client->peer_mem_attr_group.attrs = peer_mem_attrs;
- /* Dir alreday was created explicitly to get its kernel object for further usage */
- ib_peer_client->peer_mem_attr_group.name = NULL;
- ib_peer_client->kobj = kobject_create_and_add(ib_peer_client->peer_mem->name,
- peers_kobj);
-
- if (!ib_peer_client->kobj) {
- ret = -EINVAL;
- goto free;
- }
-
- /* Create the files associated with this kobject */
- ret = sysfs_create_group(ib_peer_client->kobj,
- &ib_peer_client->peer_mem_attr_group);
- if (ret)
- goto peer_free;
-
- return 0;
-
-peer_free:
- kobject_put(ib_peer_client->kobj);
-
-free:
- if (!num_registered_peers)
- kobject_put(peers_kobj);
-
- return ret;
-}
-#endif
-
-static int ib_invalidate_peer_memory(void *reg_handle,
- void *core_context)
-{
- struct ib_peer_memory_client *ib_peer_client =
- (struct ib_peer_memory_client *)reg_handle;
- struct invalidation_ctx *invalidation_ctx;
- struct core_ticket *core_ticket;
- int need_unlock = 1;
-
- mutex_lock(&ib_peer_client->lock);
- ib_peer_client->stats.num_free_callbacks += 1;
- core_ticket = ib_peer_search_context(ib_peer_client,
- (unsigned long)core_context);
- if (!core_ticket)
- goto out;
-
- invalidation_ctx = (struct invalidation_ctx *)core_ticket->context;
- /* If context not ready yet mark to be invalidated */
- if (!invalidation_ctx->func) {
- invalidation_ctx->peer_invalidated = 1;
- goto out;
- }
-
- invalidation_ctx->func(invalidation_ctx->cookie,
- invalidation_ctx->umem, 0, 0);
- if (invalidation_ctx->inflight_invalidation) {
-
- /* init the completion to wait on before letting other thread to run */
- init_completion(&invalidation_ctx->comp);
- mutex_unlock(&ib_peer_client->lock);
- need_unlock = 0;
- wait_for_completion(&invalidation_ctx->comp);
- }
-
- kfree(invalidation_ctx);
-
-out:
- if (need_unlock)
- mutex_unlock(&ib_peer_client->lock);
-
- return 0;
-}
-
-/* access to that peer client is under its lock - no extra lock is needed */
-unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client,
- void *context)
-{
- struct core_ticket *core_ticket = kzalloc(sizeof(*core_ticket), GFP_KERNEL);
-
- ib_peer_client->last_ticket++;
- core_ticket->context = context;
- core_ticket->key = ib_peer_client->last_ticket;
-
- list_add_tail(&core_ticket->ticket_list,
- &ib_peer_client->core_ticket_list);
-
- return core_ticket->key;
-}
-
-int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client,
- unsigned long key)
-{
- struct core_ticket *core_ticket, *tmp;
-
- list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list,
- ticket_list) {
- if (core_ticket->key == key) {
- list_del(&core_ticket->ticket_list);
- kfree(core_ticket);
- return 0;
- }
- }
-
- return 1;
-}
-
-struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client,
- unsigned long key)
-{
- struct core_ticket *core_ticket, *tmp;
- list_for_each_entry_safe(core_ticket, tmp, &ib_peer_client->core_ticket_list,
- ticket_list) {
- if (core_ticket->key == key)
- return core_ticket;
- }
-
- return NULL;
-}
-
-
-static int ib_memory_peer_check_mandatory(struct peer_memory_client
- *peer_client)
-{
-#define PEER_MEM_MANDATORY_FUNC(x) {\
- offsetof(struct peer_memory_client, x), #x }
-
- static const struct {
- size_t offset;
- char *name;
- } mandatory_table[] = {
- PEER_MEM_MANDATORY_FUNC(acquire),
- PEER_MEM_MANDATORY_FUNC(get_pages),
- PEER_MEM_MANDATORY_FUNC(put_pages),
- PEER_MEM_MANDATORY_FUNC(get_page_size),
- PEER_MEM_MANDATORY_FUNC(dma_map),
- PEER_MEM_MANDATORY_FUNC(dma_unmap)
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
- if (!*(void **) ((void *) peer_client + mandatory_table[i].offset)) {
- printk(KERN_WARNING "Peer memory %s is missing mandatory function %s\n",
- peer_client->name, mandatory_table[i].name);
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
-
-
-void *ib_register_peer_memory_client(struct peer_memory_client *peer_client,
- invalidate_peer_memory *invalidate_callback)
-{
- int ret = 0;
- struct ib_peer_memory_client *ib_peer_client = NULL;
-
- mutex_lock(&peer_memory_mutex);
- if (ib_memory_peer_check_mandatory(peer_client)) {
- ret = -EINVAL;
- goto out;
- }
-
- ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL);
- if (!ib_peer_client)
- goto out;
- ib_peer_client->peer_mem = peer_client;
-
- INIT_LIST_HEAD(&ib_peer_client->core_ticket_list);
- mutex_init(&ib_peer_client->lock);
-#ifdef __FreeBSD__
- ib_peer_client->holdcount = 0;
- ib_peer_client->needwakeup = 0;
- cv_init(&ib_peer_client->peer_cv, "ibprcl");
-#else
- ret = init_srcu_struct(&ib_peer_client->peer_srcu);
- if (ret)
- goto free;
-#endif
-#if 0
- if (create_peer_sysfs(ib_peer_client))
- goto free;
-#endif
- *invalidate_callback = ib_invalidate_peer_memory;
- list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list);
- num_registered_peers++;
- goto out;
-#if 0
-free:
- kfree(ib_peer_client);
- ib_peer_client = NULL;
-#endif
-out:
- mutex_unlock(&peer_memory_mutex);
- return ib_peer_client;
-}
-EXPORT_SYMBOL(ib_register_peer_memory_client);
-
-void ib_unregister_peer_memory_client(void *reg_handle)
-{
- struct ib_peer_memory_client *ib_peer_client =
- (struct ib_peer_memory_client *)reg_handle;
-
- mutex_lock(&peer_memory_mutex);
- /* remove from list to prevent future core clients usage as it goes down */
- list_del(&ib_peer_client->core_peer_list);
-#ifdef __FreeBSD__
- while (ib_peer_client->holdcount != 0) {
- ib_peer_client->needwakeup = 1;
- cv_wait(&ib_peer_client->peer_cv, &peer_memory_mutex.sx);
- }
- cv_destroy(&ib_peer_client->peer_cv);
-#else
- mutex_unlock(&peer_memory_mutex);
- /* peer memory can't go down while there are active clients */
- synchronize_srcu(&ib_peer_client->peer_srcu);
- cleanup_srcu_struct(&ib_peer_client->peer_srcu);
- mutex_lock(&peer_memory_mutex);
-#endif
- num_registered_peers--;
-/* This code uses the sysfs which is not supporeted by the FreeBSD.
- * Will be added in future to the sysctl */
-#if 0
- destroy_peer_sysfs(ib_peer_client);
-#endif
- mutex_unlock(&peer_memory_mutex);
-
- kfree(ib_peer_client);
-}
-EXPORT_SYMBOL(ib_unregister_peer_memory_client);
-
-/* This code uses the sysfs which is not supporeted by the FreeBSD.
- * Will be added in future to the sysctl */
-
-#if 0
-static struct ib_peer_memory_client *get_peer_by_kobj(void *kobj)
-{
- struct ib_peer_memory_client *ib_peer_client;
-
- mutex_lock(&peer_memory_mutex);
- list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) {
- if (ib_peer_client->kobj == kobj)
- goto found;
- }
-
- ib_peer_client = NULL;
-
-found:
-
- mutex_unlock(&peer_memory_mutex);
- return ib_peer_client;
-}
-#endif
-
-struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr,
- size_t size, void **peer_client_context,
- int *srcu_key)
-{
- struct ib_peer_memory_client *ib_peer_client;
- int ret;
-
- mutex_lock(&peer_memory_mutex);
- list_for_each_entry(ib_peer_client, &peer_memory_list, core_peer_list) {
- ret = ib_peer_client->peer_mem->acquire(addr, size,
- context->peer_mem_private_data,
- context->peer_mem_name,
- peer_client_context);
- if (ret == 1)
- goto found;
- }
-
- ib_peer_client = NULL;
-
-found:
- if (ib_peer_client) {
-#ifdef __FreeBSD__
- ib_peer_client->holdcount++;
-#else
- *srcu_key = srcu_read_lock(&ib_peer_client->peer_srcu);
-#endif
- }
-
- mutex_unlock(&peer_memory_mutex);
- return ib_peer_client;
-
-}
-EXPORT_SYMBOL(ib_get_peer_client);
-
-void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
- void *peer_client_context,
- int srcu_key)
-{
-
- if (ib_peer_client->peer_mem->release)
- ib_peer_client->peer_mem->release(peer_client_context);
-
-#ifdef __FreeBSD__
- ib_peer_client->holdcount--;
- if (ib_peer_client->holdcount == 0 && ib_peer_client->needwakeup) {
- cv_signal(&ib_peer_client->peer_cv);
- }
-#else
- srcu_read_unlock(&ib_peer_client->peer_srcu, srcu_key);
-#endif
- return;
-}
-EXPORT_SYMBOL(ib_put_peer_client);
-
diff --git a/sys/ofed/drivers/infiniband/core/sa_query.c b/sys/ofed/drivers/infiniband/core/sa_query.c
index a0c04f5f5228d..d818bad1661bd 100644
--- a/sys/ofed/drivers/infiniband/core/sa_query.c
+++ b/sys/ofed/drivers/infiniband/core/sa_query.c
@@ -41,14 +41,18 @@
#include <linux/kref.h>
#include <linux/idr.h>
#include <linux/workqueue.h>
-
+#include <linux/etherdevice.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_cache.h>
+#include <rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/ib_addr.h>
#include "sa.h"
+#include "core_priv.h"
-MODULE_AUTHOR("Roland Dreier");
-MODULE_DESCRIPTION("InfiniBand subnet administration query support");
-MODULE_LICENSE("Dual BSD/GPL");
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
struct ib_sa_sm_ah {
struct ib_ah *ah;
@@ -57,10 +61,17 @@ struct ib_sa_sm_ah {
u8 src_path_mask;
};
+struct ib_sa_classport_cache {
+ bool valid;
+ struct ib_class_port_info data;
+};
+
struct ib_sa_port {
struct ib_mad_agent *agent;
struct ib_sa_sm_ah *sm_ah;
struct work_struct update_task;
+ struct ib_sa_classport_cache classport_info;
+ spinlock_t classport_lock; /* protects class port info set */
spinlock_t ah_lock;
u8 port_num;
};
@@ -79,8 +90,16 @@ struct ib_sa_query {
struct ib_mad_send_buf *mad_buf;
struct ib_sa_sm_ah *sm_ah;
int id;
+ u32 flags;
+ struct list_head list; /* Local svc request list */
+ u32 seq; /* Local svc request sequence number */
+ unsigned long timeout; /* Local svc timeout */
+ u8 path_use; /* How will the pathrecord be used */
};
+#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
+#define IB_SA_CANCEL 0x00000002
+
struct ib_sa_service_query {
void (*callback)(int, struct ib_sa_service_rec *, void *);
void *context;
@@ -99,6 +118,12 @@ struct ib_sa_guidinfo_query {
struct ib_sa_query sa_query;
};
+struct ib_sa_classport_info_query {
+ void (*callback)(int, struct ib_class_port_info *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
struct ib_sa_mcmember_query {
void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
void *context;
@@ -106,7 +131,7 @@ struct ib_sa_mcmember_query {
};
static void ib_sa_add_one(struct ib_device *device);
-static void ib_sa_remove_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
static struct ib_client sa_client = {
.name = "sa",
@@ -352,6 +377,82 @@ static const struct ib_field service_rec_table[] = {
.size_bits = 2*64 },
};
+#define CLASSPORTINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \
+ .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \
+ .field_name = "ib_class_port_info:" #field
+
+static const struct ib_field classport_info_rec_table[] = {
+ { CLASSPORTINFO_REC_FIELD(base_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(class_version),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(capability_mask),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(redirect_tcslfl),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_lid),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(redirect_pkey),
+ .offset_words = 7,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(redirect_qp),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_qkey),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_gid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(trap_tcslfl),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_lid),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(trap_pkey),
+ .offset_words = 15,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_hlqp),
+ .offset_words = 16,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(trap_qkey),
+ .offset_words = 17,
+ .offset_bits = 0,
+ .size_bits = 32 },
+};
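
As with the other SA record tables in this file, classport_info_rec_table only declares where each member of struct ib_class_port_info sits on the wire (32-bit word, bit offset, width); the generic ib_pack()/ib_unpack() helpers walk such tables to convert between host structs and the big-endian MAD payload. Below is a stripped-down sketch of the same table-driven idea, restricted to byte-aligned fields; struct sample, struct field and pack_fields() are illustrative stand-ins, not the real ib_field machinery.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <arpa/inet.h>	/* htons(), htonl() */

struct sample {
	uint8_t  base_version;
	uint8_t  class_version;
	uint16_t capability_mask;
	uint32_t resp_time;
};

struct field {
	size_t struct_offset;	/* offsetof() in the host struct */
	size_t wire_offset;	/* byte offset in the wire buffer */
	size_t size;		/* field width in bytes: 1, 2 or 4 */
};

static const struct field sample_table[] = {
	{ offsetof(struct sample, base_version),    0, 1 },
	{ offsetof(struct sample, class_version),   1, 1 },
	{ offsetof(struct sample, capability_mask), 2, 2 },
	{ offsetof(struct sample, resp_time),       4, 4 },
};

static void pack_fields(const struct sample *s, uint8_t *buf)
{
	for (size_t i = 0; i < sizeof(sample_table) / sizeof(sample_table[0]); i++) {
		const struct field *f = &sample_table[i];
		const uint8_t *src = (const uint8_t *)s + f->struct_offset;

		if (f->size == 1) {
			buf[f->wire_offset] = *src;
		} else if (f->size == 2) {
			uint16_t v;

			memcpy(&v, src, 2);
			v = htons(v);		/* wire format is big-endian */
			memcpy(buf + f->wire_offset, &v, 2);
		} else {
			uint32_t v;

			memcpy(&v, src, 4);
			v = htonl(v);
			memcpy(buf + f->wire_offset, &v, 4);
		}
	}
}

int main(void)
{
	struct sample s = { 1, 2, 0x1234, 0xdeadbeef };
	uint8_t wire[8] = { 0 };

	pack_fields(&s, wire);
	for (int i = 0; i < 8; i++)
		printf("%02x", wire[i]);
	printf("\n");
	return 0;
}
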
+
#define GUIDINFO_REC_FIELD(field) \
.struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \
.struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \
@@ -380,6 +481,11 @@ static const struct ib_field guidinfo_rec_table[] = {
.size_bits = 512 },
};
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+ query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -397,13 +503,12 @@ static void update_sm_ah(struct work_struct *work)
struct ib_ah_attr ah_attr;
if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
- printk(KERN_WARNING "Couldn't query port\n");
+ pr_warn("Couldn't query port\n");
return;
}
new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
if (!new_ah) {
- printk(KERN_WARNING "Couldn't allocate new SM AH\n");
return;
}
@@ -413,16 +518,21 @@ static void update_sm_ah(struct work_struct *work)
new_ah->pkey_index = 0;
if (ib_find_pkey(port->agent->device, port->port_num,
IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
- printk(KERN_ERR "Couldn't find index for default PKey\n");
+ pr_err("Couldn't find index for default PKey\n");
memset(&ah_attr, 0, sizeof ah_attr);
ah_attr.dlid = port_attr.sm_lid;
ah_attr.sl = port_attr.sm_sl;
ah_attr.port_num = port->port_num;
+ if (port_attr.grh_required) {
+ ah_attr.ah_flags = IB_AH_GRH;
+ ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(port_attr.subnet_prefix);
+ ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID);
+ }
new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
if (IS_ERR(new_ah->ah)) {
- printk(KERN_WARNING "Couldn't create new SM AH\n");
+ pr_warn("Couldn't create new SM AH\n");
kfree(new_ah);
return;
}
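
With the grh_required branch added above, update_sm_ah() reaches the SM through a GRH whose destination GID is the port's subnet prefix paired with the well-known GUID, both stored big-endian. A small sketch of assembling such a 128-bit GID in plain C follows; the prefix and interface-ID values are placeholders, and htobe64() is assumed from endian.h (sys/endian.h on FreeBSD).

#include <stdint.h>
#include <stdio.h>
#include <endian.h>	/* htobe64(); use <sys/endian.h> on FreeBSD */

union gid {
	uint8_t raw[16];
	struct {
		uint64_t subnet_prefix;	/* big-endian on the wire */
		uint64_t interface_id;	/* big-endian on the wire */
	} global;
};

/* Build a GID from a host-order subnet prefix and interface ID. */
static union gid make_gid(uint64_t subnet_prefix, uint64_t interface_id)
{
	union gid g;

	g.global.subnet_prefix = htobe64(subnet_prefix);
	g.global.interface_id = htobe64(interface_id);
	return g;
}

int main(void)
{
	/* Placeholder values; the driver uses port_attr.subnet_prefix and
	 * IB_SA_WELL_KNOWN_GUID from the RDMA headers. */
	union gid g = make_gid(0xfe80000000000000ULL, 0x0002c90300000001ULL);

	for (int i = 0; i < 16; i++)
		printf("%02x%s", g.raw[i], (i & 1) && i != 15 ? ":" : "");
	printf("\n");
	return 0;
}
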
@@ -449,7 +559,7 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
struct ib_sa_port *port =
&sa_dev->port[event->element.port_num - sa_dev->start_port];
- if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_sa(handler->device, port->port_num))
return;
spin_lock_irqsave(&port->ah_lock, flags);
@@ -458,6 +568,13 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
port->sm_ah = NULL;
spin_unlock_irqrestore(&port->ah_lock, flags);
+ if (event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_LID_CHANGE) {
+ spin_lock_irqsave(&port->classport_lock, flags);
+ port->classport_info.valid = false;
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ }
queue_work(ib_wq, &sa_dev->port[event->element.port_num -
sa_dev->start_port].update_task);
}
@@ -500,8 +617,6 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
agent = query->port->agent;
mad_buf = query->mad_buf;
spin_unlock_irqrestore(&idr_lock, flags);
-
- ib_cancel_mad(agent, mad_buf);
}
EXPORT_SYMBOL(ib_sa_cancel_query);
@@ -529,7 +644,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
{
int ret;
u16 gid_index;
- int force_grh;
+ int use_roce;
+ struct net_device *ndev = NULL;
memset(ah_attr, 0, sizeof *ah_attr);
ah_attr->dlid = be16_to_cpu(rec->dlid);
@@ -539,29 +655,87 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
ah_attr->port_num = port_num;
ah_attr->static_rate = rec->rate;
- force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+ use_roce = rdma_cap_eth_ah(device, port_num);
+
+ if (use_roce) {
+ struct net_device *idev;
+ struct net_device *resolved_dev;
+ struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex,
+ .net = rec->net ? rec->net :
+ &init_net};
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+
+ if (!device->get_netdev)
+ return -EOPNOTSUPP;
+
+ rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid);
+ rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid);
+
+ /* validate the route */
+ ret = rdma_resolve_ip_route(&sgid_addr._sockaddr,
+ &dgid_addr._sockaddr, &dev_addr);
+ if (ret)
+ return ret;
- if (rec->hop_limit > 1 || force_grh) {
+ if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+ dev_addr.network == RDMA_NETWORK_IPV6) &&
+ rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
+ return -EINVAL;
+
+ idev = device->get_netdev(device, port_num);
+ if (!idev)
+ return -ENODEV;
+
+ resolved_dev = dev_get_by_index(dev_addr.net,
+ dev_addr.bound_dev_if);
+ if (resolved_dev->if_flags & IFF_LOOPBACK) {
+ dev_put(resolved_dev);
+ resolved_dev = idev;
+ dev_hold(resolved_dev);
+ }
+ ndev = ib_get_ndev_from_path(rec);
+ rcu_read_lock();
+ if ((ndev && ndev != resolved_dev) ||
+ (resolved_dev != idev &&
+ !rdma_is_upper_dev_rcu(idev, resolved_dev)))
+ ret = -EHOSTUNREACH;
+ rcu_read_unlock();
+ dev_put(idev);
+ dev_put(resolved_dev);
+ if (ret) {
+ if (ndev)
+ dev_put(ndev);
+ return ret;
+ }
+ }
+
+ if (rec->hop_limit > 0 || use_roce) {
ah_attr->ah_flags = IB_AH_GRH;
ah_attr->grh.dgid = rec->dgid;
- ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
- &gid_index);
- if (ret)
+ ret = ib_find_cached_gid_by_port(device, &rec->sgid,
+ rec->gid_type, port_num, ndev,
+ &gid_index);
+ if (ret) {
+ if (ndev)
+ dev_put(ndev);
return ret;
+ }
ah_attr->grh.sgid_index = gid_index;
ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
ah_attr->grh.hop_limit = rec->hop_limit;
ah_attr->grh.traffic_class = rec->traffic_class;
+ if (ndev)
+ dev_put(ndev);
}
- if (force_grh) {
- memcpy(ah_attr->dmac, rec->dmac, 6);
- ah_attr->vlan_id = rec->vlan_id;
- } else {
- memset(ah_attr->dmac, 0, 6);
- ah_attr->vlan_id = 0xffff;
- }
+
+ if (use_roce)
+ memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
return 0;
}
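
For RoCE ports the rewritten ib_init_ah_from_path() validates the route before it builds the address handle: it maps the path-record GIDs back to IP addresses, resolves the route, rejects IPv4/IPv6 routes whose GID type is not RoCEv2 (IB_GID_TYPE_ROCE_UDP_ENCAP), and checks that the resolved netdev is, or is stacked on, the port's own netdev. The sketch below compresses that decision flow; resolve_route(), netdev_matches_port() and the enums are hypothetical stand-ins for rdma_resolve_ip_route() and the netdev checks, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

enum gid_type { GID_TYPE_IB, GID_TYPE_ROCE, GID_TYPE_ROCE_UDP_ENCAP };
enum network  { NETWORK_IB, NETWORK_IPV4, NETWORK_IPV6 };

struct path {
	enum gid_type gid_type;
	int hop_limit;
};

/* Illustrative stubs standing in for rdma_resolve_ip_route() and the
 * upper/lower netdev checks done in the real code. */
static int resolve_route(const struct path *p, enum network *net)
{
	(void)p;
	*net = NETWORK_IPV4;
	return 0;
}

static bool netdev_matches_port(void)
{
	return true;
}

static int init_ah_from_path(const struct path *p, bool use_roce, bool *use_grh)
{
	if (use_roce) {
		enum network net;
		int ret = resolve_route(p, &net);

		if (ret)
			return ret;
		/* RoCEv2 routes carry IP headers; require the matching GID type. */
		if ((net == NETWORK_IPV4 || net == NETWORK_IPV6) &&
		    p->gid_type != GID_TYPE_ROCE_UDP_ENCAP)
			return -1;	/* EINVAL in the kernel */
		if (!netdev_matches_port())
			return -2;	/* EHOSTUNREACH in the kernel */
	}
	/* GRH is mandatory on RoCE and for multi-hop IB paths. */
	*use_grh = use_roce || p->hop_limit > 0;
	return 0;
}

int main(void)
{
	struct path p = { GID_TYPE_ROCE_UDP_ENCAP, 0 };
	bool grh = false;
	int ret = init_ah_from_path(&p, true, &grh);

	printf("ret=%d grh=%d\n", ret, grh);
	return 0;
}
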
@@ -583,7 +757,8 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
query->mad_buf = ib_create_send_mad(query->port->agent, 1,
query->sm_ah->pkey_index,
0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
- gfp_mask);
+ gfp_mask,
+ IB_MGMT_BASE_VERSION);
if (IS_ERR(query->mad_buf)) {
kref_put(&query->sm_ah->ref, free_sm_ah);
return -ENOMEM;
@@ -618,24 +793,30 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
{
+ bool preload = gfpflags_allow_blocking(gfp_mask);
unsigned long flags;
int ret, id;
-retry:
- if (!idr_pre_get(&query_idr, gfp_mask))
- return -ENOMEM;
+ if (preload)
+ idr_preload(gfp_mask);
spin_lock_irqsave(&idr_lock, flags);
- ret = idr_get_new(&query_idr, query, &id);
+
+ id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
+
spin_unlock_irqrestore(&idr_lock, flags);
- if (ret == -EAGAIN)
- goto retry;
- if (ret)
- return ret;
+ if (preload)
+ idr_preload_end();
+ if (id < 0)
+ return id;
query->mad_buf->timeout_ms = timeout_ms;
query->mad_buf->context[0] = query;
query->id = id;
+ if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
+ ib_sa_disable_local_svc(query);
+ }
+
ret = ib_post_send_mad(query->mad_buf, NULL);
if (ret) {
spin_lock_irqsave(&idr_lock, flags);
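
The send_mad() hunk above swaps the old idr_pre_get()/idr_get_new() retry loop for idr_preload()/idr_alloc(): when gfp_mask allows blocking, memory is reserved outside the spinlock, and the allocation under idr_lock then runs with GFP_NOWAIT. A self-contained userspace sketch of that shape follows; table_preload(), table_alloc() and the fixed-size table are hypothetical stand-ins for the idr API.

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>

#define TABLE_SIZE 64

struct entry { void *ptr; };

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *table[TABLE_SIZE];
static struct entry *reserve;	/* node preallocated outside the lock */

/* May allocate (i.e. "sleep"); called only when blocking is allowed. */
static void table_preload(void)
{
	if (!reserve)
		reserve = malloc(sizeof(*reserve));
}

/* Must not allocate: runs under table_lock and consumes the reserve. */
static int table_alloc(void *ptr)
{
	if (!reserve)
		return -1;	/* the GFP_NOWAIT failure case */
	for (int id = 0; id < TABLE_SIZE; id++) {
		if (!table[id]) {
			table[id] = reserve;
			table[id]->ptr = ptr;
			reserve = NULL;
			return id;
		}
	}
	return -1;
}

static int send_one(void *query, bool may_block)
{
	int id;

	if (may_block)
		table_preload();	/* outside the lock, like idr_preload() */
	pthread_mutex_lock(&table_lock);
	id = table_alloc(query);	/* non-blocking, like idr_alloc(GFP_NOWAIT) */
	pthread_mutex_unlock(&table_lock);
	return id;			/* negative on failure */
}

int main(void)
{
	int q = 42;

	printf("id=%d\n", send_one(&q, true));
	return 0;
}
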
@@ -657,6 +838,12 @@ void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
}
EXPORT_SYMBOL(ib_sa_unpack_path);
+void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute)
+{
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_path);
+
static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
int status,
struct ib_sa_mad *mad)
@@ -669,10 +856,10 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
mad->data, &rec);
- rec.vlan_id = 0xffff;
- memset(rec.dmac, 0, ETH_ALEN);
- memset(rec.smac, 0, ETH_ALEN);
-
+ rec.net = NULL;
+ rec.ifindex = 0;
+ rec.gid_type = IB_GID_TYPE_IB;
+ eth_zero_addr(rec.dmac);
query->callback(status, &rec, query->context);
} else
query->callback(status, NULL, query->context);
@@ -683,7 +870,6 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
}
-
/**
* ib_sa_path_rec_get - Start a Path get query
* @client:SA client
@@ -710,15 +896,15 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
* the query.
*/
int ib_sa_path_rec_get(struct ib_sa_client *client,
- struct ib_device *device, u8 port_num,
- struct ib_sa_path_rec *rec,
- ib_sa_comp_mask comp_mask,
- int timeout_ms, gfp_t gfp_mask,
- void (*callback)(int status,
- struct ib_sa_path_rec *resp,
- void *context),
- void *context,
- struct ib_sa_query **sa_query)
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
{
struct ib_sa_path_query *query;
struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
@@ -733,7 +919,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -760,6 +946,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
*sa_query = &query->sa_query;
+ query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+ query->sa_query.mad_buf->context[1] = rec;
+
ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
if (ret < 0)
goto err2;
@@ -855,7 +1044,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
method != IB_SA_METHOD_DELETE)
return -EINVAL;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -947,7 +1136,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -993,8 +1182,8 @@ err1:
/* Support GuidInfoRecord */
static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
- int status,
- struct ib_sa_mad *mad)
+ int status,
+ struct ib_sa_mad *mad)
{
struct ib_sa_guidinfo_query *query =
container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
@@ -1027,7 +1216,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
{
struct ib_sa_guidinfo_query *query;
struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
- struct ib_sa_port *port;
+ struct ib_sa_port *port;
struct ib_mad_agent *agent;
struct ib_sa_mad *mad;
int ret;
@@ -1044,19 +1233,19 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
- query->sa_query.port = port;
+ query->sa_query.port = port;
ret = alloc_mad(&query->sa_query, gfp_mask);
if (ret)
goto err1;
ib_sa_client_get(client);
query->sa_query.client = client;
- query->callback = callback;
- query->context = context;
+ query->callback = callback;
+ query->context = context;
mad = query->sa_query.mad_buf->mad;
init_mad(mad, agent);
@@ -1090,6 +1279,121 @@ err1:
}
EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
+/* Support get SA ClassPortInfo */
+static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ unsigned long flags;
+ struct ib_sa_classport_info_query *query =
+ container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
+
+ if (mad) {
+ struct ib_class_port_info rec;
+
+ ib_unpack(classport_info_rec_table,
+ ARRAY_SIZE(classport_info_rec_table),
+ mad->data, &rec);
+
+ spin_lock_irqsave(&sa_query->port->classport_lock, flags);
+ if (!status && !sa_query->port->classport_info.valid) {
+ memcpy(&sa_query->port->classport_info.data, &rec,
+ sizeof(sa_query->port->classport_info.data));
+
+ sa_query->port->classport_info.valid = true;
+ }
+ spin_unlock_irqrestore(&sa_query->port->classport_lock, flags);
+
+ query->callback(status, &rec, query->context);
+ } else {
+ query->callback(status, NULL, query->context);
+ }
+}
+
+static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_classport_info_query,
+ sa_query));
+}
+
+int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_class_port_info *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_classport_info_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ struct ib_class_port_info cached_class_port_info;
+ int ret;
+ unsigned long flags;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ /* Use cached ClassPortInfo attribute if valid instead of sending mad */
+ spin_lock_irqsave(&port->classport_lock, flags);
+ if (port->classport_info.valid && callback) {
+ memcpy(&cached_class_port_info, &port->classport_info.data,
+ sizeof(cached_class_port_info));
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ callback(0, &cached_class_port_info, context);
+ return 0;
+ }
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL;
+
+ query->sa_query.release = ib_sa_portclass_info_rec_release;
+ /* support GET only */
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO);
+ mad->sa_hdr.comp_mask = 0;
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_classport_info_rec_query);
+
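
ib_sa_classport_info_rec_query() first consults the per-port cache under classport_lock and, when the cached ClassPortInfo is valid, copies it out and runs the callback without sending a MAD; ib_sa_event() flushes the cache on SM change, LID change and client re-register events, and the receive callback repopulates it. A small sketch of that copy-out-under-lock pattern, with struct cpi_cache and its helpers as illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct class_port_info {
	unsigned int base_version;
	unsigned int capability_mask;
};

struct cpi_cache {
	pthread_mutex_t lock;
	bool valid;
	struct class_port_info data;
};

static struct cpi_cache cache = { .lock = PTHREAD_MUTEX_INITIALIZER };

/* Invalidate on SM change, LID change or client re-register. */
static void cpi_invalidate(struct cpi_cache *c)
{
	pthread_mutex_lock(&c->lock);
	c->valid = false;
	pthread_mutex_unlock(&c->lock);
}

/* Receive path: the first valid response populates the cache. */
static void cpi_update(struct cpi_cache *c, const struct class_port_info *rec)
{
	pthread_mutex_lock(&c->lock);
	if (!c->valid) {
		c->data = *rec;
		c->valid = true;
	}
	pthread_mutex_unlock(&c->lock);
}

/* Query path: copy out on a hit; on a miss the caller sends the MAD. */
static bool cpi_lookup(struct cpi_cache *c, struct class_port_info *out)
{
	bool hit;

	pthread_mutex_lock(&c->lock);
	hit = c->valid;
	if (hit)
		*out = c->data;
	pthread_mutex_unlock(&c->lock);
	return hit;
}

int main(void)
{
	struct class_port_info rec = { 1, 0x100 }, out;

	printf("cold hit=%d\n", cpi_lookup(&cache, &out));
	cpi_update(&cache, &rec);
	if (cpi_lookup(&cache, &out))
		printf("warm cap=0x%x\n", out.capability_mask);
	cpi_invalidate(&cache);
	printf("flushed hit=%d\n", cpi_lookup(&cache, &out));
	return 0;
}
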
static void send_handler(struct ib_mad_agent *agent,
struct ib_mad_send_wc *mad_send_wc)
{
@@ -1122,14 +1426,15 @@ static void send_handler(struct ib_mad_agent *agent,
}
static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct ib_sa_query *query;
- struct ib_mad_send_buf *mad_buf;
- mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id;
- query = mad_buf->context[0];
+ if (!send_buf)
+ return;
+ query = send_buf->context[0];
if (query->callback) {
if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
query->callback(query,
@@ -1147,16 +1452,10 @@ static void ib_sa_add_one(struct ib_device *device)
{
struct ib_sa_device *sa_dev;
int s, e, i;
+ int count = 0;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH)
- s = e = 0;
- else {
- s = 1;
- e = device->phys_port_cnt;
- }
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
sa_dev = kzalloc(sizeof *sa_dev +
(e - s + 1) * sizeof (struct ib_sa_port),
@@ -1169,22 +1468,30 @@ static void ib_sa_add_one(struct ib_device *device)
for (i = 0; i <= e - s; ++i) {
spin_lock_init(&sa_dev->port[i].ah_lock);
- if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+ if (!rdma_cap_ib_sa(device, i + 1))
continue;
sa_dev->port[i].sm_ah = NULL;
sa_dev->port[i].port_num = i + s;
+ spin_lock_init(&sa_dev->port[i].classport_lock);
+ sa_dev->port[i].classport_info.valid = false;
+
sa_dev->port[i].agent =
ib_register_mad_agent(device, i + s, IB_QPT_GSI,
NULL, 0, send_handler,
- recv_handler, sa_dev);
+ recv_handler, sa_dev, 0);
if (IS_ERR(sa_dev->port[i].agent))
goto err;
INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+
+ count++;
}
+ if (!count)
+ goto free;
+
ib_set_client_data(device, &sa_client, sa_dev);
/*
@@ -1196,31 +1503,28 @@ static void ib_sa_add_one(struct ib_device *device)
INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
if (ib_register_event_handler(&sa_dev->event_handler))
- goto reg_err;
+ goto err;
- for (i = 0; i <= e - s; ++i)
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+ for (i = 0; i <= e - s; ++i) {
+ if (rdma_cap_ib_sa(device, i + 1))
update_sm_ah(&sa_dev->port[i].update_task);
+ }
return;
-reg_err:
- ib_set_client_data(device, &sa_client, NULL);
- i = e - s;
err:
- for (; i >= 0; --i)
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND &&
- !IS_ERR(sa_dev->port[i].agent))
- ib_unregister_mad_agent(sa_dev->port[i].agent);
-
+ while (--i >= 0) {
+ if (rdma_cap_ib_sa(device, i + 1))
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ }
+free:
kfree(sa_dev);
-
return;
}
-static void ib_sa_remove_one(struct ib_device *device)
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_device *sa_dev = client_data;
int i;
if (!sa_dev)
@@ -1231,7 +1535,7 @@ static void ib_sa_remove_one(struct ib_device *device)
flush_workqueue(ib_wq);
for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
- if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+ if (rdma_cap_ib_sa(device, i + 1)) {
ib_unregister_mad_agent(sa_dev->port[i].agent);
if (sa_dev->port[i].sm_ah)
kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
@@ -1242,7 +1546,7 @@ static void ib_sa_remove_one(struct ib_device *device)
kfree(sa_dev);
}
-static int __init ib_sa_init(void)
+int ib_sa_init(void)
{
int ret;
@@ -1250,29 +1554,27 @@ static int __init ib_sa_init(void)
ret = ib_register_client(&sa_client);
if (ret) {
- printk(KERN_ERR "Couldn't register ib_sa client\n");
+ pr_err("Couldn't register ib_sa client\n");
goto err1;
}
ret = mcast_init();
if (ret) {
- printk(KERN_ERR "Couldn't initialize multicast handling\n");
+ pr_err("Couldn't initialize multicast handling\n");
goto err2;
}
return 0;
+
err2:
ib_unregister_client(&sa_client);
err1:
return ret;
}
-static void __exit ib_sa_cleanup(void)
+void ib_sa_cleanup(void)
{
mcast_cleanup();
ib_unregister_client(&sa_client);
idr_destroy(&query_idr);
}
-
-module_init_order(ib_sa_init, SI_ORDER_SECOND);
-module_exit(ib_sa_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/smi.c b/sys/ofed/drivers/infiniband/core/smi.c
deleted file mode 100644
index 5855e4405d9bf..0000000000000
--- a/sys/ofed/drivers/infiniband/core/smi.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
- * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
- * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
- * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
- * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <rdma/ib_smi.h>
-#include "smi.h"
-
-/*
- * Fixup a directed route SMP for sending
- * Return 0 if the SMP should be discarded
- */
-enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
- u8 node_type, int port_num)
-{
- u8 hop_ptr, hop_cnt;
-
- hop_ptr = smp->hop_ptr;
- hop_cnt = smp->hop_cnt;
-
- /* See section 14.2.2.2, Vol 1 IB spec */
- /* C14-6 -- valid hop_cnt values are from 0 to 63 */
- if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
- return IB_SMI_DISCARD;
-
- if (!ib_get_smp_direction(smp)) {
- /* C14-9:1 */
- if (hop_cnt && hop_ptr == 0) {
- smp->hop_ptr++;
- return (smp->initial_path[smp->hop_ptr] ==
- port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-9:2 */
- if (hop_ptr && hop_ptr < hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
- return IB_SMI_DISCARD;
-
- /* smp->return_path set when received */
- smp->hop_ptr++;
- return (smp->initial_path[smp->hop_ptr] ==
- port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-9:3 -- We're at the end of the DR segment of path */
- if (hop_ptr == hop_cnt) {
- /* smp->return_path set when received */
- smp->hop_ptr++;
- return (node_type == RDMA_NODE_IB_SWITCH ||
- smp->dr_dlid == IB_LID_PERMISSIVE ?
- IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
- /* C14-9:5 -- Fail unreasonable hop pointer */
- return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
-
- } else {
- /* C14-13:1 */
- if (hop_cnt && hop_ptr == hop_cnt + 1) {
- smp->hop_ptr--;
- return (smp->return_path[smp->hop_ptr] ==
- port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-13:2 */
- if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
- return IB_SMI_DISCARD;
-
- smp->hop_ptr--;
- return (smp->return_path[smp->hop_ptr] ==
- port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-13:3 -- at the end of the DR segment of path */
- if (hop_ptr == 1) {
- smp->hop_ptr--;
- /* C14-13:3 -- SMPs destined for SM shouldn't be here */
- return (node_type == RDMA_NODE_IB_SWITCH ||
- smp->dr_slid == IB_LID_PERMISSIVE ?
- IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
- if (hop_ptr == 0)
- return IB_SMI_HANDLE;
-
- /* C14-13:5 -- Check for unreasonable hop pointer */
- return IB_SMI_DISCARD;
- }
-}
-
-/*
- * Adjust information for a received SMP
- * Return 0 if the SMP should be dropped
- */
-enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
- int port_num, int phys_port_cnt)
-{
- u8 hop_ptr, hop_cnt;
-
- hop_ptr = smp->hop_ptr;
- hop_cnt = smp->hop_cnt;
-
- /* See section 14.2.2.2, Vol 1 IB spec */
- /* C14-6 -- valid hop_cnt values are from 0 to 63 */
- if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
- return IB_SMI_DISCARD;
-
- if (!ib_get_smp_direction(smp)) {
- /* C14-9:1 -- sender should have incremented hop_ptr */
- if (hop_cnt && hop_ptr == 0)
- return IB_SMI_DISCARD;
-
- /* C14-9:2 -- intermediate hop */
- if (hop_ptr && hop_ptr < hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
- return IB_SMI_DISCARD;
-
- smp->return_path[hop_ptr] = port_num;
- /* smp->hop_ptr updated when sending */
- return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
- IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-9:3 -- We're at the end of the DR segment of path */
- if (hop_ptr == hop_cnt) {
- if (hop_cnt)
- smp->return_path[hop_ptr] = port_num;
- /* smp->hop_ptr updated when sending */
-
- return (node_type == RDMA_NODE_IB_SWITCH ||
- smp->dr_dlid == IB_LID_PERMISSIVE ?
- IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
- /* C14-9:5 -- fail unreasonable hop pointer */
- return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
-
- } else {
-
- /* C14-13:1 */
- if (hop_cnt && hop_ptr == hop_cnt + 1) {
- smp->hop_ptr--;
- return (smp->return_path[smp->hop_ptr] ==
- port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-13:2 */
- if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
- if (node_type != RDMA_NODE_IB_SWITCH)
- return IB_SMI_DISCARD;
-
- /* smp->hop_ptr updated when sending */
- return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
- IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-13:3 -- We're at the end of the DR segment of path */
- if (hop_ptr == 1) {
- if (smp->dr_slid == IB_LID_PERMISSIVE) {
- /* giving SMP to SM - update hop_ptr */
- smp->hop_ptr--;
- return IB_SMI_HANDLE;
- }
- /* smp->hop_ptr updated when sending */
- return (node_type == RDMA_NODE_IB_SWITCH ?
- IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-
- /* C14-13:4 -- hop_ptr = 0 -> give to SM */
- /* C14-13:5 -- Check for unreasonable hop pointer */
- return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
- }
-}
-
-enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
-{
- u8 hop_ptr, hop_cnt;
-
- hop_ptr = smp->hop_ptr;
- hop_cnt = smp->hop_cnt;
-
- if (!ib_get_smp_direction(smp)) {
- /* C14-9:2 -- intermediate hop */
- if (hop_ptr && hop_ptr < hop_cnt)
- return IB_SMI_FORWARD;
-
- /* C14-9:3 -- at the end of the DR segment of path */
- if (hop_ptr == hop_cnt)
- return (smp->dr_dlid == IB_LID_PERMISSIVE ?
- IB_SMI_SEND : IB_SMI_LOCAL);
-
- /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
- if (hop_ptr == hop_cnt + 1)
- return IB_SMI_SEND;
- } else {
- /* C14-13:2 -- intermediate hop */
- if (2 <= hop_ptr && hop_ptr <= hop_cnt)
- return IB_SMI_FORWARD;
-
- /* C14-13:3 -- at the end of the DR segment of path */
- if (hop_ptr == 1)
- return (smp->dr_slid != IB_LID_PERMISSIVE ?
- IB_SMI_SEND : IB_SMI_LOCAL);
- }
- return IB_SMI_LOCAL;
-}
-
-/*
- * Return the forwarding port number from initial_path for outgoing SMP and
- * from return_path for returning SMP
- */
-int smi_get_fwd_port(struct ib_smp *smp)
-{
- return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
- smp->return_path[smp->hop_ptr-1]);
-}
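
The deleted smi_check_forward_dr_smp() reduces the IBA C14-9/C14-13 rules to comparisons of hop_ptr against hop_cnt plus a permissive-LID test. A condensed sketch of that classification for the outgoing direction (direction bit clear) is shown below; struct dr_smp and the constants are simplified placeholders for struct ib_smp.

#include <stdint.h>
#include <stdio.h>

#define LID_PERMISSIVE 0xffff

enum fwd_action { FWD_LOCAL, FWD_SEND, FWD_FORWARD };

struct dr_smp {
	uint8_t hop_ptr, hop_cnt;
	uint16_t dr_dlid;	/* host order here; __be16 in the kernel */
};

/* Outgoing direction only (direction bit clear), per C14-9. */
static enum fwd_action classify(const struct dr_smp *smp)
{
	if (smp->hop_ptr && smp->hop_ptr < smp->hop_cnt)
		return FWD_FORWARD;			/* intermediate switch hop */
	if (smp->hop_ptr == smp->hop_cnt)
		return smp->dr_dlid == LID_PERMISSIVE ?
		       FWD_SEND : FWD_LOCAL;		/* end of the DR segment */
	if (smp->hop_ptr == smp->hop_cnt + 1)
		return FWD_SEND;			/* hand off to the SMA/SM */
	return FWD_LOCAL;
}

int main(void)
{
	struct dr_smp mid = { 2, 4, 0 };
	struct dr_smp end = { 4, 4, LID_PERMISSIVE };

	printf("mid=%d end=%d\n", classify(&mid), classify(&end));
	return 0;
}
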
diff --git a/sys/ofed/drivers/infiniband/core/smi.h b/sys/ofed/drivers/infiniband/core/smi.h
index aff96bac49b4c..33c91c8a16e95 100644
--- a/sys/ofed/drivers/infiniband/core/smi.h
+++ b/sys/ofed/drivers/infiniband/core/smi.h
@@ -51,12 +51,12 @@ enum smi_forward_action {
IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */
};
-enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
int port_num, int phys_port_cnt);
int smi_get_fwd_port(struct ib_smp *smp);
extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
- u8 node_type, int port_num);
+ bool is_switch, int port_num);
/*
* Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
diff --git a/sys/ofed/drivers/infiniband/core/sysfs.c b/sys/ofed/drivers/infiniband/core/sysfs.c
deleted file mode 100644
index 6bcbfb9d9a1d9..0000000000000
--- a/sys/ofed/drivers/infiniband/core/sysfs.c
+++ /dev/null
@@ -1,1026 +0,0 @@
-/*
- * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "core_priv.h"
-
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/printk.h>
-
-#include <rdma/ib_mad.h>
-#include <rdma/ib_pma.h>
-
-struct ib_port {
- struct kobject kobj;
- struct ib_device *ibdev;
- struct attribute_group gid_group;
- struct attribute_group pkey_group;
- u8 port_num;
-};
-
-struct port_attribute {
- struct attribute attr;
- ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
- ssize_t (*store)(struct ib_port *, struct port_attribute *,
- const char *buf, size_t count);
-};
-
-#define PORT_ATTR(_name, _mode, _show, _store) \
-struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
-
-#define PORT_ATTR_RO(_name) \
-struct port_attribute port_attr_##_name = __ATTR_RO(_name)
-
-struct port_table_attribute {
- struct port_attribute attr;
- char name[8];
- int index;
-};
-
-static ssize_t port_attr_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
-{
- struct port_attribute *port_attr =
- container_of(attr, struct port_attribute, attr);
- struct ib_port *p = container_of(kobj, struct ib_port, kobj);
-
- if (!port_attr->show)
- return -EIO;
-
- return port_attr->show(p, port_attr, buf);
-}
-
-static const struct sysfs_ops port_sysfs_ops = {
- .show = port_attr_show
-};
-
-static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
- ssize_t ret;
-
- static const char *state_name[] = {
- [IB_PORT_NOP] = "NOP",
- [IB_PORT_DOWN] = "DOWN",
- [IB_PORT_INIT] = "INIT",
- [IB_PORT_ARMED] = "ARMED",
- [IB_PORT_ACTIVE] = "ACTIVE",
- [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
- };
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- return sprintf(buf, "%d: %s\n", attr.state,
- attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
- state_name[attr.state] : "UNKNOWN");
-}
-
-static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
- ssize_t ret;
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- return sprintf(buf, "0x%x\n", attr.lid);
-}
-
-static ssize_t lid_mask_count_show(struct ib_port *p,
- struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
- ssize_t ret;
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- return sprintf(buf, "%d\n", attr.lmc);
-}
-
-static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
- ssize_t ret;
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- return sprintf(buf, "0x%x\n", attr.sm_lid);
-}
-
-static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
- ssize_t ret;
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- return sprintf(buf, "%d\n", attr.sm_sl);
-}
-
-static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
- ssize_t ret;
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
-}
-
-static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
- char *speed = "";
- int rate; /* in deci-Gb/sec */
- ssize_t ret;
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- ib_active_speed_enum_to_rate(attr.active_speed,
- &rate,
- &speed);
-
- rate *= ib_width_enum_to_int(attr.active_width);
- if (rate < 0)
- return -EINVAL;
-
- return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
- rate / 10, rate % 10 ? ".5" : "",
- ib_width_enum_to_int(attr.active_width), speed);
-}
-
-static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- struct ib_port_attr attr;
-
- ssize_t ret;
-
- ret = ib_query_port(p->ibdev, p->port_num, &attr);
- if (ret)
- return ret;
-
- switch (attr.phys_state) {
- case 1: return sprintf(buf, "1: Sleep\n");
- case 2: return sprintf(buf, "2: Polling\n");
- case 3: return sprintf(buf, "3: Disabled\n");
- case 4: return sprintf(buf, "4: PortConfigurationTraining\n");
- case 5: return sprintf(buf, "5: LinkUp\n");
- case 6: return sprintf(buf, "6: LinkErrorRecovery\n");
- case 7: return sprintf(buf, "7: Phy Test\n");
- default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
- }
-}
-
-static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
- char *buf)
-{
- switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- return sprintf(buf, "%s\n", "InfiniBand");
- case IB_LINK_LAYER_ETHERNET:
- return sprintf(buf, "%s\n", "Ethernet");
- case IB_LINK_LAYER_SCIF:
- return sprintf(buf, "%s\n", "SCIF");
- default:
- return sprintf(buf, "%s\n", "Unknown");
- }
-}
-
-static PORT_ATTR_RO(state);
-static PORT_ATTR_RO(lid);
-static PORT_ATTR_RO(lid_mask_count);
-static PORT_ATTR_RO(sm_lid);
-static PORT_ATTR_RO(sm_sl);
-static PORT_ATTR_RO(cap_mask);
-static PORT_ATTR_RO(rate);
-static PORT_ATTR_RO(phys_state);
-static PORT_ATTR_RO(link_layer);
-
-static struct attribute *port_default_attrs[] = {
- &port_attr_state.attr,
- &port_attr_lid.attr,
- &port_attr_lid_mask_count.attr,
- &port_attr_sm_lid.attr,
- &port_attr_sm_sl.attr,
- &port_attr_cap_mask.attr,
- &port_attr_rate.attr,
- &port_attr_phys_state.attr,
- &port_attr_link_layer.attr,
- NULL
-};
-
-static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
- char *buf)
-{
- struct port_table_attribute *tab_attr =
- container_of(attr, struct port_table_attribute, attr);
- union ib_gid gid;
- ssize_t ret;
-
- ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
- if (ret)
- return ret;
-
- return sprintf(buf, GID_PRINT_FMT"\n",GID_PRINT_ARGS(gid.raw));
-}
-
-static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
- char *buf)
-{
- struct port_table_attribute *tab_attr =
- container_of(attr, struct port_table_attribute, attr);
- u16 pkey;
- ssize_t ret;
-
- ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
- if (ret)
- return ret;
-
- return sprintf(buf, "0x%04x\n", pkey);
-}
-
-static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr,
- char *buf, int c_ext)
-{
- struct port_table_attribute *tab_attr =
- container_of(attr, struct port_table_attribute, attr);
- int offset = tab_attr->index & 0xffff;
- int width = (tab_attr->index >> 16) & 0xff;
- struct ib_mad *in_mad = NULL;
- struct ib_mad *out_mad = NULL;
- ssize_t ret;
-
- if (!p->ibdev->process_mad)
- return -ENXIO;
-
- in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
- out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
- if (!in_mad || !out_mad) {
- ret = -ENOMEM;
- goto out;
- }
-
- in_mad->mad_hdr.base_version = 1;
- in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
- in_mad->mad_hdr.class_version = 1;
- in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
- if (c_ext)
- in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT;
- else
- in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS;
-
- in_mad->data[41] = p->port_num; /* PortSelect field */
-
- if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
- p->port_num, NULL, NULL, in_mad, out_mad) &
- (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
- (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
- ret = -EINVAL;
- goto out;
- }
-
- switch (width) {
- case 4:
- ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
- (4 - (offset % 8))) & 0xf);
- break;
- case 8:
- ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
- break;
- case 16:
- ret = sprintf(buf, "%u\n",
- be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8)));
- break;
- case 32:
- ret = sprintf(buf, "%u\n",
- be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
- break;
- case 64:
- ret = sprintf(buf, "%llu\n",
- (unsigned long long)be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8)));
- break;
- default:
- ret = 0;
- }
-
-out:
- kfree(in_mad);
- kfree(out_mad);
-
- return ret;
-}
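
get_pma_counters() packs each counter's bit width and bit offset into the sysfs attribute index and then slices the value out of the PMA response with the matching be*_to_cpup() helper (the driver additionally skips a 40-byte header inside out_mad->data). A standalone sketch of the same width/offset extraction from a raw attribute payload; read_counter() is an illustrative helper, not part of the driver.

#include <stdint.h>
#include <stdio.h>

/* Read a big-endian counter of 'width' bits starting at bit 'offset'
 * (byte aligned for widths >= 8) from a raw attribute payload. */
static uint64_t read_counter(const uint8_t *data, int offset, int width)
{
	const uint8_t *p = data + offset / 8;
	uint64_t v = 0;

	switch (width) {
	case 4:
		return (*p >> (4 - (offset % 8))) & 0xf;
	case 8:
		return *p;
	case 16:
	case 32:
	case 64:
		for (int i = 0; i < width / 8; i++)
			v = (v << 8) | p[i];	/* big-endian to host */
		return v;
	default:
		return 0;
	}
}

int main(void)
{
	uint8_t payload[64] = { 0 };

	/* SymbolErrorCounter in PortCounters: 16 bits at bit offset 32. */
	payload[4] = 0x01;
	payload[5] = 0x2c;
	printf("symbol_error=%llu\n",
	       (unsigned long long)read_counter(payload, 32, 16));
	return 0;
}
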
-
-#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
-struct port_table_attribute port_pma_attr_##_name = { \
- .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
- .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \
-}
-
-static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
- char *buf)
-{
- return get_pma_counters(p, attr, buf, 0);
-}
-
-static PORT_PMA_ATTR(symbol_error , 0, 16, 32);
-static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
-static PORT_PMA_ATTR(link_downed , 2, 8, 56);
-static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
-static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
-static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
-static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
-static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
-static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
-static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152);
-static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156);
-static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
-static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
-static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
-static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
-static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
-
-static struct attribute *pma_attrs[] = {
- &port_pma_attr_symbol_error.attr.attr,
- &port_pma_attr_link_error_recovery.attr.attr,
- &port_pma_attr_link_downed.attr.attr,
- &port_pma_attr_port_rcv_errors.attr.attr,
- &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
- &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
- &port_pma_attr_port_xmit_discards.attr.attr,
- &port_pma_attr_port_xmit_constraint_errors.attr.attr,
- &port_pma_attr_port_rcv_constraint_errors.attr.attr,
- &port_pma_attr_local_link_integrity_errors.attr.attr,
- &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
- &port_pma_attr_VL15_dropped.attr.attr,
- &port_pma_attr_port_xmit_data.attr.attr,
- &port_pma_attr_port_rcv_data.attr.attr,
- &port_pma_attr_port_xmit_packets.attr.attr,
- &port_pma_attr_port_rcv_packets.attr.attr,
- NULL
-};
-
-static struct attribute_group pma_group = {
- .name = "counters",
- .attrs = pma_attrs
-};
-
-#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset) \
-struct port_table_attribute port_pma_attr_ext_##_name = { \
- .attr = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL), \
- .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \
-}
-
-static ssize_t show_pma_counter_ext(struct ib_port *p,
- struct port_attribute *attr, char *buf)
-{
- return get_pma_counters(p, attr, buf, 1);
-}
-
-static PORT_PMA_ATTR_EXT(port_xmit_data_64 , 0, 64, 64);
-static PORT_PMA_ATTR_EXT(port_rcv_data_64 , 0, 64, 128);
-static PORT_PMA_ATTR_EXT(port_xmit_packets_64 , 0, 64, 192);
-static PORT_PMA_ATTR_EXT(port_rcv_packets_64 , 0, 64, 256);
-static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets , 0, 64, 320);
-static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets , 0, 64, 384);
-static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets , 0, 64, 448);
-static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets , 0, 64, 512);
-
-static struct attribute *pma_attrs_ext[] = {
- &port_pma_attr_ext_port_xmit_data_64.attr.attr,
- &port_pma_attr_ext_port_rcv_data_64.attr.attr,
- &port_pma_attr_ext_port_xmit_packets_64.attr.attr,
- &port_pma_attr_ext_port_rcv_packets_64.attr.attr,
- &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr,
- &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr,
- &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr,
- &port_pma_attr_ext_port_multicast_rcv_packets.attr.attr,
- NULL
-};
-
-static struct attribute_group pma_ext_group = {
- .name = "counters_ext",
- .attrs = pma_attrs_ext
-};
-
-static void ib_port_release(struct kobject *kobj)
-{
- struct ib_port *p = container_of(kobj, struct ib_port, kobj);
- struct attribute *a;
- int i;
-
- for (i = 0; (a = p->gid_group.attrs[i]); ++i)
- kfree(a);
-
- kfree(p->gid_group.attrs);
-
- for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
- kfree(a);
-
- kfree(p->pkey_group.attrs);
-
- kfree(p);
-}
-
-static struct kobj_type port_type = {
- .release = ib_port_release,
- .sysfs_ops = &port_sysfs_ops,
- .default_attrs = port_default_attrs
-};
-
-static void ib_device_release(struct device *device)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- kfree(dev);
-}
-
-#ifdef __linux__
-/* BSD supports this through devfs(5) and devd(8). */
-static int ib_device_uevent(struct device *device,
- struct kobj_uevent_env *env)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- if (add_uevent_var(env, "NAME=%s", dev->name))
- return -ENOMEM;
-
- /*
- * It would be nice to pass the node GUID with the event...
- */
-
- return 0;
-}
-#endif
-
-static struct attribute **
-alloc_group_attrs(ssize_t (*show)(struct ib_port *,
- struct port_attribute *, char *buf),
- int len)
-{
- struct attribute **tab_attr;
- struct port_table_attribute *element;
- int i;
-
- tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
- if (!tab_attr)
- return NULL;
-
- for (i = 0; i < len; i++) {
- element = kzalloc(sizeof(struct port_table_attribute),
- GFP_KERNEL);
- if (!element)
- goto err;
-
- if (snprintf(element->name, sizeof(element->name),
- "%d", i) >= sizeof(element->name)) {
- kfree(element);
- goto err;
- }
-
- element->attr.attr.name = element->name;
- element->attr.attr.mode = S_IRUGO;
- element->attr.show = show;
- element->index = i;
- sysfs_attr_init(&element->attr.attr);
-
- tab_attr[i] = &element->attr.attr;
- }
-
- return tab_attr;
-
-err:
- while (--i >= 0)
- kfree(tab_attr[i]);
- kfree(tab_attr);
- return NULL;
-}
-
-static int add_port(struct ib_device *device, int port_num,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *))
-{
- struct ib_port *p;
- struct ib_port_attr attr;
- int i;
- int ret;
-
- ret = ib_query_port(device, port_num, &attr);
- if (ret)
- return ret;
-
- p = kzalloc(sizeof *p, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
-
- p->ibdev = device;
- p->port_num = port_num;
-
- ret = kobject_init_and_add(&p->kobj, &port_type,
- device->ports_parent,
- "%d", port_num);
- if (ret)
- goto err_put;
-
- ret = sysfs_create_group(&p->kobj, &pma_group);
- if (ret)
- goto err_put;
-
- ret = sysfs_create_group(&p->kobj, &pma_ext_group);
- if (ret)
- goto err_remove_pma;
-
- p->gid_group.name = "gids";
- p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
- if (!p->gid_group.attrs)
- goto err_remove_pma_ext;
-
- ret = sysfs_create_group(&p->kobj, &p->gid_group);
- if (ret)
- goto err_free_gid;
-
- p->pkey_group.name = "pkeys";
- p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
- attr.pkey_tbl_len);
- if (!p->pkey_group.attrs)
- goto err_remove_gid;
-
- ret = sysfs_create_group(&p->kobj, &p->pkey_group);
- if (ret)
- goto err_free_pkey;
-
- if (port_callback) {
- ret = port_callback(device, port_num, &p->kobj);
- if (ret)
- goto err_remove_pkey;
- }
-
- list_add_tail(&p->kobj.entry, &device->port_list);
-#ifdef __linux__
- kobject_uevent(&p->kobj, KOBJ_ADD);
-#endif
- return 0;
-
-err_remove_pkey:
- sysfs_remove_group(&p->kobj, &p->pkey_group);
-
-err_free_pkey:
- for (i = 0; i < attr.pkey_tbl_len; ++i)
- kfree(p->pkey_group.attrs[i]);
-
- kfree(p->pkey_group.attrs);
-
-err_remove_gid:
- sysfs_remove_group(&p->kobj, &p->gid_group);
-
-err_free_gid:
- for (i = 0; i < attr.gid_tbl_len; ++i)
- kfree(p->gid_group.attrs[i]);
-
- kfree(p->gid_group.attrs);
-
-err_remove_pma_ext:
- sysfs_remove_group(&p->kobj, &pma_ext_group);
-
-err_remove_pma:
- sysfs_remove_group(&p->kobj, &pma_group);
-
-err_put:
- kobject_put(device->ports_parent);
- kfree(p);
- return ret;
-}
-
-static ssize_t show_node_type(struct device *device,
- struct device_attribute *attr, char *buf)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- switch (dev->node_type) {
- case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
- case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
- case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
- case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
- case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type);
- default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
- }
-}
-
-static ssize_t show_sys_image_guid(struct device *device,
- struct device_attribute *dev_attr, char *buf)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
- struct ib_device_attr attr;
- ssize_t ret;
-
- ret = ib_query_device(dev, &attr);
- if (ret)
- return ret;
-
- return sprintf(buf, "%04x:%04x:%04x:%04x\n",
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
- be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
-}
-
-static ssize_t show_node_guid(struct device *device,
- struct device_attribute *attr, char *buf)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- return sprintf(buf, "%04x:%04x:%04x:%04x\n",
- be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
- be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
- be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
- be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
-}
-
-static ssize_t show_node_desc(struct device *device,
- struct device_attribute *attr, char *buf)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- return sprintf(buf, "%.64s\n", dev->node_desc);
-}
-
-static ssize_t set_node_desc(struct device *device,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
- struct ib_device_modify desc = {};
- int ret;
-
- if (!dev->modify_device)
- return -EIO;
-
- memcpy(desc.node_desc, buf, min_t(int, count, 64));
- ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
- if (ret)
- return ret;
-
- return count;
-}
-
-static ssize_t show_cmd_perf(struct device *device,
- struct device_attribute *attr, char *buf)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- return sprintf(buf, "%d\n", dev->cmd_perf);
-}
-
-static ssize_t set_cmd_perf(struct device *device,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
- u32 val;
-
- if (sscanf(buf, "0x%x", &val) != 1)
- return -EINVAL;
-
- dev->cmd_perf = val;
-
- return count;
-}
-
-static ssize_t show_cmd_avg(struct device *device,
- struct device_attribute *attr, char *buf)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- return sprintf(buf, "%llu\n", (unsigned long long)dev->cmd_avg);
-}
-
-static ssize_t set_cmd_avg(struct device *device,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- spin_lock(&dev->cmd_perf_lock);
- dev->cmd_avg = 0;
- dev->cmd_n = 0;
- spin_unlock(&dev->cmd_perf_lock);
-
- return count;
-}
-
-static ssize_t show_cmd_n(struct device *device,
- struct device_attribute *attr, char *buf)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- return sprintf(buf, "%d\n", dev->cmd_n);
-}
-
-static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
-static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
-static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
-static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
-static DEVICE_ATTR(cmd_perf, S_IRUGO | S_IWUSR, show_cmd_perf, set_cmd_perf);
-static DEVICE_ATTR(cmd_avg, S_IRUGO | S_IWUSR, show_cmd_avg, set_cmd_avg);
-static DEVICE_ATTR(cmd_n, S_IRUGO, show_cmd_n, NULL);
-
-static struct device_attribute *ib_class_attributes[] = {
- &dev_attr_node_type,
- &dev_attr_sys_image_guid,
- &dev_attr_node_guid,
- &dev_attr_node_desc,
- &dev_attr_cmd_perf,
- &dev_attr_cmd_avg,
- &dev_attr_cmd_n,
-};
-
-static struct class ib_class = {
- .name = "infiniband",
- .dev_release = ib_device_release,
-#ifdef __linux__
- .dev_uevent = ib_device_uevent,
-#endif
-};
-
-/* Show a given attribute in the statistics group */
-static ssize_t show_protocol_stat(const struct device *device,
- struct device_attribute *attr, char *buf,
- unsigned offset)
-{
- struct ib_device *dev = container_of(__DECONST(struct device *, device), struct ib_device, dev);
- union rdma_protocol_stats stats;
- ssize_t ret;
-
- ret = dev->get_protocol_stats(dev, &stats);
- if (ret)
- return ret;
-
- return sprintf(buf, "%llu\n",
- (unsigned long long) ((u64 *) &stats)[offset]);
-}
-
-/* generate a read-only iwarp statistics attribute */
-#define IW_STATS_ENTRY(name) \
-static ssize_t show_##name(struct device *device, \
- struct device_attribute *attr, char *buf) \
-{ \
- return show_protocol_stat(device, attr, buf, \
- offsetof(struct iw_protocol_stats, name) / \
- sizeof (u64)); \
-} \
-static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
-
-IW_STATS_ENTRY(ipInReceives);
-IW_STATS_ENTRY(ipInHdrErrors);
-IW_STATS_ENTRY(ipInTooBigErrors);
-IW_STATS_ENTRY(ipInNoRoutes);
-IW_STATS_ENTRY(ipInAddrErrors);
-IW_STATS_ENTRY(ipInUnknownProtos);
-IW_STATS_ENTRY(ipInTruncatedPkts);
-IW_STATS_ENTRY(ipInDiscards);
-IW_STATS_ENTRY(ipInDelivers);
-IW_STATS_ENTRY(ipOutForwDatagrams);
-IW_STATS_ENTRY(ipOutRequests);
-IW_STATS_ENTRY(ipOutDiscards);
-IW_STATS_ENTRY(ipOutNoRoutes);
-IW_STATS_ENTRY(ipReasmTimeout);
-IW_STATS_ENTRY(ipReasmReqds);
-IW_STATS_ENTRY(ipReasmOKs);
-IW_STATS_ENTRY(ipReasmFails);
-IW_STATS_ENTRY(ipFragOKs);
-IW_STATS_ENTRY(ipFragFails);
-IW_STATS_ENTRY(ipFragCreates);
-IW_STATS_ENTRY(ipInMcastPkts);
-IW_STATS_ENTRY(ipOutMcastPkts);
-IW_STATS_ENTRY(ipInBcastPkts);
-IW_STATS_ENTRY(ipOutBcastPkts);
-IW_STATS_ENTRY(tcpRtoAlgorithm);
-IW_STATS_ENTRY(tcpRtoMin);
-IW_STATS_ENTRY(tcpRtoMax);
-IW_STATS_ENTRY(tcpMaxConn);
-IW_STATS_ENTRY(tcpActiveOpens);
-IW_STATS_ENTRY(tcpPassiveOpens);
-IW_STATS_ENTRY(tcpAttemptFails);
-IW_STATS_ENTRY(tcpEstabResets);
-IW_STATS_ENTRY(tcpCurrEstab);
-IW_STATS_ENTRY(tcpInSegs);
-IW_STATS_ENTRY(tcpOutSegs);
-IW_STATS_ENTRY(tcpRetransSegs);
-IW_STATS_ENTRY(tcpInErrs);
-IW_STATS_ENTRY(tcpOutRsts);
-
-static struct attribute *iw_proto_stats_attrs[] = {
- &dev_attr_ipInReceives.attr,
- &dev_attr_ipInHdrErrors.attr,
- &dev_attr_ipInTooBigErrors.attr,
- &dev_attr_ipInNoRoutes.attr,
- &dev_attr_ipInAddrErrors.attr,
- &dev_attr_ipInUnknownProtos.attr,
- &dev_attr_ipInTruncatedPkts.attr,
- &dev_attr_ipInDiscards.attr,
- &dev_attr_ipInDelivers.attr,
- &dev_attr_ipOutForwDatagrams.attr,
- &dev_attr_ipOutRequests.attr,
- &dev_attr_ipOutDiscards.attr,
- &dev_attr_ipOutNoRoutes.attr,
- &dev_attr_ipReasmTimeout.attr,
- &dev_attr_ipReasmReqds.attr,
- &dev_attr_ipReasmOKs.attr,
- &dev_attr_ipReasmFails.attr,
- &dev_attr_ipFragOKs.attr,
- &dev_attr_ipFragFails.attr,
- &dev_attr_ipFragCreates.attr,
- &dev_attr_ipInMcastPkts.attr,
- &dev_attr_ipOutMcastPkts.attr,
- &dev_attr_ipInBcastPkts.attr,
- &dev_attr_ipOutBcastPkts.attr,
- &dev_attr_tcpRtoAlgorithm.attr,
- &dev_attr_tcpRtoMin.attr,
- &dev_attr_tcpRtoMax.attr,
- &dev_attr_tcpMaxConn.attr,
- &dev_attr_tcpActiveOpens.attr,
- &dev_attr_tcpPassiveOpens.attr,
- &dev_attr_tcpAttemptFails.attr,
- &dev_attr_tcpEstabResets.attr,
- &dev_attr_tcpCurrEstab.attr,
- &dev_attr_tcpInSegs.attr,
- &dev_attr_tcpOutSegs.attr,
- &dev_attr_tcpRetransSegs.attr,
- &dev_attr_tcpInErrs.attr,
- &dev_attr_tcpOutRsts.attr,
- NULL
-};
-
-static struct attribute_group iw_stats_group = {
- .name = "proto_stats",
- .attrs = iw_proto_stats_attrs,
-};
-
-int ib_device_register_sysfs(struct ib_device *device,
- int (*port_callback)(struct ib_device *,
- u8, struct kobject *))
-{
- struct device *class_dev = &device->dev;
- int ret;
- int i;
-
- class_dev->class = &ib_class;
- class_dev->parent = device->dma_device;
- dev_set_name(class_dev, device->name);
- dev_set_drvdata(class_dev, device);
-
- INIT_LIST_HEAD(&device->port_list);
-
- ret = device_register(class_dev);
- if (ret)
- goto err;
-
- for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
- ret = device_create_file(class_dev, ib_class_attributes[i]);
- if (ret)
- goto err_unregister;
- }
-
- device->ports_parent = kobject_create_and_add("ports",&class_dev->kobj);
- if (!device->ports_parent) {
- ret = -ENOMEM;
- goto err_put;
- }
-
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- ret = add_port(device, 0, port_callback);
- if (ret)
- goto err_put;
- } else {
- for (i = 1; i <= device->phys_port_cnt; ++i) {
- ret = add_port(device, i, port_callback);
- if (ret)
- goto err_put;
- }
- }
-
- if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
- ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
- if (ret)
- goto err_put;
- }
-
- return 0;
-
-err_put:
- {
- struct kobject *p, *t;
- struct ib_port *port;
-
- list_for_each_entry_safe(p, t, &device->port_list, entry) {
- list_del(&p->entry);
- port = container_of(p, struct ib_port, kobj);
- sysfs_remove_group(p, &pma_group);
- sysfs_remove_group(p, &port->pkey_group);
- sysfs_remove_group(p, &port->gid_group);
- kobject_put(p);
- }
- }
-
- kobject_put(&class_dev->kobj);
-
-err_unregister:
-
- for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
- device_remove_file(class_dev, ib_class_attributes[i]);
- }
-
- device_unregister(class_dev);
-
-err:
- return ret;
-}
-
-void ib_device_unregister_sysfs(struct ib_device *device)
-{
- int i;
- struct kobject *p, *t;
- struct ib_port *port;
- struct device *class_dev = &device->dev;
-
- /* Hold kobject until ib_dealloc_device() */
- kobject_get(&device->dev.kobj);
-
- for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
- device_remove_file(class_dev, ib_class_attributes[i]);
- }
-
- list_for_each_entry_safe(p, t, &device->port_list, entry) {
- list_del(&p->entry);
- port = container_of(p, struct ib_port, kobj);
- sysfs_remove_group(p, &pma_group);
- sysfs_remove_group(p, &port->pkey_group);
- sysfs_remove_group(p, &port->gid_group);
- kobject_put(p);
- }
-
- kobject_put(device->ports_parent);
- device_unregister(&device->dev);
-}
-
-int ib_sysfs_setup(void)
-{
- return class_register(&ib_class);
-}
-
-void ib_sysfs_cleanup(void)
-{
- class_unregister(&ib_class);
-}
diff --git a/sys/ofed/drivers/infiniband/core/ucm.c b/sys/ofed/drivers/infiniband/core/ucm.c
index 8f20e89da0d5e..b6490427a7bc3 100644
--- a/sys/ofed/drivers/infiniband/core/ucm.c
+++ b/sys/ofed/drivers/infiniband/core/ucm.c
@@ -43,10 +43,10 @@
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/string.h>
#include <asm/uaccess.h>
+#include <rdma/ib.h>
#include <rdma/ib_cm.h>
#include <rdma/ib_user_cm.h>
#include <rdma/ib_marshall.h>
@@ -108,7 +108,7 @@ enum {
#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client ucm_client = {
.name = "ucm",
@@ -175,7 +175,6 @@ static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
{
struct ib_ucm_context *ctx;
- int result;
ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
if (!ctx)
@@ -186,17 +185,10 @@ static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
ctx->file = file;
INIT_LIST_HEAD(&ctx->events);
- do {
- result = idr_pre_get(&ctx_id_table, GFP_KERNEL);
- if (!result)
- goto error;
-
- mutex_lock(&ctx_id_mutex);
- result = idr_get_new(&ctx_id_table, ctx, &ctx->id);
- mutex_unlock(&ctx_id_mutex);
- } while (result == -EAGAIN);
-
- if (result)
+ mutex_lock(&ctx_id_mutex);
+ ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
+ mutex_unlock(&ctx_id_mutex);
+ if (ctx->id < 0)
goto error;
list_add_tail(&ctx->file_list, &file->ctxs);
@@ -378,8 +370,7 @@ static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
list_add_tail(&uevent->file_list, &ctx->file->events);
list_add_tail(&uevent->ctx_list, &ctx->events);
wake_up_interruptible(&ctx->file->poll_wait);
- if (ctx->file->filp)
- selwakeup(&ctx->file->filp->f_selinfo);
+ linux_poll_wakeup(ctx->file->filp);
mutex_unlock(&ctx->file->file_mutex);
return 0;
@@ -667,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
if (result)
goto out;
- result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
- NULL);
+ result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
out:
ib_ucm_ctx_put(ctx);
return result;
@@ -703,14 +693,9 @@ static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
if (!len)
return 0;
- data = kmalloc(len, GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- if (copy_from_user(data, (void __user *)(unsigned long)src, len)) {
- kfree(data);
- return -EFAULT;
- }
+ data = memdup_user((void __user *)(unsigned long)src, len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
*dest = data;
return 0;
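/*
 * Illustrative sketch, not part of the patch: the copy-in idiom the hunk
 * above switches to.  memdup_user() allocates a kernel buffer, copies the
 * user data into it and returns either the buffer or an ERR_PTR(), which
 * collapses the old kmalloc()/copy_from_user()/kfree() sequence.  The
 * function name example_copy_in() and its parameters are hypothetical.
 */
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

static int example_copy_in(const void **dest, u64 src, u32 len)
{
	void *data;

	if (!len)
		return 0;

	data = memdup_user((void __user *)(unsigned long)src, len);
	if (IS_ERR(data))
		return PTR_ERR(data);	/* -EFAULT or -ENOMEM */

	*dest = data;			/* the caller kfree()s this later */
	return 0;
}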
@@ -1118,6 +1103,9 @@ static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
struct ib_ucm_cmd_hdr hdr;
ssize_t result;
+ if (WARN_ON_ONCE(!ib_safe_file_access(filp)))
+ return -EACCES;
+
if (len < sizeof(hdr))
return -EINVAL;
@@ -1207,6 +1195,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)
return 0;
}
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
static void ib_ucm_release_dev(struct device *dev)
{
struct ib_ucm_device *ucm_dev;
@@ -1214,17 +1203,17 @@ static void ib_ucm_release_dev(struct device *dev)
ucm_dev = container_of(dev, struct ib_ucm_device, dev);
cdev_del(&ucm_dev->cdev);
if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
- clear_bit(ucm_dev->devnum, dev_map);
+ clear_bit(ucm_dev->devnum, dev_map);
else
- clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map);
+ clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);
kfree(ucm_dev);
}
static const struct file_operations ucm_fops = {
- .owner = THIS_MODULE,
- .open = ib_ucm_open,
+ .owner = THIS_MODULE,
+ .open = ib_ucm_open,
.release = ib_ucm_close,
- .write = ib_ucm_write,
+ .write = ib_ucm_write,
.poll = ib_ucm_poll,
.llseek = no_llseek,
};
@@ -1240,7 +1229,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
static dev_t overflow_maj;
-static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
static int find_overflow_devnum(void)
{
int ret;
@@ -1249,7 +1237,7 @@ static int find_overflow_devnum(void)
ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES,
"infiniband_cm");
if (ret) {
- printk(KERN_ERR "ucm: couldn't register dynamic device number\n");
+ pr_err("ucm: couldn't register dynamic device number\n");
return ret;
}
}
@@ -1267,8 +1255,7 @@ static void ib_ucm_add_one(struct ib_device *device)
dev_t base;
struct ib_ucm_device *ucm_dev;
- if (!device->alloc_ucontext ||
- rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ if (!device->alloc_ucontext || !rdma_cap_ib_cm(device, 1))
return;
ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
@@ -1281,7 +1268,7 @@ static void ib_ucm_add_one(struct ib_device *device)
if (devnum >= IB_UCM_MAX_DEVICES) {
devnum = find_overflow_devnum();
if (devnum < 0)
- goto err;
+ goto err;
ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
base = devnum + overflow_maj;
@@ -1325,9 +1312,9 @@ err:
return;
}
-static void ib_ucm_remove_one(struct ib_device *device)
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+ struct ib_ucm_device *ucm_dev = client_data;
if (!ucm_dev)
return;
@@ -1335,12 +1322,8 @@ static void ib_ucm_remove_one(struct ib_device *device)
device_unregister(&ucm_dev->dev);
}
-static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf)
-{
- return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION);
-}
-
-static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_CM_ABI_VERSION));
static int __init ib_ucm_init(void)
{
@@ -1349,25 +1332,25 @@ static int __init ib_ucm_init(void)
ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
"infiniband_cm");
if (ret) {
- printk(KERN_ERR "ucm: couldn't register device number\n");
+ pr_err("ucm: couldn't register device number\n");
goto error1;
}
- ret = class_create_file(&cm_class, &class_attr_abi_version);
+ ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
if (ret) {
- printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
+ pr_err("ucm: couldn't create abi_version attribute\n");
goto error2;
}
ret = ib_register_client(&ucm_client);
if (ret) {
- printk(KERN_ERR "ucm: couldn't register client\n");
+ pr_err("ucm: couldn't register client\n");
goto error3;
}
return 0;
error3:
- class_remove_file(&cm_class, &class_attr_abi_version);
+ class_remove_file(&cm_class, &class_attr_abi_version.attr);
error2:
unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
error1:
@@ -1377,7 +1360,7 @@ error1:
static void __exit ib_ucm_cleanup(void)
{
ib_unregister_client(&ucm_client);
- class_remove_file(&cm_class, &class_attr_abi_version);
+ class_remove_file(&cm_class, &class_attr_abi_version.attr);
unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
if (overflow_maj)
unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
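/*
 * Illustrative sketch, not part of the patch: the id-allocation idiom used
 * by the ucm hunks above.  idr_alloc() replaces the old idr_pre_get()/
 * idr_get_new() retry loop with a single call that returns the new id or a
 * negative errno.  The names example_table, example_mutex and
 * struct example_ctx are hypothetical.
 */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_IDR(example_table);
static DEFINE_MUTEX(example_mutex);

struct example_ctx {
	int id;
};

static struct example_ctx *example_ctx_alloc(void)
{
	struct example_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	/* The mutex serializes against lookups; idr_alloc() handles the id
	 * bookkeeping and may sleep with GFP_KERNEL. */
	mutex_lock(&example_mutex);
	ctx->id = idr_alloc(&example_table, ctx, 0, 0, GFP_KERNEL);
	mutex_unlock(&example_mutex);
	if (ctx->id < 0) {
		kfree(ctx);
		return NULL;
	}
	return ctx;
}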
diff --git a/sys/ofed/drivers/infiniband/core/ucma.c b/sys/ofed/drivers/infiniband/core/ucma.c
index 4000aa20cd889..8646365122386 100644
--- a/sys/ofed/drivers/infiniband/core/ucma.c
+++ b/sys/ofed/drivers/infiniband/core/ucma.c
@@ -48,6 +48,8 @@
#include <rdma/ib_marshall.h>
#include <rdma/rdma_cm.h>
#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -61,6 +63,7 @@ struct ucma_file {
struct list_head ctx_list;
struct list_head event_list;
wait_queue_head_t poll_wait;
+ struct workqueue_struct *close_wq;
};
struct ucma_context {
@@ -76,6 +79,13 @@ struct ucma_context {
struct list_head list;
struct list_head mc_list;
+ /* marks that the device is in the process of destroying the internal
+ * HW resources; protected by the global mut
+ */
+ int closing;
+ /* sync between removal event and id destroy, protected by file mut */
+ int destroying;
+ struct work_struct close_work;
};
struct ucma_multicast {
@@ -84,6 +94,7 @@ struct ucma_multicast {
int events_reported;
u64 uid;
+ u8 join_state;
struct list_head list;
struct sockaddr_storage addr;
};
@@ -94,6 +105,7 @@ struct ucma_event {
struct list_head list;
struct rdma_cm_id *cm_id;
struct rdma_ucm_event_resp resp;
+ struct work_struct close_work;
};
static DEFINE_MUTEX(mut);
@@ -119,8 +131,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
mutex_lock(&mut);
ctx = _ucma_find_context(id, file);
- if (!IS_ERR(ctx))
- atomic_inc(&ctx->ref);
+ if (!IS_ERR(ctx)) {
+ if (ctx->closing)
+ ctx = ERR_PTR(-EIO);
+ else
+ atomic_inc(&ctx->ref);
+ }
mutex_unlock(&mut);
return ctx;
}
@@ -131,31 +147,46 @@ static void ucma_put_ctx(struct ucma_context *ctx)
complete(&ctx->comp);
}
+static void ucma_close_event_id(struct work_struct *work)
+{
+ struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work);
+
+ rdma_destroy_id(uevent_close->cm_id);
+ kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+ struct ucma_context *ctx = container_of(work, struct ucma_context, close_work);
+
+ /* Once all in-flight tasks have finished, we close all underlying
+ * resources. The context stays alive until it is explicitly destroyed
+ * by its creator.
+ */
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+}
+
static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
{
struct ucma_context *ctx;
- int ret;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ INIT_WORK(&ctx->close_work, ucma_close_id);
atomic_set(&ctx->ref, 1);
init_completion(&ctx->comp);
INIT_LIST_HEAD(&ctx->mc_list);
ctx->file = file;
- do {
- ret = idr_pre_get(&ctx_idr, GFP_KERNEL);
- if (!ret)
- goto error;
-
- mutex_lock(&mut);
- ret = idr_get_new(&ctx_idr, ctx, &ctx->id);
- mutex_unlock(&mut);
- } while (ret == -EAGAIN);
-
- if (ret)
+ mutex_lock(&mut);
+ ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
+ mutex_unlock(&mut);
+ if (ctx->id < 0)
goto error;
list_add_tail(&ctx->list, &file->ctx_list);
@@ -169,23 +200,15 @@ error:
static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
{
struct ucma_multicast *mc;
- int ret;
mc = kzalloc(sizeof(*mc), GFP_KERNEL);
if (!mc)
return NULL;
- do {
- ret = idr_pre_get(&multicast_idr, GFP_KERNEL);
- if (!ret)
- goto error;
-
- mutex_lock(&mut);
- ret = idr_get_new(&multicast_idr, mc, &mc->id);
- mutex_unlock(&mut);
- } while (ret == -EAGAIN);
-
- if (ret)
+ mutex_lock(&mut);
+ mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL);
+ mutex_unlock(&mut);
+ if (mc->id < 0)
goto error;
mc->ctx = ctx;
@@ -233,8 +256,8 @@ static void ucma_set_event_context(struct ucma_context *ctx,
switch (event->event) {
case RDMA_CM_EVENT_MULTICAST_JOIN:
case RDMA_CM_EVENT_MULTICAST_ERROR:
- uevent->mc = (struct ucma_multicast *)
- event->param.ud.private_data;
+ uevent->mc = __DECONST(struct ucma_multicast *,
+ event->param.ud.private_data);
uevent->resp.uid = uevent->mc->uid;
uevent->resp.id = uevent->mc->id;
break;
@@ -245,6 +268,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
}
}
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+ struct ucma_context *ctx = cm_id->context;
+ struct ucma_event *con_req_eve;
+ int event_found = 0;
+
+ if (ctx->destroying)
+ return;
+
+ /* The context owns the cm_id and can queue it for closing only if
+ * ctx->cm_id points to it. Otherwise the cm_id is an in-flight one
+ * that sits on this context's event list, waiting to be detached and
+ * reattached to its new context by ucma_get_event; that case is
+ * handled separately below.
+ */
+ if (ctx->cm_id == cm_id) {
+ mutex_lock(&mut);
+ ctx->closing = 1;
+ mutex_unlock(&mut);
+ queue_work(ctx->file->close_wq, &ctx->close_work);
+ return;
+ }
+
+ list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+ if (con_req_eve->cm_id == cm_id &&
+ con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ list_del(&con_req_eve->list);
+ INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+ queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+ event_found = 1;
+ break;
+ }
+ }
+ if (!event_found)
+ pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
static int ucma_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
@@ -274,21 +335,27 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
goto out;
}
ctx->backlog--;
- } else if (!ctx->uid) {
+ } else if (!ctx->uid || ctx->cm_id != cm_id) {
/*
* We ignore events for new connections until userspace has set
* their context. This can only happen if an error occurs on a
* new connection before the user accepts it. This is okay,
- * since the accept will just fail later.
+ * since the accept will just fail later. However, we do need
+ * to release the underlying HW resources in case of a device
+ * removal event.
*/
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
+
kfree(uevent);
goto out;
}
list_add_tail(&uevent->list, &ctx->file->event_list);
wake_up_interruptible(&ctx->file->poll_wait);
- if (ctx->file->filp)
- selwakeup(&ctx->file->filp->f_selinfo);
+ linux_poll_wakeup(ctx->file->filp);
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
out:
mutex_unlock(&ctx->file->mut);
return ret;
@@ -334,7 +401,6 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
ctx->cm_id = uevent->cm_id;
ctx->cm_id->context = ctx;
uevent->resp.id = ctx->id;
- ctx->cm_id->ucontext = ctx;
}
if (copy_to_user((void __user *)(unsigned long)cmd.response,
@@ -372,7 +438,7 @@ static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_
}
static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
- int in_len, int out_len)
+ int in_len, int out_len)
{
struct rdma_ucm_create_id cmd;
struct rdma_ucm_create_id_resp resp;
@@ -397,12 +463,12 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
return -ENOMEM;
ctx->uid = cmd.uid;
- ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
+ ctx->cm_id = rdma_create_id(TD_TO_VNET(curthread),
+ ucma_event_handler, ctx, cmd.ps, qp_type);
if (IS_ERR(ctx->cm_id)) {
ret = PTR_ERR(ctx->cm_id);
goto err1;
}
- ctx->cm_id->ucontext = ctx;
resp.id = ctx->id;
if (copy_to_user((void __user *)(unsigned long)cmd.response,
@@ -449,9 +515,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
}
/*
- * We cannot hold file->mut when calling rdma_destroy_id() or we can
- * deadlock. We also acquire file->mut in ucma_event_handler(), and
- * rdma_destroy_id() will wait until all callbacks have completed.
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to clean up the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing. We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context's pending-event list while holding the
+ * mutex. After that we release them as needed.
*/
static int ucma_free_ctx(struct ucma_context *ctx)
{
@@ -459,8 +531,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
struct ucma_event *uevent, *tmp;
LIST_HEAD(list);
- /* No new events will be generated after destroying the id. */
- rdma_destroy_id(ctx->cm_id);
ucma_cleanup_multicast(ctx);
@@ -508,10 +578,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ucma_put_ctx(ctx);
- wait_for_completion(&ctx->comp);
- resp.events_reported = ucma_free_ctx(ctx);
+ mutex_lock(&ctx->file->mut);
+ ctx->destroying = 1;
+ mutex_unlock(&ctx->file->mut);
+ flush_workqueue(ctx->file->close_wq);
+ /* At this point it is guaranteed that there is no in-flight
+ * closing task. */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
+ resp.events_reported = ucma_free_ctx(ctx);
if (copy_to_user((void __user *)(unsigned long)cmd.response,
&resp, sizeof(resp)))
ret = -EFAULT;
@@ -519,10 +603,10 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
return ret;
}
-static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf,
+static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
int in_len, int out_len)
{
- struct rdma_ucm_bind_addr cmd;
+ struct rdma_ucm_bind_ip cmd;
struct ucma_context *ctx;
int ret;
@@ -538,24 +622,75 @@ static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf,
return ret;
}
+static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind cmd;
+ struct sockaddr *addr;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ addr = (struct sockaddr *) &cmd.addr;
+ if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr)))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_bind_addr(ctx->cm_id, addr);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_ip(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_ip cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+ (struct sockaddr *) &cmd.dst_addr,
+ cmd.timeout_ms);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
static ssize_t ucma_resolve_addr(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
{
struct rdma_ucm_resolve_addr cmd;
+ struct sockaddr *src, *dst;
struct ucma_context *ctx;
int ret;
if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
return -EFAULT;
+ src = (struct sockaddr *) &cmd.src_addr;
+ dst = (struct sockaddr *) &cmd.dst_addr;
+ if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) ||
+ !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst)))
+ return -EINVAL;
+
ctx = ucma_get_ctx(file, cmd.id);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
- (struct sockaddr *) &cmd.dst_addr,
- cmd.timeout_ms);
+ ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms);
ucma_put_ctx(ctx);
return ret;
}
@@ -648,7 +783,7 @@ static ssize_t ucma_query_route(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
{
- struct rdma_ucm_query_route cmd;
+ struct rdma_ucm_query cmd;
struct rdma_ucm_query_route_resp resp;
struct ucma_context *ctx;
struct sockaddr *addr;
@@ -678,26 +813,13 @@ static ssize_t ucma_query_route(struct ucma_file *file,
resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
resp.port_num = ctx->cm_id->port_num;
- switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- switch (rdma_port_get_link_layer(ctx->cm_id->device,
- ctx->cm_id->port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
- ucma_copy_ib_route(&resp, &ctx->cm_id->route);
- break;
- case IB_LINK_LAYER_ETHERNET:
- ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
- break;
- default:
- break;
- }
- break;
- case RDMA_TRANSPORT_IWARP:
+
+ if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num))
ucma_copy_iw_route(&resp, &ctx->cm_id->route);
- break;
- default:
- break;
- }
out:
if (copy_to_user((void __user *)(unsigned long)cmd.response,
@@ -708,7 +830,162 @@ out:
return ret;
}
-static void ucma_copy_conn_param(struct rdma_conn_param *dst,
+static void ucma_query_device_addr(struct rdma_cm_id *cm_id,
+ struct rdma_ucm_query_addr_resp *resp)
+{
+ if (!cm_id->device)
+ return;
+
+ resp->node_guid = (__force __u64) cm_id->device->node_guid;
+ resp->port_num = cm_id->port_num;
+ resp->pkey = (__force __u16) cpu_to_be16(
+ ib_addr_get_pkey(&cm_id->route.addr.dev_addr));
+}
+
+static ssize_t ucma_query_addr(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ resp.src_size = rdma_addr_size(addr);
+ memcpy(&resp.src_addr, addr, resp.src_size);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ resp.dst_size = rdma_addr_size(addr);
+ memcpy(&resp.dst_addr, addr, resp.dst_size);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ if (copy_to_user(response, &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query_path(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_path_resp *resp;
+ int i, ret = 0;
+
+ if (out_len < sizeof(*resp))
+ return -ENOSPC;
+
+ resp = kzalloc(out_len, GFP_KERNEL);
+ if (!resp)
+ return -ENOMEM;
+
+ resp->num_paths = ctx->cm_id->route.num_paths;
+ for (i = 0, out_len -= sizeof(*resp);
+ i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data);
+ i++, out_len -= sizeof(struct ib_path_rec_data)) {
+
+ resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL;
+ ib_sa_pack_path(&ctx->cm_id->route.path_rec[i],
+ &resp->path_data[i].path_rec);
+ }
+
+ if (copy_to_user(response, resp,
+ sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+ ret = -EFAULT;
+
+ kfree(resp);
+ return ret;
+}
+
+static ssize_t ucma_query_gid(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr_ib *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ addr = (struct sockaddr_ib *) &resp.src_addr;
+ resp.src_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr,
+ (union ib_gid *) &addr->sib_addr);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.src_addr);
+ }
+
+ addr = (struct sockaddr_ib *) &resp.dst_addr;
+ resp.dst_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr,
+ (union ib_gid *) &addr->sib_addr);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.dst_addr);
+ }
+
+ if (copy_to_user(response, &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query cmd;
+ struct ucma_context *ctx;
+ void __user *response;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ response = (void __user *)(unsigned long) cmd.response;
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ switch (cmd.option) {
+ case RDMA_USER_CM_QUERY_ADDR:
+ ret = ucma_query_addr(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_PATH:
+ ret = ucma_query_path(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_GID:
+ ret = ucma_query_gid(ctx, response, out_len);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_conn_param(struct rdma_cm_id *id,
+ struct rdma_conn_param *dst,
struct rdma_ucm_conn_param *src)
{
dst->private_data = src->private_data;
@@ -720,6 +997,7 @@ static void ucma_copy_conn_param(struct rdma_conn_param *dst,
dst->rnr_retry_count = src->rnr_retry_count;
dst->srq = src->srq;
dst->qp_num = src->qp_num;
+ dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0;
}
static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
@@ -740,7 +1018,7 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
ret = rdma_connect(ctx->cm_id, &conn_param);
ucma_put_ctx(ctx);
return ret;
@@ -783,7 +1061,7 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
return PTR_ERR(ctx);
if (cmd.conn_param.valid) {
- ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
mutex_lock(&file->mut);
ret = rdma_accept(ctx->cm_id, &conn_param);
if (!ret)
@@ -924,6 +1202,8 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
if (!optlen)
return -EINVAL;
+ memset(&sa_path, 0, sizeof(sa_path));
+
ib_sa_unpack_path(path_data->path_rec, &sa_path);
ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
if (ret)
@@ -937,22 +1217,12 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
void *optval, size_t optlen)
{
- int ret = 0;
+ int ret;
switch (optname) {
case RDMA_OPTION_IB_PATH:
ret = ucma_set_ib_path(ctx, optval, optlen);
break;
-
- case RDMA_OPTION_IB_APM:
- if (optlen != sizeof(u8)) {
- ret = -EINVAL;
- break;
- }
- if (*(u8 *)optval)
- ret = rdma_enable_apm(ctx->cm_id, RDMA_ALT_PATH_BEST);
- break;
-
default:
ret = -ENOSYS;
}
@@ -994,24 +1264,18 @@ static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- optval = kmalloc(cmd.optlen, GFP_KERNEL);
- if (!optval) {
- ret = -ENOMEM;
- goto err_ucma_put_ctx;
- }
-
- if (copy_from_user(optval, (void __user *)(unsigned long)cmd.optval,
- cmd.optlen)) {
- ret = -EFAULT;
- goto err_kfree;
+ optval = memdup_user((void __user *) (unsigned long) cmd.optval,
+ cmd.optlen);
+ if (IS_ERR(optval)) {
+ ret = PTR_ERR(optval);
+ goto out;
}
ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
cmd.optlen);
-
-err_kfree:
kfree(optval);
-err_ucma_put_ctx:
+
+out:
ucma_put_ctx(ctx);
return ret;
}
@@ -1035,23 +1299,31 @@ static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
return ret;
}
-static ssize_t ucma_join_multicast(struct ucma_file *file,
- const char __user *inbuf,
- int in_len, int out_len)
+static ssize_t ucma_process_join(struct ucma_file *file,
+ struct rdma_ucm_join_mcast *cmd, int out_len)
{
- struct rdma_ucm_join_mcast cmd;
struct rdma_ucm_create_id_resp resp;
struct ucma_context *ctx;
struct ucma_multicast *mc;
+ struct sockaddr *addr;
int ret;
+ u8 join_state;
if (out_len < sizeof(resp))
return -ENOSPC;
- if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
- return -EFAULT;
+ addr = (struct sockaddr *) &cmd->addr;
+ if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+ return -EINVAL;
- ctx = ucma_get_ctx(file, cmd.id);
+ if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER)
+ join_state = BIT(FULLMEMBER_JOIN);
+ else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)
+ join_state = BIT(SENDONLY_FULLMEMBER_JOIN);
+ else
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd->id);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
@@ -1061,15 +1333,16 @@ static ssize_t ucma_join_multicast(struct ucma_file *file,
ret = -ENOMEM;
goto err1;
}
-
- mc->uid = cmd.uid;
- memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr);
- ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+ mc->join_state = join_state;
+ mc->uid = cmd->uid;
+ memcpy(&mc->addr, addr, cmd->addr_size);
+ ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
+ join_state, mc);
if (ret)
goto err2;
resp.id = mc->id;
- if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ if (copy_to_user((void __user *)(unsigned long) cmd->response,
&resp, sizeof(resp))) {
ret = -EFAULT;
goto err3;
@@ -1094,6 +1367,38 @@ err1:
return ret;
}
+static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_ip_mcast cmd;
+ struct rdma_ucm_join_mcast join_cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ join_cmd.response = cmd.response;
+ join_cmd.uid = cmd.uid;
+ join_cmd.id = cmd.id;
+ join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
+ join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
+ memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
+
+ return ucma_process_join(file, &join_cmd, out_len);
+}
+
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_mcast cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ return ucma_process_join(file, &cmd, out_len);
+}
+
static ssize_t ucma_leave_multicast(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len)
@@ -1115,10 +1420,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
mc = ERR_PTR(-ENOENT);
else if (mc->ctx->file != file)
mc = ERR_PTR(-EINVAL);
- else {
+ else if (!atomic_inc_not_zero(&mc->ctx->ref))
+ mc = ERR_PTR(-ENXIO);
+ else
idr_remove(&multicast_idr, mc->id);
- atomic_inc(&mc->ctx->ref);
- }
mutex_unlock(&mut);
if (IS_ERR(mc)) {
@@ -1148,10 +1453,10 @@ static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
/* Acquire mutex's based on pointer comparison to prevent deadlock. */
if (file1 < file2) {
mutex_lock(&file1->mut);
- mutex_lock(&file2->mut);
+ mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING);
} else {
mutex_lock(&file2->mut);
- mutex_lock(&file1->mut);
+ mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING);
}
}
@@ -1236,25 +1541,29 @@ file_put:
static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
const char __user *inbuf,
int in_len, int out_len) = {
- [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id,
- [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id,
- [RDMA_USER_CM_CMD_BIND_ADDR] = ucma_bind_addr,
- [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
- [RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route,
- [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route,
- [RDMA_USER_CM_CMD_CONNECT] = ucma_connect,
- [RDMA_USER_CM_CMD_LISTEN] = ucma_listen,
- [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept,
- [RDMA_USER_CM_CMD_REJECT] = ucma_reject,
- [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect,
- [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
- [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
- [RDMA_USER_CM_CMD_GET_OPTION] = NULL,
- [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
- [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
- [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast,
- [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
- [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id
+ [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id,
+ [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id,
+ [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route,
+ [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route,
+ [RDMA_USER_CM_CMD_CONNECT] = ucma_connect,
+ [RDMA_USER_CM_CMD_LISTEN] = ucma_listen,
+ [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept,
+ [RDMA_USER_CM_CMD_REJECT] = ucma_reject,
+ [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect,
+ [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
+ [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
+ [RDMA_USER_CM_CMD_GET_OPTION] = NULL,
+ [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
+ [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
+ [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast,
+ [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
+ [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id,
+ [RDMA_USER_CM_CMD_QUERY] = ucma_query,
+ [RDMA_USER_CM_CMD_BIND] = ucma_bind,
+ [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast
};
static ssize_t ucma_write(struct file *filp, const char __user *buf,
@@ -1264,6 +1573,9 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf,
struct rdma_ucm_cmd_hdr hdr;
ssize_t ret;
+ if (WARN_ON_ONCE(!ib_safe_file_access(filp)))
+ return -EACCES;
+
if (len < sizeof(hdr))
return -EINVAL;
@@ -1315,6 +1627,13 @@ static int ucma_open(struct inode *inode, struct file *filp)
if (!file)
return -ENOMEM;
+ file->close_wq = alloc_ordered_workqueue("ucma_close_id",
+ WQ_MEM_RECLAIM);
+ if (!file->close_wq) {
+ kfree(file);
+ return -ENOMEM;
+ }
+
INIT_LIST_HEAD(&file->event_list);
INIT_LIST_HEAD(&file->ctx_list);
init_waitqueue_head(&file->poll_wait);
@@ -1333,16 +1652,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
mutex_lock(&file->mut);
list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ ctx->destroying = 1;
mutex_unlock(&file->mut);
mutex_lock(&mut);
idr_remove(&ctx_idr, ctx->id);
mutex_unlock(&mut);
+ flush_workqueue(file->close_wq);
+ /* At this point the ctx has been marked as destroying and the
+ * workqueue has been flushed, so we are safe from any in-flight
+ * handlers that might queue another closing task.
+ */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ /* rdma_destroy_id ensures that no event handlers are
+ * in flight for the id before releasing it.
+ */
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
ucma_free_ctx(ctx);
mutex_lock(&file->mut);
}
mutex_unlock(&file->mut);
+ destroy_workqueue(file->close_wq);
kfree(file);
return 0;
}
@@ -1371,11 +1708,11 @@ static const struct file_operations ucma_fops = {
};
static struct miscdevice ucma_misc = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "rdma_cm",
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rdma_cm",
.nodename = "infiniband/rdma_cm",
.mode = 0666,
- .fops = &ucma_fops,
+ .fops = &ucma_fops,
};
static ssize_t show_abi_version(struct device *dev,
@@ -1396,7 +1733,7 @@ static int __init ucma_init(void)
ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
if (ret) {
- printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
+ pr_err("rdma_ucm: couldn't create abi_version attr\n");
goto err1;
}
@@ -1411,6 +1748,7 @@ static void __exit ucma_cleanup(void)
device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
misc_deregister(&ucma_misc);
idr_destroy(&ctx_idr);
+ idr_destroy(&multicast_idr);
}
module_init(ucma_init);
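/*
 * Illustrative sketch, not part of the patch: the deferred-close pattern the
 * ucma hunks above introduce for device removal.  An ordered, reclaim-safe
 * workqueue (per open file in the real code) serializes the destruction of
 * IDs, and flush_workqueue() lets the explicit destroy path wait for any
 * in-flight close work.  All example_* names are hypothetical and the two
 * structures are collapsed into one for brevity.
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct example_id {
	struct workqueue_struct *close_wq;
	struct work_struct close_work;
	int closing;
};

static void example_close_id(struct work_struct *work)
{
	struct example_id *id =
		container_of(work, struct example_id, close_work);

	/* Release the underlying HW resources for "id" here; the object
	 * itself stays alive until its creator explicitly destroys it. */
	(void)id;
}

static int example_setup(struct example_id *id)
{
	/* WQ_MEM_RECLAIM guarantees forward progress under memory pressure. */
	id->close_wq = alloc_ordered_workqueue("example_close_id",
					       WQ_MEM_RECLAIM);
	if (!id->close_wq)
		return -ENOMEM;
	INIT_WORK(&id->close_work, example_close_id);
	return 0;
}

static void example_on_device_removal(struct example_id *id)
{
	/* Event-handler side: mark the id and defer the close instead of
	 * destroying it inline. */
	id->closing = 1;
	queue_work(id->close_wq, &id->close_work);
}

static void example_teardown(struct example_id *id)
{
	/* Destroy path: wait for any queued close work, then tear down. */
	flush_workqueue(id->close_wq);
	destroy_workqueue(id->close_wq);
}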
diff --git a/sys/ofed/drivers/infiniband/core/ud_header.c b/sys/ofed/drivers/infiniband/core/ud_header.c
index 051d3bd3d4b60..48183d8d95883 100644
--- a/sys/ofed/drivers/infiniband/core/ud_header.c
+++ b/sys/ofed/drivers/infiniband/core/ud_header.c
@@ -33,11 +33,12 @@
#include <linux/errno.h>
#include <linux/string.h>
-#include <linux/module.h>
#include <linux/if_ether.h>
#include <rdma/ib_pack.h>
+#include <machine/in_cksum.h>
+
#define STRUCT_FIELD(header, field) \
.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
.struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
@@ -116,6 +117,72 @@ static const struct ib_field vlan_table[] = {
.size_bits = 16 }
};
+static const struct ib_field ip4_table[] = {
+ { STRUCT_FIELD(ip4, ver),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, hdr_len),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, tos),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, tot_len),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, id),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, frag_off),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, ttl),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, protocol),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, check),
+ .offset_words = 2,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, saddr),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(ip4, daddr),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 32 }
+};
+
+static const struct ib_field udp_table[] = {
+ { STRUCT_FIELD(udp, sport),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, dport),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, csum),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
static const struct ib_field grh_table[] = {
{ STRUCT_FIELD(grh, ip_version),
.offset_words = 0,
@@ -213,28 +280,61 @@ static const struct ib_field deth_table[] = {
.size_bits = 24 }
};
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header)
+{
+ struct ip iph;
+
+ iph.ip_hl = 5;
+ iph.ip_v = 4;
+ iph.ip_tos = header->ip4.tos;
+ iph.ip_len = header->ip4.tot_len;
+ iph.ip_id = header->ip4.id;
+ iph.ip_off = header->ip4.frag_off;
+ iph.ip_ttl = header->ip4.ttl;
+ iph.ip_p = header->ip4.protocol;
+ iph.ip_sum = 0;
+ iph.ip_src.s_addr = header->ip4.saddr;
+ iph.ip_dst.s_addr = header->ip4.daddr;
+
+ return in_cksum_hdr(&iph);
+}
+EXPORT_SYMBOL(ib_ud_ip4_csum);
+
/**
* ib_ud_header_init - Initialize UD header structure
* @payload_bytes:Length of packet payload
* @lrh_present: specify if LRH is present
* @eth_present: specify if Eth header is present
* @vlan_present: packet is tagged vlan
- * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @grh_present: GRH flag (if non-zero, GRH will be included)
+ * @ip_version: if non-zero, IP header, V4 or V6, will be included
+ * @udp_present: if non-zero, UDP header will be included
* @immediate_present: specify if immediate data is present
* @header:Structure to initialize
*/
-void ib_ud_header_init(int payload_bytes,
- int lrh_present,
- int eth_present,
- int vlan_present,
- int grh_present,
- int immediate_present,
- struct ib_ud_header *header)
+int ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int ip_version,
+ int udp_present,
+ int immediate_present,
+ struct ib_ud_header *header)
{
+ size_t udp_bytes = udp_present ? IB_UDP_BYTES : 0;
+
+ grh_present = grh_present && !ip_version;
memset(header, 0, sizeof *header);
+ /*
+ * UDP header without IP header doesn't make sense
+ */
+ if (udp_present && ip_version != 4 && ip_version != 6)
+ return -EINVAL;
+
if (lrh_present) {
- u16 packet_length = 0;
+ u16 packet_length;
header->lrh.link_version = 0;
header->lrh.link_next_header =
@@ -250,18 +350,39 @@ void ib_ud_header_init(int payload_bytes,
}
if (vlan_present)
- header->eth.type = cpu_to_be16(ETH_P_8021Q);
+ header->eth.type = cpu_to_be16(ETH_P_8021Q);
+
+ if (ip_version == 6 || grh_present) {
+ header->grh.ip_version = 6;
+ header->grh.payload_length =
+ cpu_to_be16((udp_bytes +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) & ~3); /* round up */
+ header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b;
+ }
- if (grh_present) {
- header->grh.ip_version = 6;
- header->grh.payload_length =
- cpu_to_be16((IB_BTH_BYTES +
+ if (ip_version == 4) {
+ header->ip4.ver = 4; /* version 4 */
+ header->ip4.hdr_len = 5; /* 5 words */
+ header->ip4.tot_len =
+ cpu_to_be16(IB_IP4_BYTES +
+ udp_bytes +
+ IB_BTH_BYTES +
IB_DETH_BYTES +
payload_bytes +
- 4 + /* ICRC */
- 3) & ~3); /* round up */
- header->grh.next_header = 0x1b;
+ 4); /* ICRC */
+ header->ip4.protocol = IPPROTO_UDP;
}
+ if (udp_present && ip_version)
+ header->udp.length =
+ cpu_to_be16(IB_UDP_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4); /* ICRC */
if (immediate_present)
header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
@@ -273,8 +394,11 @@ void ib_ud_header_init(int payload_bytes,
header->lrh_present = lrh_present;
header->eth_present = eth_present;
header->vlan_present = vlan_present;
- header->grh_present = grh_present;
+ header->grh_present = grh_present || (ip_version == 6);
+ header->ipv4_present = ip_version == 4;
+ header->udp_present = udp_present;
header->immediate_present = immediate_present;
+ return 0;
}
EXPORT_SYMBOL(ib_ud_header_init);
@@ -293,35 +417,45 @@ int ib_ud_header_pack(struct ib_ud_header *header,
if (header->lrh_present) {
ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
- &header->lrh, buf + len);
+ &header->lrh, (char *)buf + len);
len += IB_LRH_BYTES;
}
if (header->eth_present) {
ib_pack(eth_table, ARRAY_SIZE(eth_table),
- &header->eth, buf + len);
+ &header->eth, (char *)buf + len);
len += IB_ETH_BYTES;
}
if (header->vlan_present) {
ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
- &header->vlan, buf + len);
+ &header->vlan, (char *)buf + len);
len += IB_VLAN_BYTES;
}
if (header->grh_present) {
ib_pack(grh_table, ARRAY_SIZE(grh_table),
- &header->grh, buf + len);
+ &header->grh, (char *)buf + len);
len += IB_GRH_BYTES;
}
+ if (header->ipv4_present) {
+ ib_pack(ip4_table, ARRAY_SIZE(ip4_table),
+ &header->ip4, (char *)buf + len);
+ len += IB_IP4_BYTES;
+ }
+ if (header->udp_present) {
+ ib_pack(udp_table, ARRAY_SIZE(udp_table),
+ &header->udp, (char *)buf + len);
+ len += IB_UDP_BYTES;
+ }
ib_pack(bth_table, ARRAY_SIZE(bth_table),
- &header->bth, buf + len);
+ &header->bth, (char *)buf + len);
len += IB_BTH_BYTES;
ib_pack(deth_table, ARRAY_SIZE(deth_table),
- &header->deth, buf + len);
+ &header->deth, (char *)buf + len);
len += IB_DETH_BYTES;
if (header->immediate_present) {
- memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+ memcpy((char *)buf + len, &header->immediate_data, sizeof header->immediate_data);
len += sizeof header->immediate_data;
}
@@ -342,11 +476,11 @@ int ib_ud_header_unpack(void *buf,
{
ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
buf, &header->lrh);
- buf += IB_LRH_BYTES;
+ buf = (char *)buf + IB_LRH_BYTES;
if (header->lrh.link_version != 0) {
- printk(KERN_WARNING "Invalid LRH.link_version %d\n",
- header->lrh.link_version);
+ pr_warn("Invalid LRH.link_version %d\n",
+ header->lrh.link_version);
return -EINVAL;
}
@@ -359,29 +493,29 @@ int ib_ud_header_unpack(void *buf,
header->grh_present = 1;
ib_unpack(grh_table, ARRAY_SIZE(grh_table),
buf, &header->grh);
- buf += IB_GRH_BYTES;
+ buf = (char *)buf + IB_GRH_BYTES;
if (header->grh.ip_version != 6) {
- printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
- header->grh.ip_version);
+ pr_warn("Invalid GRH.ip_version %d\n",
+ header->grh.ip_version);
return -EINVAL;
}
if (header->grh.next_header != 0x1b) {
- printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
- header->grh.next_header);
+ pr_warn("Invalid GRH.next_header 0x%02x\n",
+ header->grh.next_header);
return -EINVAL;
}
break;
default:
- printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
- header->lrh.link_next_header);
+ pr_warn("Invalid LRH.link_next_header %d\n",
+ header->lrh.link_next_header);
return -EINVAL;
}
ib_unpack(bth_table, ARRAY_SIZE(bth_table),
buf, &header->bth);
- buf += IB_BTH_BYTES;
+ buf = (char *)buf + IB_BTH_BYTES;
switch (header->bth.opcode) {
case IB_OPCODE_UD_SEND_ONLY:
@@ -391,20 +525,19 @@ int ib_ud_header_unpack(void *buf,
header->immediate_present = 1;
break;
default:
- printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
- header->bth.opcode);
+ pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode);
return -EINVAL;
}
if (header->bth.transport_header_version != 0) {
- printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
- header->bth.transport_header_version);
+ pr_warn("Invalid BTH.transport_header_version %d\n",
+ header->bth.transport_header_version);
return -EINVAL;
}
ib_unpack(deth_table, ARRAY_SIZE(deth_table),
buf, &header->deth);
- buf += IB_DETH_BYTES;
+ buf = (char *)buf + IB_DETH_BYTES;
if (header->immediate_present)
memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
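/*
 * Illustrative usage sketch, not part of the patch: building a UD header
 * with the extended ib_ud_header_init() above for an Ethernet + IPv4 + UDP
 * packet, then filling the IPv4 checksum with ib_ud_ip4_csum() before
 * packing.  example_build_header(), its buffer and payload size are
 * hypothetical; the address/port/BTH field setup is omitted.
 */
#include <rdma/ib_pack.h>

static int example_build_header(void *buf, int payload_bytes)
{
	struct ib_ud_header hdr;
	int ret;

	ret = ib_ud_header_init(payload_bytes,
				0,	/* no LRH */
				1,	/* Ethernet present */
				0,	/* no VLAN tag */
				0,	/* no GRH */
				4,	/* IPv4 header */
				1,	/* UDP header */
				0,	/* no immediate data */
				&hdr);
	if (ret)
		return ret;

	/* ... fill eth/ip4/udp/bth/deth fields as needed ... */

	/* The IPv4 checksum is computed over the finished header. */
	hdr.ip4.check = ib_ud_ip4_csum(&hdr);

	/* ib_ud_header_pack() returns the packed length in bytes. */
	return ib_ud_header_pack(&hdr, buf);
}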
diff --git a/sys/ofed/drivers/infiniband/core/umem.c b/sys/ofed/drivers/infiniband/core/umem.c
deleted file mode 100644
index 70fdea20e0257..0000000000000
--- a/sys/ofed/drivers/infiniband/core/umem.c
+++ /dev/null
@@ -1,445 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications. All rights reserved.
- * Copyright (c) 2005 Cisco Systems. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#define LINUXKPI_PARAM_PREFIX ibcore_
-
-#include <linux/mm.h>
-#include <linux/dma-mapping.h>
-#include <linux/sched.h>
-#include <linux/dma-attrs.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <sys/priv.h>
-#include <sys/resourcevar.h>
-#include <sys/vmmeter.h>
-#include <vm/vm_pageout.h>
-#include <vm/vm_map.h>
-#include "uverbs.h"
-
-#define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *))
-
-static int allow_weak_ordering;
-module_param_named(weak_ordering, allow_weak_ordering, int, 0444);
-MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory");
-
-static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem,
- struct ib_umem *umem, unsigned long addr,
- int dmasync, int invalidation_supported)
-{
- int ret;
- const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem;
- struct invalidation_ctx *invalidation_ctx = NULL;
-
- umem->ib_peer_mem = ib_peer_mem;
- if (invalidation_supported) {
- invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL);
- if (!invalidation_ctx) {
- ret = -ENOMEM;
- goto out;
- }
- umem->invalidation_ctx = invalidation_ctx;
- invalidation_ctx->umem = umem;
- mutex_lock(&ib_peer_mem->lock);
- invalidation_ctx->context_ticket =
- ib_peer_insert_context(ib_peer_mem, invalidation_ctx);
- /* unlock before calling get pages to prevent a dead-lock from the callback */
- mutex_unlock(&ib_peer_mem->lock);
- }
-
- ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1,
- &umem->sg_head,
- umem->peer_mem_client_context,
- invalidation_ctx ?
- (void *)invalidation_ctx->context_ticket : NULL);
-
- if (invalidation_ctx) {
- /* taking the lock back, checking that wasn't invalidated at that time */
- mutex_lock(&ib_peer_mem->lock);
- if (invalidation_ctx->peer_invalidated) {
- printk(KERN_ERR "peer_umem_get: pages were invalidated by peer\n");
- ret = -EINVAL;
- }
- }
-
- if (ret)
- goto out;
-
- umem->page_size = peer_mem->get_page_size
- (umem->peer_mem_client_context);
- if (umem->page_size <= 0)
- goto put_pages;
-
- umem->offset = addr & ((unsigned long)umem->page_size - 1);
- ret = peer_mem->dma_map(&umem->sg_head,
- umem->peer_mem_client_context,
- umem->context->device->dma_device,
- dmasync,
- &umem->nmap);
- if (ret)
- goto put_pages;
-
- ib_peer_mem->stats.num_reg_pages +=
- umem->nmap * (umem->page_size >> PAGE_SHIFT);
- ib_peer_mem->stats.num_alloc_mrs += 1;
- return umem;
-
-put_pages:
-
- peer_mem->put_pages(umem->peer_mem_client_context,
- &umem->sg_head);
-out:
- if (invalidation_ctx) {
- ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket);
- mutex_unlock(&umem->ib_peer_mem->lock);
- kfree(invalidation_ctx);
- }
-
- ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context,
- umem->peer_mem_srcu_key);
- kfree(umem);
- return ERR_PTR(ret);
-}
-
-static void peer_umem_release(struct ib_umem *umem)
-{
- struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem;
- const struct peer_memory_client *peer_mem = ib_peer_mem->peer_mem;
- struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx;
-
- if (invalidation_ctx) {
-
- int peer_callback;
- int inflight_invalidation;
- /* If we are not under peer callback we must take the lock before removing
- * core ticket from the tree and releasing its umem.
- * It will let any inflight callbacks to be ended safely.
- * If we are under peer callback or under error flow of reg_mr so that context
- * wasn't activated yet lock was already taken.
- */
- if (invalidation_ctx->func && !invalidation_ctx->peer_callback)
- mutex_lock(&ib_peer_mem->lock);
- ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket);
- /* make sure to check inflight flag after took the lock and remove from tree.
- * in addition, from that point using local variables for peer_callback and
- * inflight_invalidation as after the complete invalidation_ctx can't be accessed
- * any more as it may be freed by the callback.
- */
- peer_callback = invalidation_ctx->peer_callback;
- inflight_invalidation = invalidation_ctx->inflight_invalidation;
- if (inflight_invalidation)
- complete(&invalidation_ctx->comp);
- /* On peer callback lock is handled externally */
- if (!peer_callback)
- /* unlocking before put_pages */
- mutex_unlock(&ib_peer_mem->lock);
- /* in case under callback context or callback is pending let it free the invalidation context */
- if (!peer_callback && !inflight_invalidation)
- kfree(invalidation_ctx);
- }
-
- peer_mem->dma_unmap(&umem->sg_head,
- umem->peer_mem_client_context,
- umem->context->device->dma_device);
- peer_mem->put_pages(&umem->sg_head,
- umem->peer_mem_client_context);
-
- ib_peer_mem->stats.num_dereg_pages +=
- umem->nmap * (umem->page_size >> PAGE_SHIFT);
- ib_peer_mem->stats.num_dealloc_mrs += 1;
- ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context,
- umem->peer_mem_srcu_key);
- kfree(umem);
-
- return;
-
-}
-
-static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
-{
-
- vm_object_t object;
- struct scatterlist *sg;
- struct page *page;
- int i;
-
- object = NULL;
- if (umem->nmap > 0)
- ib_dma_unmap_sg(dev, umem->sg_head.sgl,
- umem->nmap,
- DMA_BIDIRECTIONAL);
- for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
- page = sg_page(sg);
- if (umem->writable && dirty) {
- if (object && object != page->object)
- VM_OBJECT_WUNLOCK(object);
- if (object != page->object) {
- object = page->object;
- VM_OBJECT_WLOCK(object);
- }
- vm_page_dirty(page);
- }
- }
- sg_free_table(&umem->sg_head);
- if (object)
- VM_OBJECT_WUNLOCK(object);
-
-}
-
-void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
- umem_invalidate_func_t func,
- void *cookie)
-{
- struct invalidation_ctx *invalidation_ctx = umem->invalidation_ctx;
-
- invalidation_ctx->func = func;
- invalidation_ctx->cookie = cookie;
-
- /* from that point any pending invalidations can be called */
- mutex_unlock(&umem->ib_peer_mem->lock);
- return;
-}
-EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
-/**
- * ib_umem_get - Pin and DMA map userspace memory.
- * @context: userspace context to pin memory for
- * @addr: userspace virtual address to start at
- * @size: length of region to pin
- * @access: IB_ACCESS_xxx flags for memory being pinned
- * @dmasync: flush in-flight DMA when the memory region is written
- */
-struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr,
- size_t size, int access, int dmasync,
- int invalidation_supported)
-{
-
- struct ib_umem *umem;
- struct proc *proc;
- pmap_t pmap;
- vm_offset_t end, last, start;
- vm_size_t npages;
- int error;
- int ret;
- int ents;
- int i;
- DEFINE_DMA_ATTRS(attrs);
- struct scatterlist *sg, *sg_list_start;
- int need_release = 0;
-
- error = priv_check(curthread, PRIV_VM_MLOCK);
- if (error)
- return ERR_PTR(-error);
-
- last = addr + size;
- start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
- end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
- if (last < addr || end < addr)
- return ERR_PTR(-EINVAL);
- npages = atop(end - start);
- if (npages > vm_page_max_wired)
- return ERR_PTR(-ENOMEM);
- umem = kzalloc(sizeof *umem, GFP_KERNEL);
- if (!umem)
- return ERR_PTR(-ENOMEM);
- proc = curthread->td_proc;
- PROC_LOCK(proc);
- if (ptoa(npages +
- pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
- lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
- PROC_UNLOCK(proc);
- kfree(umem);
- return ERR_PTR(-ENOMEM);
- }
- PROC_UNLOCK(proc);
- if (npages + vm_cnt.v_wire_count > vm_page_max_wired) {
- kfree(umem);
- return ERR_PTR(-EAGAIN);
- }
- error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
- VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
- (umem->writable ? VM_MAP_WIRE_WRITE : 0));
- if (error != KERN_SUCCESS) {
- kfree(umem);
- return ERR_PTR(-ENOMEM);
- }
-
- umem->context = context;
- umem->length = size;
- umem->offset = addr & ~PAGE_MASK;
- umem->page_size = PAGE_SIZE;
- umem->start = addr;
- /*
- * We ask for writable memory if any access flags other than
- * "remote read" are set. "Local write" and "remote write"
- * obviously require write access. "Remote atomic" can do
- * things like fetch and add, which will modify memory, and
- * "MW bind" can change permissions by binding a window.
- */
- umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
-
- if (invalidation_supported || context->peer_mem_private_data) {
-
- struct ib_peer_memory_client *peer_mem_client;
-
- peer_mem_client = ib_get_peer_client(context, addr, size,
- &umem->peer_mem_client_context,
- &umem->peer_mem_srcu_key);
- if (peer_mem_client)
- return peer_umem_get(peer_mem_client, umem, addr,
- dmasync, invalidation_supported);
- }
-
- umem->hugetlb = 0;
-
- pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
-
- if (npages == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
- if (ret)
- goto out;
-
- need_release = 1;
- sg_list_start = umem->sg_head.sgl;
-
- while (npages) {
-
- ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
- umem->npages += ents;
-
- for_each_sg(sg_list_start, sg, ents, i) {
- vm_paddr_t pa;
-
- pa = pmap_extract(pmap, start);
- if (pa == 0) {
- ret = -ENOMEM;
- goto out;
- }
- sg_set_page(sg, PHYS_TO_VM_PAGE(pa),
- PAGE_SIZE, 0);
- npages--;
- start += PAGE_SIZE;
- }
-
- /* preparing for next loop */
- sg_list_start = sg;
- }
-
- umem->nmap = ib_dma_map_sg_attrs(context->device,
- umem->sg_head.sgl,
- umem->npages,
- DMA_BIDIRECTIONAL,
- &attrs);
- if (umem->nmap != umem->npages) {
- ret = -ENOMEM;
- goto out;
- }
-
-out:
- if (ret < 0) {
- if (need_release)
- __ib_umem_release(context->device, umem, 0);
- kfree(umem);
- }
-
- return ret < 0 ? ERR_PTR(ret) : umem;
-}
-EXPORT_SYMBOL(ib_umem_get_ex);
-
-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
- size_t size, int access, int dmasync)
-{
- return ib_umem_get_ex(context, addr,
- size, access, dmasync, 0);
-}
-EXPORT_SYMBOL(ib_umem_get);
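
The removed ib_umem_get_ex() above first rounds the requested range out to page boundaries before wiring it: the start is rounded down with PAGE_MASK, the end is rounded up with roundup2(), atop() converts the byte span to a page count checked against the wiring limits, and the "writable" flag is derived from the access flags. A hedged, userspace-only sketch of that arithmetic (not part of this patch; PAGE_SIZE and the REMOTE_READ constant are assumed stand-ins, not the OFED API):

/*
 * Hypothetical stand-alone sketch of the bounds/page arithmetic used by
 * ib_umem_get_ex() above; constants and names are illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(uint64_t)(PAGE_SIZE - 1))
#define REMOTE_READ	0x4	/* stand-in for IB_ACCESS_REMOTE_READ */

int main(void)
{
	uint64_t addr = 0x10123, size = 10000;
	int access = REMOTE_READ | 0x1;			/* e.g. local write */
	uint64_t last = addr + size;
	uint64_t start = addr & PAGE_MASK;		/* round down */
	uint64_t end = (last + PAGE_SIZE - 1) & PAGE_MASK; /* roundup2() */
	uint64_t npages = (end - start) / PAGE_SIZE;	/* atop() */
	int writable = !!(access & ~REMOTE_READ);	/* anything but remote read */

	printf("start=0x%jx end=0x%jx npages=%ju writable=%d\n",
	    (uintmax_t)start, (uintmax_t)end, (uintmax_t)npages, writable);
	return 0;
}
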
-
-/**
- * ib_umem_release - release memory pinned with ib_umem_get
- * @umem: umem struct to release
- */
-void ib_umem_release(struct ib_umem *umem)
-{
-
- vm_offset_t addr, end, last, start;
- vm_size_t size;
- int error;
-
- if (umem->ib_peer_mem) {
- peer_umem_release(umem);
- return;
- }
-
- __ib_umem_release(umem->context->device, umem, 1);
-
- if (umem->context->closing) {
- kfree(umem);
- return;
- }
-
- error = priv_check(curthread, PRIV_VM_MUNLOCK);
-
- if (error)
- return;
-
- addr = umem->start;
- size = umem->length;
- last = addr + size;
- start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
- end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
- vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
- VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
- kfree(umem);
-
-}
-EXPORT_SYMBOL(ib_umem_release);
-
-int ib_umem_page_count(struct ib_umem *umem)
-{
- int shift;
- int i;
- int n;
- struct scatterlist *sg;
-
- shift = ilog2(umem->page_size);
-
- n = 0;
- for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
- n += sg_dma_len(sg) >> shift;
-
- return n;
-}
-EXPORT_SYMBOL(ib_umem_page_count);
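
ib_umem_page_count(), removed above with the rest of this file's old implementation, simply sums each mapped DMA segment's length in units of the umem page size. A userspace sketch of the same sum (segment lengths and the ilog2 helper below are illustrative, not the kernel API):

/* Hypothetical sketch of the removed ib_umem_page_count() arithmetic. */
#include <stdio.h>
#include <stdint.h>

static unsigned int my_ilog2(unsigned int v)	/* stand-in for ilog2() */
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const uint32_t seg_len[] = { 4096, 8192, 12288 };	/* sg_dma_len() */
	const unsigned int page_size = 4096;
	const unsigned int shift = my_ilog2(page_size);
	unsigned int i, n = 0;

	for (i = 0; i < 3; i++)
		n += seg_len[i] >> shift;

	printf("pages: %u\n", n);	/* 1 + 2 + 3 = 6 */
	return 0;
}
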
diff --git a/sys/ofed/drivers/infiniband/core/user_mad.c b/sys/ofed/drivers/infiniband/core/user_mad.c
index cc4a65913b982..4e579b131f57f 100644
--- a/sys/ofed/drivers/infiniband/core/user_mad.c
+++ b/sys/ofed/drivers/infiniband/core/user_mad.c
@@ -33,6 +33,8 @@
* SOFTWARE.
*/
+#define pr_fmt(fmt) "user_mad: " fmt
+
#include <linux/module.h>
#include <linux/device.h>
#include <linux/err.h>
@@ -79,10 +81,10 @@ enum {
*/
struct ib_umad_port {
- struct cdev *cdev;
+ struct cdev cdev;
struct device *dev;
- struct cdev *sm_cdev;
+ struct cdev sm_cdev;
struct device *sm_dev;
struct semaphore sm_sem;
@@ -93,12 +95,10 @@ struct ib_umad_port {
struct ib_umad_device *umad_dev;
int dev_num;
u8 port_num;
- struct list_head port_lst;
};
struct ib_umad_device {
- int start_port, end_port;
- struct kref ref;
+ struct kobject kobj;
struct ib_umad_port port[0];
};
@@ -131,85 +131,21 @@ static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
static DEFINE_SPINLOCK(port_lock);
static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
-static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
static void ib_umad_add_one(struct ib_device *device);
-static void ib_umad_remove_one(struct ib_device *device);
-
-static DEFINE_SPINLOCK(ports_list_lock);
-static struct list_head ports_list;
-
-
-static void remove_ports(struct kref *ref)
-{
- int i;
- struct ib_umad_port *p, *p1;
- struct ib_umad_device *dev =
- container_of(ref, struct ib_umad_device, ref);
-
- for (i = 0; i <= dev->end_port - dev->start_port; ++i) {
- struct ib_umad_port *port = &dev->port[i];
-
- list_for_each_entry_safe(p, p1, &ports_list, port_lst)
- if (p == port) {
- list_del(&p->port_lst);
- break;
- }
- }
-}
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
-static void put_umad_dev(struct kref *ref)
+static void ib_umad_release_dev(struct kobject *kobj)
{
- int ret, i;
struct ib_umad_device *dev =
- container_of(ref, struct ib_umad_device, ref);
+ container_of(kobj, struct ib_umad_device, kobj);
- spin_lock(&ports_list_lock);
- ret = (kref_put(ref, remove_ports));
- spin_unlock(&ports_list_lock);
- if (ret) {
- for (i = 0; i <= dev->end_port - dev->start_port; ++i) {
- if (dev->port[i].dev_num < IB_UMAD_MAX_PORTS)
- clear_bit(dev->port[i].dev_num, dev_map);
- else
- clear_bit(dev->port[i].dev_num - IB_UMAD_MAX_PORTS, overflow_map);
- cdev_del(dev->port[i].cdev);
- cdev_del(dev->port[i].sm_cdev);
- }
kfree(dev);
- }
-}
-
-static void release_port(struct ib_umad_port *port)
-{
- put_umad_dev(&port->umad_dev->ref);
-}
-
-
-static struct ib_umad_port *get_port(struct cdev *cdev)
-{
- struct ib_umad_port *port;
-
- spin_lock(&ports_list_lock);
- list_for_each_entry(port, &ports_list, port_lst) {
- if (port->cdev == cdev || port->sm_cdev == cdev) {
- kref_get(&port->umad_dev->ref);
- spin_unlock(&ports_list_lock);
-
- return port;
- }
- }
- spin_unlock(&ports_list_lock);
-
- return NULL;
}
-static void insert_port(struct ib_umad_port *port)
-{
- spin_lock(&ports_list_lock);
- list_add(&port->port_lst, &ports_list);
- spin_unlock(&ports_list_lock);
-}
+static struct kobj_type ib_umad_dev_ktype = {
+ .release = ib_umad_release_dev,
+};
static int hdr_size(struct ib_umad_file *file)
{
@@ -236,8 +172,8 @@ static int queue_packet(struct ib_umad_file *file,
packet->mad.hdr.id++)
if (agent == __get_agent(file, packet->mad.hdr.id)) {
list_add_tail(&packet->list, &file->recv_list);
- selwakeup(&file->filp->f_selinfo);
wake_up_interruptible(&file->recv_wait);
+ linux_poll_wakeup(file->filp);
ret = 0;
break;
}
@@ -275,6 +211,7 @@ static void send_handler(struct ib_mad_agent *agent,
}
static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc)
{
struct ib_umad_file *file = agent->context;
@@ -327,20 +264,23 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
{
struct ib_mad_recv_buf *recv_buf;
int left, seg_payload, offset, max_seg_payload;
+ size_t seg_size;
- /* We need enough room to copy the first (or only) MAD segment. */
recv_buf = &packet->recv_wc->recv_buf;
- if ((packet->length <= sizeof (*recv_buf->mad) &&
+ seg_size = packet->recv_wc->mad_seg_size;
+
+ /* We need enough room to copy the first (or only) MAD segment. */
+ if ((packet->length <= seg_size &&
count < hdr_size(file) + packet->length) ||
- (packet->length > sizeof (*recv_buf->mad) &&
- count < hdr_size(file) + sizeof (*recv_buf->mad)))
+ (packet->length > seg_size &&
+ count < hdr_size(file) + seg_size))
return -EINVAL;
if (copy_to_user(buf, &packet->mad, hdr_size(file)))
return -EFAULT;
buf += hdr_size(file);
- seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+ seg_payload = min_t(int, packet->length, seg_size);
if (copy_to_user(buf, recv_buf->mad, seg_payload))
return -EFAULT;
@@ -357,14 +297,14 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
return -ENOSPC;
}
offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
- max_seg_payload = sizeof (struct ib_mad) - offset;
+ max_seg_payload = seg_size - offset;
for (left = packet->length - seg_payload, buf += seg_payload;
left; left -= seg_payload, buf += seg_payload) {
recv_buf = container_of(recv_buf->list.next,
struct ib_mad_recv_buf, list);
seg_payload = min(left, max_seg_payload);
- if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+ if (copy_to_user(buf, (char *)recv_buf->mad + offset,
seg_payload))
return -EFAULT;
}
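
The reworked copy_recv_mad() above copies the first segment up to the per-WC mad_seg_size, then walks the remaining receive buffers, each carrying at most seg_size - offset payload bytes. A hedged, userspace-only sketch of just that chunking arithmetic (all values below are made up for illustration):

/* Hypothetical sketch of the segment-count arithmetic in copy_recv_mad(). */
#include <stdio.h>

int main(void)
{
	int length = 1000;		/* total payload (packet->length) */
	int seg_size = 256;		/* mad_seg_size */
	int offset = 40;		/* per-class data offset */
	int max_seg_payload = seg_size - offset;
	int seg_payload = length < seg_size ? length : seg_size;
	int left, nseg = 1;

	for (left = length - seg_payload; left > 0; left -= seg_payload) {
		seg_payload = left < max_seg_payload ? left : max_seg_payload;
		nseg++;
	}
	printf("segments copied: %d\n", nseg);
	return 0;
}
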
@@ -445,7 +385,7 @@ static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
/* Copy class specific header */
if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
- copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+ copy_from_user((char *)msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
msg->hdr_len - IB_MGMT_RMPP_HDR))
return -EFAULT;
@@ -490,11 +430,11 @@ static int is_duplicate(struct ib_umad_file *file,
* the same TID, reject the second as a duplicate. This is more
* restrictive than required by the spec.
*/
- if (!ib_response_mad((struct ib_mad *) hdr)) {
- if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ if (!ib_response_mad(hdr)) {
+ if (!ib_response_mad(sent_hdr))
return 1;
continue;
- } else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ } else if (!ib_response_mad(sent_hdr))
continue;
if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
@@ -515,6 +455,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
struct ib_rmpp_mad *rmpp_mad;
__be64 *tid;
int ret, data_len, hdr_len, copy_offset, rmpp_active;
+ u8 base_version;
if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
return -EINVAL;
@@ -557,8 +498,8 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
ah_attr.ah_flags = IB_AH_GRH;
memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
ah_attr.grh.sgid_index = packet->mad.hdr.gid_index;
- ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label);
- ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit;
+ ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label);
+ ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit;
ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class;
}
@@ -570,35 +511,39 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
- if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
- copy_offset = IB_MGMT_MAD_HDR;
- rmpp_active = 0;
- } else {
+
+ if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && ib_mad_kernel_rmpp_agent(agent)) {
copy_offset = IB_MGMT_RMPP_HDR;
rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
- IB_MGMT_RMPP_FLAG_ACTIVE;
+ IB_MGMT_RMPP_FLAG_ACTIVE;
+ } else {
+ copy_offset = IB_MGMT_MAD_HDR;
+ rmpp_active = 0;
}
+ base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version;
data_len = count - hdr_size(file) - hdr_len;
packet->msg = ib_create_send_mad(agent,
be32_to_cpu(packet->mad.hdr.qpn),
packet->mad.hdr.pkey_index, rmpp_active,
- hdr_len, data_len, GFP_KERNEL);
+ hdr_len, data_len, GFP_KERNEL,
+ base_version);
if (IS_ERR(packet->msg)) {
ret = PTR_ERR(packet->msg);
goto err_ah;
}
- packet->msg->ah = ah;
+ packet->msg->ah = ah;
packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
- packet->msg->retries = packet->mad.hdr.retries;
+ packet->msg->retries = packet->mad.hdr.retries;
packet->msg->context[0] = packet;
/* Copy MAD header. Any RMPP header is already in place. */
memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
if (!rmpp_active) {
- if (copy_from_user(packet->msg->mad + copy_offset,
+ if (copy_from_user((char *)packet->msg->mad + copy_offset,
buf + copy_offset,
hdr_len + data_len - copy_offset)) {
ret = -EFAULT;
@@ -622,14 +567,22 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
rmpp_mad->mad_hdr.tid = *tid;
}
- spin_lock_irq(&file->send_lock);
- ret = is_duplicate(file, packet);
- if (!ret)
+ if (!ib_mad_kernel_rmpp_agent(agent)
+ && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ spin_lock_irq(&file->send_lock);
list_add_tail(&packet->list, &file->send_list);
- spin_unlock_irq(&file->send_lock);
- if (ret) {
- ret = -EINVAL;
- goto err_msg;
+ spin_unlock_irq(&file->send_lock);
+ } else {
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
}
ret = ib_post_send_mad(packet->msg, NULL);
@@ -680,6 +633,8 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
mutex_lock(&file->mutex);
if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid device\n");
ret = -EPIPE;
goto out;
}
@@ -690,6 +645,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
}
if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid QPN %d specified\n",
+ ureq.qpn);
ret = -EINVAL;
goto out;
}
@@ -698,11 +656,15 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
if (!__get_agent(file, agent_id))
goto found;
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
ret = -ENOMEM;
goto out;
found:
if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
req.mgmt_class = ureq.mgmt_class;
req.mgmt_class_version = ureq.mgmt_class_version;
memcpy(req.oui, ureq.oui, sizeof req.oui);
@@ -723,7 +685,7 @@ found:
ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
ureq.mgmt_class ? &req : NULL,
ureq.rmpp_version,
- send_handler, recv_handler, file);
+ send_handler, recv_handler, file, 0);
if (IS_ERR(agent)) {
ret = PTR_ERR(agent);
agent = NULL;
@@ -731,7 +693,7 @@ found:
}
if (put_user(agent_id,
- (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+ (u32 __user *) ((char *)arg + offsetof(struct ib_user_mad_reg_req, id)))) {
ret = -EFAULT;
goto out;
}
@@ -739,10 +701,11 @@ found:
if (!file->already_used) {
file->already_used = 1;
if (!file->use_pkey_index) {
- printk(KERN_WARNING "user_mad: process %s did not enable "
- "P_Key index support.\n", curthread->td_proc->p_comm);
- printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt "
- "has info on the new ABI.\n");
+ dev_warn(file->port->dev,
+ "process %s did not enable P_Key index support.\n",
+ current->comm);
+ dev_warn(file->port->dev,
+ " Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
}
}
@@ -760,6 +723,120 @@ out:
return ret;
}
+static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
+{
+ struct ib_user_mad_reg_req2 ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid device\n");
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof(ureq))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid QPN %d specified\n",
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
+ const u32 flags = IB_USER_MAD_REG_FLAGS_CAP;
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+ ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+ ret = -EINVAL;
+
+ if (put_user(flags,
+ (u32 __user *) ((char *)arg + offsetof(struct
+ ib_user_mad_reg_req2, flags))))
+ ret = -EFAULT;
+
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ if (ureq.oui & 0xff000000) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
+ ureq.oui);
+ ret = -EINVAL;
+ goto out;
+ }
+ req.oui[2] = ureq.oui & 0x0000ff;
+ req.oui[1] = (ureq.oui & 0x00ff00) >> 8;
+ req.oui[0] = (ureq.oui & 0xff0000) >> 16;
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof(req.method_mask));
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file,
+ ureq.flags);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *)((char *)arg +
+ offsetof(struct ib_user_mad_reg_req2, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ file->use_pkey_index = 1;
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+
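
ib_umad_reg_agent2() above rejects an OUI that does not fit in 24 bits and then splits it into the three-byte req.oui[] array, most significant byte first. A small userspace sketch of that unpacking (the example OUI value is arbitrary):

/* Hypothetical sketch of the OUI validation and byte split shown above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t oui = 0x0002c9;	/* example value; top byte must be zero */
	uint8_t b[3];

	if (oui & 0xff000000)		/* mirrors the validity check above */
		return 1;

	b[2] = oui & 0x0000ff;
	b[1] = (oui & 0x00ff00) >> 8;
	b[0] = (oui & 0xff0000) >> 16;

	printf("%02x:%02x:%02x\n", b[0], b[1], b[2]);
	return 0;
}
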
static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
{
struct ib_mad_agent *agent = NULL;
@@ -815,6 +892,8 @@ static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
case IB_USER_MAD_ENABLE_PKEY:
return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, (void __user *) arg);
default:
return -ENOIOCTLCMD;
}
@@ -831,6 +910,8 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
case IB_USER_MAD_ENABLE_PKEY:
return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg));
default:
return -ENOIOCTLCMD;
}
@@ -850,26 +931,19 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
{
struct ib_umad_port *port;
struct ib_umad_file *file;
- int ret;
+ int ret = -ENXIO;
- port = get_port(inode->i_cdev->si_drv1);
- if (!port)
- return -ENXIO;
+ port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, cdev);
mutex_lock(&port->file_mutex);
- if (!port->ib_dev) {
- release_port(port);
- ret = -ENXIO;
+ if (!port->ib_dev)
goto out;
- }
+ ret = -ENOMEM;
file = kzalloc(sizeof *file, GFP_KERNEL);
- if (!file) {
- release_port(port);
- ret = -ENOMEM;
+ if (!file)
goto out;
- }
mutex_init(&file->mutex);
spin_lock_init(&file->send_lock);
@@ -884,6 +958,13 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
list_add_tail(&file->port_list, &port->file_list);
ret = nonseekable_open(inode, filp);
+ if (ret) {
+ list_del(&file->port_list);
+ kfree(file);
+ goto out;
+ }
+
+ kobject_get(&port->umad_dev->kobj);
out:
mutex_unlock(&port->file_mutex);
@@ -893,7 +974,7 @@ out:
static int ib_umad_close(struct inode *inode, struct file *filp)
{
struct ib_umad_file *file = filp->private_data;
- struct ib_umad_port *port = file->port;
+ struct ib_umad_device *dev = file->port->umad_dev;
struct ib_umad_packet *packet, *tmp;
int already_dead;
int i;
@@ -922,21 +1003,21 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
mutex_unlock(&file->port->file_mutex);
kfree(file);
- release_port(port);
+ kobject_put(&dev->kobj);
return 0;
}
static const struct file_operations umad_fops = {
- .owner = THIS_MODULE,
- .read = ib_umad_read,
- .write = ib_umad_write,
- .poll = ib_umad_poll,
+ .owner = THIS_MODULE,
+ .read = ib_umad_read,
+ .write = ib_umad_write,
+ .poll = ib_umad_poll,
.unlocked_ioctl = ib_umad_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = ib_umad_compat_ioctl,
+ .compat_ioctl = ib_umad_compat_ioctl,
#endif
- .open = ib_umad_open,
+ .open = ib_umad_open,
.release = ib_umad_close,
.llseek = no_llseek,
};
@@ -949,9 +1030,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
};
int ret;
- port = get_port(inode->i_cdev->si_drv1);
- if (!port)
- return -ENXIO;
+ port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, sm_cdev);
if (filp->f_flags & O_NONBLOCK) {
if (down_trylock(&port->sm_sem)) {
@@ -966,17 +1045,27 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
}
ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
- if (ret) {
- up(&port->sm_sem);
- goto fail;
- }
+ if (ret)
+ goto err_up_sem;
filp->private_data = port;
- return nonseekable_open(inode, filp);
+ ret = nonseekable_open(inode, filp);
+ if (ret)
+ goto err_clr_sm_cap;
+
+ kobject_get(&port->umad_dev->kobj);
+
+ return 0;
+
+err_clr_sm_cap:
+ swap(props.set_port_cap_mask, props.clr_port_cap_mask);
+ ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+
+err_up_sem:
+ up(&port->sm_sem);
fail:
- release_port(port);
return ret;
}
@@ -995,14 +1084,14 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
up(&port->sm_sem);
- release_port(port);
+ kobject_put(&port->umad_dev->kobj);
return ret;
}
static const struct file_operations umad_sm_fops = {
- .owner = THIS_MODULE,
- .open = ib_umad_sm_open,
+ .owner = THIS_MODULE,
+ .open = ib_umad_sm_open,
.release = ib_umad_sm_close,
.llseek = no_llseek,
};
@@ -1037,14 +1126,12 @@ static ssize_t show_port(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
-static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf)
-{
- return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
-}
-static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_MAD_ABI_VERSION));
static dev_t overflow_maj;
-static int find_overflow_devnum(void)
+static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
+static int find_overflow_devnum(struct ib_device *device)
{
int ret;
@@ -1052,7 +1139,8 @@ static int find_overflow_devnum(void)
ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
"infiniband_mad");
if (ret) {
- printk(KERN_ERR "user_mad: couldn't register dynamic device number\n");
+ dev_err(&device->dev,
+ "couldn't register dynamic device number\n");
return ret;
}
}
@@ -1065,6 +1153,7 @@ static int find_overflow_devnum(void)
}
static int ib_umad_init_port(struct ib_device *device, int port_num,
+ struct ib_umad_device *umad_dev,
struct ib_umad_port *port)
{
int devnum;
@@ -1074,9 +1163,9 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
if (devnum >= IB_UMAD_MAX_PORTS) {
spin_unlock(&port_lock);
- devnum = find_overflow_devnum();
+ devnum = find_overflow_devnum(device);
if (devnum < 0)
- return -1;
+ return -1;
spin_lock(&port_lock);
port->dev_num = devnum + IB_UMAD_MAX_PORTS;
@@ -1095,18 +1184,15 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
mutex_init(&port->file_mutex);
INIT_LIST_HEAD(&port->file_list);
- port->cdev = cdev_alloc();
- if (!port->cdev)
- goto err_cdev_c;
-
- port->cdev->ops = &umad_fops;
- port->cdev->owner = THIS_MODULE;
- kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num);
- if (cdev_add(port->cdev, base, 1))
+ cdev_init(&port->cdev, &umad_fops);
+ port->cdev.owner = THIS_MODULE;
+ port->cdev.kobj.parent = &umad_dev->kobj;
+ kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
+ if (cdev_add(&port->cdev, base, 1))
goto err_cdev;
port->dev = device_create(umad_class, device->dma_device,
- port->cdev->dev, port,
+ port->cdev.dev, port,
"umad%d", port->dev_num);
if (IS_ERR(port->dev))
goto err_cdev;
@@ -1117,18 +1203,15 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
goto err_dev;
base += IB_UMAD_MAX_PORTS;
- port->sm_cdev = cdev_alloc();
- if (!port->sm_cdev)
- goto err_dev;
-
- port->sm_cdev->ops = &umad_sm_fops;
- port->sm_cdev->owner = THIS_MODULE;
- kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num);
- if (cdev_add(port->sm_cdev, base, 1))
+ cdev_init(&port->sm_cdev, &umad_sm_fops);
+ port->sm_cdev.owner = THIS_MODULE;
+ port->sm_cdev.kobj.parent = &umad_dev->kobj;
+ kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
+ if (cdev_add(&port->sm_cdev, base, 1))
goto err_sm_cdev;
port->sm_dev = device_create(umad_class, device->dma_device,
- port->sm_cdev->dev, port,
+ port->sm_cdev.dev, port,
"issm%d", port->dev_num);
if (IS_ERR(port->sm_dev))
goto err_sm_cdev;
@@ -1141,17 +1224,16 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
return 0;
err_sm_dev:
- device_destroy(umad_class, port->sm_cdev->dev);
+ device_destroy(umad_class, port->sm_cdev.dev);
err_sm_cdev:
- cdev_del(port->sm_cdev);
+ cdev_del(&port->sm_cdev);
err_dev:
- device_destroy(umad_class, port->cdev->dev);
+ device_destroy(umad_class, port->cdev.dev);
err_cdev:
- cdev_del(port->cdev);
-err_cdev_c:
+ cdev_del(&port->cdev);
if (port->dev_num < IB_UMAD_MAX_PORTS)
clear_bit(devnum, dev_map);
else
@@ -1168,8 +1250,11 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
dev_set_drvdata(port->dev, NULL);
dev_set_drvdata(port->sm_dev, NULL);
- device_destroy(umad_class, port->cdev->dev);
- device_destroy(umad_class, port->sm_cdev->dev);
+ device_destroy(umad_class, port->cdev.dev);
+ device_destroy(umad_class, port->sm_cdev.dev);
+
+ cdev_del(&port->cdev);
+ cdev_del(&port->sm_cdev);
mutex_lock(&port->file_mutex);
@@ -1186,22 +1271,21 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
}
mutex_unlock(&port->file_mutex);
+
+ if (port->dev_num < IB_UMAD_MAX_PORTS)
+ clear_bit(port->dev_num, dev_map);
+ else
+ clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map);
}
static void ib_umad_add_one(struct ib_device *device)
{
struct ib_umad_device *umad_dev;
int s, e, i;
+ int count = 0;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
- return;
-
- if (device->node_type == RDMA_NODE_IB_SWITCH)
- s = e = 0;
- else {
- s = 1;
- e = device->phys_port_cnt;
- }
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
umad_dev = kzalloc(sizeof *umad_dev +
(e - s + 1) * sizeof (struct ib_umad_port),
@@ -1209,44 +1293,53 @@ static void ib_umad_add_one(struct ib_device *device)
if (!umad_dev)
return;
- kref_init(&umad_dev->ref);
-
- umad_dev->start_port = s;
- umad_dev->end_port = e;
-
- for (i = 0; i <= e - s; ++i)
- insert_port(&umad_dev->port[i]);
+ kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
for (i = s; i <= e; ++i) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
umad_dev->port[i - s].umad_dev = umad_dev;
- if (ib_umad_init_port(device, i, &umad_dev->port[i - s]))
- goto err;
+ if (ib_umad_init_port(device, i, umad_dev,
+ &umad_dev->port[i - s]))
+ goto err;
+
+ count++;
}
+ if (!count)
+ goto free;
+
ib_set_client_data(device, &umad_client, umad_dev);
return;
err:
- while (--i >= s)
- ib_umad_kill_port(&umad_dev->port[i - s]);
+ while (--i >= s) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
- put_umad_dev(&umad_dev->ref);
+ ib_umad_kill_port(&umad_dev->port[i - s]);
+ }
+free:
+ kobject_put(&umad_dev->kobj);
}
-static void ib_umad_remove_one(struct ib_device *device)
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+ struct ib_umad_device *umad_dev = client_data;
int i;
if (!umad_dev)
return;
- for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
+ for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) {
+ if (rdma_cap_ib_mad(device, i + rdma_start_port(device)))
ib_umad_kill_port(&umad_dev->port[i]);
+ }
- put_umad_dev(&umad_dev->ref);
+ kobject_put(&umad_dev->kobj);
}
static char *umad_devnode(struct device *dev, umode_t *mode)
@@ -1258,33 +1351,31 @@ static int __init ib_umad_init(void)
{
int ret;
- INIT_LIST_HEAD(&ports_list);
-
ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
"infiniband_mad");
if (ret) {
- printk(KERN_ERR "user_mad: couldn't register device number\n");
+ pr_err("couldn't register device number\n");
goto out;
}
umad_class = class_create(THIS_MODULE, "infiniband_mad");
if (IS_ERR(umad_class)) {
ret = PTR_ERR(umad_class);
- printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+ pr_err("couldn't create class infiniband_mad\n");
goto out_chrdev;
}
umad_class->devnode = umad_devnode;
- ret = class_create_file(umad_class, &class_attr_abi_version);
+ ret = class_create_file(umad_class, &class_attr_abi_version.attr);
if (ret) {
- printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+ pr_err("couldn't create abi_version attribute\n");
goto out_class;
}
ret = ib_register_client(&umad_client);
if (ret) {
- printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+ pr_err("couldn't register ib_umad client\n");
goto out_class;
}
@@ -1309,5 +1400,5 @@ static void __exit ib_umad_cleanup(void)
unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
}
-module_init(ib_umad_init);
+module_init_order(ib_umad_init, SI_ORDER_THIRD);
module_exit(ib_umad_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs.h b/sys/ofed/drivers/infiniband/core/uverbs.h
index 8ca6498f8f672..f905a17b67419 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs.h
+++ b/sys/ofed/drivers/infiniband/core/uverbs.h
@@ -42,13 +42,29 @@
#include <linux/mutex.h>
#include <linux/completion.h>
#include <linux/cdev.h>
+#include <linux/srcu.h>
+#include <linux/rcupdate.h>
#include <linux/rbtree.h>
#include <rdma/ib_verbs.h>
-#include <rdma/ib_verbs_exp.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_user_verbs.h>
-#include <rdma/ib_user_verbs_exp.h>
+
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->inbuf = (const void __user *) (ibuf); \
+ (udata)->outbuf = (void __user *) (obuf); \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
+
+#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \
+ (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
/*
* Our lifetime rules for these structs are the following:
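
The INIT_UDATA()/INIT_UDATA_BUF_OR_NULL() helpers introduced above only record the user-space buffer pointers and lengths for a command; they do no copying themselves. A userspace sketch with a simplified stand-in for struct ib_udata (the struct and names here are illustrative, not the real definition):

/* Hypothetical sketch of what INIT_UDATA() records. */
#include <stdio.h>
#include <stddef.h>

#define __user			/* no-op outside the kernel */

struct udata_sketch {		/* simplified stand-in for struct ib_udata */
	const void __user *inbuf;
	void __user	   *outbuf;
	size_t		    inlen;
	size_t		    outlen;
};

#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
	do {								\
		(udata)->inbuf	= (const void __user *)(ibuf);		\
		(udata)->outbuf = (void __user *)(obuf);		\
		(udata)->inlen	= (ilen);				\
		(udata)->outlen = (olen);				\
	} while (0)

int main(void)
{
	char in[64], out[32];
	struct udata_sketch u;

	INIT_UDATA(&u, in, out, sizeof(in), sizeof(out));
	printf("inlen=%zu outlen=%zu\n", u.inlen, u.outlen);
	return 0;
}
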
@@ -72,15 +88,20 @@
*/
struct ib_uverbs_device {
- struct kref ref;
+ atomic_t refcount;
int num_comp_vectors;
struct completion comp;
struct device *dev;
- struct ib_device *ib_dev;
+ struct ib_device __rcu *ib_dev;
int devnum;
struct cdev cdev;
struct rb_root xrcd_tree;
struct mutex xrcd_tree_mutex;
+ struct kobject kobj;
+ struct srcu_struct disassociate_srcu;
+ struct mutex lists_mutex; /* protect lists */
+ struct list_head uverbs_file_list;
+ struct list_head uverbs_events_file_list;
};
struct ib_uverbs_event_file {
@@ -93,15 +114,19 @@ struct ib_uverbs_event_file {
wait_queue_head_t poll_wait;
struct fasync_struct *async_queue;
struct list_head event_list;
+ struct list_head list;
};
struct ib_uverbs_file {
struct kref ref;
struct mutex mutex;
+ struct mutex cleanup_mutex; /* protect cleanup */
struct ib_uverbs_device *device;
struct ib_ucontext *ucontext;
struct ib_event_handler event_handler;
struct ib_uverbs_event_file *async_file;
+ struct list_head list;
+ int is_closed;
};
struct ib_uverbs_event {
@@ -142,6 +167,10 @@ struct ib_uqp_object {
struct ib_uxrcd_object *uxrcd;
};
+struct ib_uwq_object {
+ struct ib_uevent_object uevent;
+};
+
struct ib_ucq_object {
struct ib_uobject uobject;
struct ib_uverbs_file *uverbs_file;
@@ -151,10 +180,6 @@ struct ib_ucq_object {
u32 async_events_reported;
};
-struct ib_udct_object {
- struct ib_uobject uobject;
-};
-
extern spinlock_t ib_uverbs_idr_lock;
extern struct idr ib_uverbs_pd_idr;
extern struct idr ib_uverbs_mr_idr;
@@ -165,12 +190,15 @@ extern struct idr ib_uverbs_qp_idr;
extern struct idr ib_uverbs_srq_idr;
extern struct idr ib_uverbs_xrcd_idr;
extern struct idr ib_uverbs_rule_idr;
-extern struct idr ib_uverbs_dct_idr;
+extern struct idr ib_uverbs_wq_idr;
+extern struct idr ib_uverbs_rwq_ind_tbl_idr;
void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -182,11 +210,14 @@ void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_event_handler(struct ib_event_handler *handler,
struct ib_event *event);
void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
+int uverbs_dealloc_mw(struct ib_mw *mw);
+
struct ib_uverbs_flow_spec {
union {
union {
@@ -198,14 +229,15 @@ struct ib_uverbs_flow_spec {
};
};
struct ib_uverbs_flow_spec_eth eth;
- struct ib_uverbs_flow_spec_ib ib;
struct ib_uverbs_flow_spec_ipv4 ipv4;
struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+ struct ib_uverbs_flow_spec_ipv6 ipv6;
};
};
#define IB_UVERBS_DECLARE_CMD(name) \
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
const char __user *buf, int in_len, \
int out_len)
@@ -215,6 +247,7 @@ IB_UVERBS_DECLARE_CMD(query_port);
IB_UVERBS_DECLARE_CMD(alloc_pd);
IB_UVERBS_DECLARE_CMD(dealloc_pd);
IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
IB_UVERBS_DECLARE_CMD(dereg_mr);
IB_UVERBS_DECLARE_CMD(alloc_mw);
IB_UVERBS_DECLARE_CMD(dealloc_mw);
@@ -245,25 +278,20 @@ IB_UVERBS_DECLARE_CMD(open_xrcd);
IB_UVERBS_DECLARE_CMD(close_xrcd);
#define IB_UVERBS_DECLARE_EX_CMD(name) \
- int ib_uverbs_ex_##name(struct ib_uverbs_file *file,\
- struct ib_udata *ucore, \
+ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
+ struct ib_udata *ucore, \
struct ib_udata *uhw)
-#define IB_UVERBS_DECLARE_EXP_CMD(name) \
- ssize_t ib_uverbs_exp_##name(struct ib_uverbs_file *file, \
- struct ib_udata *ucore, \
- struct ib_udata *uhw)
-
IB_UVERBS_DECLARE_EX_CMD(create_flow);
IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
-
-IB_UVERBS_DECLARE_EXP_CMD(create_qp);
-IB_UVERBS_DECLARE_EXP_CMD(modify_cq);
-IB_UVERBS_DECLARE_EXP_CMD(modify_qp);
-IB_UVERBS_DECLARE_EXP_CMD(create_cq);
-IB_UVERBS_DECLARE_EXP_CMD(query_device);
-IB_UVERBS_DECLARE_EXP_CMD(create_dct);
-IB_UVERBS_DECLARE_EXP_CMD(destroy_dct);
-IB_UVERBS_DECLARE_EXP_CMD(query_dct);
+IB_UVERBS_DECLARE_EX_CMD(query_device);
+IB_UVERBS_DECLARE_EX_CMD(create_cq);
+IB_UVERBS_DECLARE_EX_CMD(create_qp);
+IB_UVERBS_DECLARE_EX_CMD(create_wq);
+IB_UVERBS_DECLARE_EX_CMD(modify_wq);
+IB_UVERBS_DECLARE_EX_CMD(destroy_wq);
+IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table);
+IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table);
#endif /* UVERBS_H */
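
The IB_UVERBS_DECLARE_CMD() changes above add the new ib_dev parameter to every handler prototype the macro declares. A userspace sketch of the declaration pattern (the structs are opaque stand-ins and the __user annotation is dropped; this is not the real header):

/* Hypothetical sketch of the handler-prototype macro pattern. */
#include <stdio.h>
#include <sys/types.h>

struct ib_uverbs_file;			/* opaque for the sketch */
struct ib_device;

#define IB_UVERBS_DECLARE_CMD(name)					\
	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
				 struct ib_device *ib_dev,		\
				 const char *buf, int in_len, int out_len)

IB_UVERBS_DECLARE_CMD(query_port);	/* expands to a prototype */

ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
    struct ib_device *ib_dev, const char *buf, int in_len, int out_len)
{
	(void)file; (void)ib_dev; (void)buf; (void)in_len;
	return out_len;			/* placeholder body */
}

int main(void)
{
	printf("%zd\n", ib_uverbs_query_port(NULL, NULL, NULL, 0, 16));
	return 0;
}
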
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
index 7f87a6c9ac11d..3289259176187 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
+++ b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
@@ -38,25 +38,17 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/moduleparam.h>
+#include <linux/sched.h>
#include <linux/rbtree.h>
-#include <linux/lockdep.h>
-#include <rdma/ib_addr.h>
#include <asm/uaccess.h>
-#include <asm/fcntl.h>
-#include <sys/priv.h>
#include "uverbs.h"
+#include "core_priv.h"
-static int disable_raw_qp_enforcement;
-module_param_named(disable_raw_qp_enforcement, disable_raw_qp_enforcement, int,
- 0444);
-MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable RAW QP enforcement for "
- "being opened by root (default: 0)");
+#include <sys/priv.h>
struct uverbs_lock_class {
- struct lock_class_key key;
char name[16];
};
@@ -68,44 +60,19 @@ static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" };
static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
-static struct uverbs_lock_class dct_lock_class = { .name = "DCT-uobj" };
-
-static int uverbs_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
-{
- return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;
-}
-
-static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
-{
- return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
-}
-
-static struct ib_udata_ops uverbs_copy = {
- .copy_from = uverbs_copy_from_udata,
- .copy_to = uverbs_copy_to_udata
-};
-
-#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
- do { \
- (udata)->ops = &uverbs_copy; \
- (udata)->inbuf = (void __user *) (ibuf); \
- (udata)->outbuf = (void __user *) (obuf); \
- (udata)->inlen = (ilen); \
- (udata)->outlen = (olen); \
- } while (0)
-
-enum uverbs_cmd_type {
- IB_USER_VERBS_CMD_BASIC,
- IB_USER_VERBS_CMD_EXTENDED
-};
+static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" };
+static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" };
/*
* The ib_uobject locking scheme is as follows:
*
* - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
- * needs to be held during all idr operations. When an object is
+ * needs to be held during all idr write operations. When an object is
* looked up, a reference must be taken on the object's kref before
- * dropping this lock.
+ * dropping this lock. For read operations, rcu_read_lock() is taken
+ * instead, but similarly the kref reference must be grabbed before
+ * rcu_read_unlock().
*
* - Each object also has an rwsem. This rwsem must be held for
* reading while an operation that uses the object is performed.
@@ -131,13 +98,12 @@ static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
uobj->context = context;
kref_init(&uobj->ref);
init_rwsem(&uobj->mutex);
- lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name);
uobj->live = 0;
}
static void release_uobj(struct kref *kref)
{
- kfree(container_of(kref, struct ib_uobject, ref));
+ kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu);
}
static void put_uobj(struct ib_uobject *uobj)
@@ -161,18 +127,17 @@ static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
{
int ret;
-retry:
- if (!idr_pre_get(idr, GFP_KERNEL))
- return -ENOMEM;
-
+ idr_preload(GFP_KERNEL);
spin_lock(&ib_uverbs_idr_lock);
- ret = idr_get_new(idr, uobj, &uobj->id);
- spin_unlock(&ib_uverbs_idr_lock);
- if (ret == -EAGAIN)
- goto retry;
+ ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT);
+ if (ret >= 0)
+ uobj->id = ret;
- return ret;
+ spin_unlock(&ib_uverbs_idr_lock);
+ idr_preload_end();
+
+ return ret < 0 ? ret : 0;
}
void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
@@ -187,7 +152,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
{
struct ib_uobject *uobj;
- spin_lock(&ib_uverbs_idr_lock);
+ rcu_read_lock();
uobj = idr_find(idr, id);
if (uobj) {
if (uobj->context == context)
@@ -195,7 +160,7 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
else
uobj = NULL;
}
- spin_unlock(&ib_uverbs_idr_lock);
+ rcu_read_unlock();
return uobj;
}
@@ -283,6 +248,27 @@ static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
}
+static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0);
+}
+
+static void put_wq_read(struct ib_wq *wq)
+{
+ put_uobj_read(wq->uobject);
+}
+
+static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle,
+ struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0);
+}
+
+static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table)
+{
+ put_uobj_read(ind_table->uobject);
+}
+
static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
{
struct ib_uobject *uobj;
@@ -301,16 +287,6 @@ static void put_qp_write(struct ib_qp *qp)
put_uobj_write(qp->uobject);
}
-static struct ib_dct *idr_read_dct(int dct_handle, struct ib_ucontext *context)
-{
- return idr_read_obj(&ib_uverbs_dct_idr, dct_handle, context, 0);
-}
-
-static void put_dct_read(struct ib_dct *dct)
-{
- put_uobj_read(dct->uobject);
-}
-
static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
{
return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
@@ -334,13 +310,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
}
ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_get_context cmd;
struct ib_uverbs_get_context_resp resp;
struct ib_udata udata;
- struct ib_device *ibdev = file->device->ib_dev;
struct ib_ucontext *ucontext;
struct file *filp;
int ret;
@@ -362,13 +338,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+ ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
if (IS_ERR(ucontext)) {
ret = PTR_ERR(ucontext);
goto err;
}
- ucontext->device = ibdev;
+ ucontext->device = ib_dev;
INIT_LIST_HEAD(&ucontext->pd_list);
INIT_LIST_HEAD(&ucontext->mr_list);
INIT_LIST_HEAD(&ucontext->mw_list);
@@ -376,41 +352,45 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&ucontext->qp_list);
INIT_LIST_HEAD(&ucontext->srq_list);
INIT_LIST_HEAD(&ucontext->ah_list);
+ INIT_LIST_HEAD(&ucontext->wq_list);
+ INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list);
INIT_LIST_HEAD(&ucontext->xrcd_list);
INIT_LIST_HEAD(&ucontext->rule_list);
- INIT_LIST_HEAD(&ucontext->dct_list);
+ rcu_read_lock();
+ ucontext->tgid = get_pid(task_pid_group_leader(current));
+ rcu_read_unlock();
ucontext->closing = 0;
- ucontext->peer_mem_private_data = NULL;
- ucontext->peer_mem_name = NULL;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ ucontext->umem_tree = RB_ROOT;
+ init_rwsem(&ucontext->umem_rwsem);
+ ucontext->odp_mrs_count = 0;
+ INIT_LIST_HEAD(&ucontext->no_private_counters);
+
+ if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ ucontext->invalidate_range = NULL;
+
+#endif
resp.num_comp_vectors = file->device->num_comp_vectors;
- ret = get_unused_fd();
+ ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
goto err_free;
resp.async_fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 1);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
if (IS_ERR(filp)) {
ret = PTR_ERR(filp);
goto err_fd;
}
- file->async_file = filp->private_data;
-
- INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
- ib_uverbs_event_handler);
- ret = ib_register_event_handler(&file->event_handler);
- if (ret)
- goto err_file;
-
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp)) {
ret = -EFAULT;
goto err_file;
}
- kref_get(&file->async_file->ref);
- kref_get(&file->ref);
+
file->ucontext = ucontext;
fd_install(resp.async_fd, filp);
@@ -420,76 +400,75 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
return in_len;
err_file:
+ ib_uverbs_free_async_event_file(file);
fput(filp);
err_fd:
put_unused_fd(resp.async_fd);
err_free:
- ibdev->dealloc_ucontext(ucontext);
+ put_pid(ucontext->tgid);
+ ib_dev->dealloc_ucontext(ucontext);
err:
mutex_unlock(&file->mutex);
return ret;
}
-static void ib_uverbs_query_device_assign(
- struct ib_uverbs_query_device_resp *resp,
- struct ib_device_attr *attr,
- struct ib_uverbs_file *file)
-{
- memset(resp, 0, sizeof(*resp));
-
- resp->fw_ver = attr->fw_ver;
- resp->node_guid = file->device->ib_dev->node_guid;
- resp->sys_image_guid = attr->sys_image_guid;
- resp->max_mr_size = attr->max_mr_size;
- resp->page_size_cap = attr->page_size_cap;
- resp->vendor_id = attr->vendor_id;
- resp->vendor_part_id = attr->vendor_part_id;
- resp->hw_ver = attr->hw_ver;
- resp->max_qp = attr->max_qp;
- resp->max_qp_wr = attr->max_qp_wr;
- resp->device_cap_flags = attr->device_cap_flags;
- resp->max_sge = attr->max_sge;
- resp->max_sge_rd = attr->max_sge_rd;
- resp->max_cq = attr->max_cq;
- resp->max_cqe = attr->max_cqe;
- resp->max_mr = attr->max_mr;
- resp->max_pd = attr->max_pd;
- resp->max_qp_rd_atom = attr->max_qp_rd_atom;
- resp->max_ee_rd_atom = attr->max_ee_rd_atom;
- resp->max_res_rd_atom = attr->max_res_rd_atom;
- resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
- resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
- resp->atomic_cap = attr->atomic_cap;
- resp->max_ee = attr->max_ee;
- resp->max_rdd = attr->max_rdd;
- resp->max_mw = attr->max_mw;
- resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
- resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
- resp->max_mcast_grp = attr->max_mcast_grp;
- resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
- resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
- resp->max_ah = attr->max_ah;
- resp->max_fmr = attr->max_fmr;
- resp->max_map_per_fmr = attr->max_map_per_fmr;
- resp->max_srq = attr->max_srq;
- resp->max_srq_wr = attr->max_srq_wr;
- resp->max_srq_sge = attr->max_srq_sge;
- resp->max_pkeys = attr->max_pkeys;
- resp->local_ca_ack_delay = attr->local_ca_ack_delay;
- resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+static void copy_query_dev_fields(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_uverbs_query_device_resp *resp,
+ struct ib_device_attr *attr)
+{
+ resp->fw_ver = attr->fw_ver;
+ resp->node_guid = ib_dev->node_guid;
+ resp->sys_image_guid = attr->sys_image_guid;
+ resp->max_mr_size = attr->max_mr_size;
+ resp->page_size_cap = attr->page_size_cap;
+ resp->vendor_id = attr->vendor_id;
+ resp->vendor_part_id = attr->vendor_part_id;
+ resp->hw_ver = attr->hw_ver;
+ resp->max_qp = attr->max_qp;
+ resp->max_qp_wr = attr->max_qp_wr;
+ resp->device_cap_flags = (u32)(attr->device_cap_flags);
+ resp->max_sge = attr->max_sge;
+ resp->max_sge_rd = attr->max_sge_rd;
+ resp->max_cq = attr->max_cq;
+ resp->max_cqe = attr->max_cqe;
+ resp->max_mr = attr->max_mr;
+ resp->max_pd = attr->max_pd;
+ resp->max_qp_rd_atom = attr->max_qp_rd_atom;
+ resp->max_ee_rd_atom = attr->max_ee_rd_atom;
+ resp->max_res_rd_atom = attr->max_res_rd_atom;
+ resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
+ resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
+ resp->atomic_cap = attr->atomic_cap;
+ resp->max_ee = attr->max_ee;
+ resp->max_rdd = attr->max_rdd;
+ resp->max_mw = attr->max_mw;
+ resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
+ resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
+ resp->max_mcast_grp = attr->max_mcast_grp;
+ resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
+ resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+ resp->max_ah = attr->max_ah;
+ resp->max_fmr = attr->max_fmr;
+ resp->max_map_per_fmr = attr->max_map_per_fmr;
+ resp->max_srq = attr->max_srq;
+ resp->max_srq_wr = attr->max_srq_wr;
+ resp->max_srq_sge = attr->max_srq_sge;
+ resp->max_pkeys = attr->max_pkeys;
+ resp->local_ca_ack_delay = attr->local_ca_ack_delay;
+ resp->phys_port_cnt = ib_dev->phys_port_cnt;
}
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_query_device cmd;
struct ib_uverbs_query_device_resp resp;
- struct ib_device_attr attr;
- int ret;
if (out_len < sizeof resp)
return -ENOSPC;
@@ -497,20 +476,18 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_device(file->device->ib_dev, &attr);
- if (ret)
- return ret;
-
- ib_uverbs_query_device_assign(&resp, &attr, file);
+ memset(&resp, 0, sizeof resp);
+ copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs);
- if (copy_to_user((void __user *)(unsigned long) cmd.response,
- &resp, sizeof(resp)))
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
return -EFAULT;
return in_len;
}
ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -525,7 +502,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+ ret = ib_query_port(ib_dev, cmd.port_num, &attr);
if (ret)
return ret;
@@ -550,7 +527,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width = attr.active_width;
resp.active_speed = attr.active_speed;
resp.phys_state = attr.phys_state;
- resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev,
+ resp.link_layer = rdma_port_get_link_layer(ib_dev,
cmd.port_num);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -561,6 +538,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -588,15 +566,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
down_write(&uobj->mutex);
- pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
- file->ucontext, &udata);
+ pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
if (IS_ERR(pd)) {
ret = PTR_ERR(pd);
goto err;
}
- pd->device = file->device->ib_dev;
+ pd->device = ib_dev;
pd->uobject = uobj;
+ pd->__internal_mr = NULL;
atomic_set(&pd->usecnt, 0);
uobj->object = pd;
@@ -635,11 +613,13 @@ err:
}
ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_dealloc_pd cmd;
struct ib_uobject *uobj;
+ struct ib_pd *pd;
int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -648,15 +628,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
if (!uobj)
return -EINVAL;
+ pd = uobj->object;
- ret = ib_dealloc_pd(uobj->object);
- if (!ret)
- uobj->live = 0;
-
- put_uobj_write(uobj);
+ if (atomic_read(&pd->usecnt)) {
+ ret = -EBUSY;
+ goto err_put;
+ }
+ ret = pd->device->dealloc_pd(uobj->object);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
if (ret)
- return ret;
+ goto err_put;
+
+ uobj->live = 0;
+ put_uobj_write(uobj);
idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
@@ -667,6 +652,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
put_uobj(uobj);
return in_len;
+
+err_put:
+ put_uobj_write(uobj);
+ return ret;
}
struct xrcd_table_entry {
@@ -755,12 +744,13 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_open_xrcd cmd;
struct ib_uverbs_open_xrcd_resp resp;
- struct ib_udata udata;
+ struct ib_udata udata;
struct ib_uxrcd_object *obj;
struct ib_xrcd *xrcd = NULL;
struct fd f = {NULL};
@@ -776,7 +766,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
INIT_UDATA(&udata, buf + sizeof cmd,
(unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
+ in_len - sizeof cmd, out_len - sizeof resp);
mutex_lock(&file->device->xrcd_tree_mutex);
@@ -797,7 +787,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
}
if (xrcd && cmd.oflags & O_EXCL) {
- ret = -EINVAL;
+ ret = -EINVAL;
goto err_tree_mutex_unlock;
}
}
@@ -813,15 +803,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
down_write(&obj->uobject.mutex);
if (!xrcd) {
- xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
- file->ucontext, &udata);
+ xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
if (IS_ERR(xrcd)) {
ret = PTR_ERR(xrcd);
goto err;
}
xrcd->inode = inode;
- xrcd->device = file->device->ib_dev;
+ xrcd->device = ib_dev;
atomic_set(&xrcd->usecnt, 0);
mutex_init(&xrcd->tgt_qp_mutex);
INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -892,11 +881,12 @@ err_tree_mutex_unlock:
}
ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_close_xrcd cmd;
- struct ib_uobject *uobj;
+ struct ib_uobject *uobj;
struct ib_xrcd *xrcd = NULL;
struct inode *inode = NULL;
struct ib_uxrcd_object *obj;
@@ -924,8 +914,8 @@ ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
if (!inode || atomic_dec_and_test(&xrcd->usecnt)) {
ret = ib_dealloc_xrcd(uobj->object);
- if (!ret)
- uobj->live = 0;
+ if (!ret)
+ uobj->live = 0;
}
live = uobj->live;
@@ -969,16 +959,17 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_reg_mr cmd;
struct ib_uverbs_reg_mr_resp resp;
- struct ib_udata udata;
+ struct ib_udata udata;
struct ib_uobject *uobj;
struct ib_pd *pd;
struct ib_mr *mr;
- int ret;
+ int ret;
if (out_len < sizeof resp)
return -ENOSPC;
@@ -1006,30 +997,35 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
pd = idr_read_pd(cmd.pd_handle, file->ucontext);
if (!pd) {
- ret = -EINVAL;
+ ret = -EINVAL;
goto err_free;
}
- /* We first get a new "obj id" to be passed later to reg mr for
- further use as mr_id.
- */
- ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
- if (ret)
- goto err_put;
+
+ if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+ if (!(pd->device->attrs.device_cap_flags &
+ IB_DEVICE_ON_DEMAND_PAGING)) {
+ pr_debug("ODP support not available\n");
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
- cmd.access_flags, &udata, uobj->id);
+ cmd.access_flags, &udata);
if (IS_ERR(mr)) {
ret = PTR_ERR(mr);
- goto err_remove_uobj;
+ goto err_put;
}
mr->device = pd->device;
mr->pd = pd;
mr->uobject = uobj;
atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
uobj->object = mr;
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+ if (ret)
+ goto err_unreg;
memset(&resp, 0, sizeof resp);
resp.lkey = mr->lkey;
@@ -1055,11 +1051,11 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
return in_len;
err_copy:
- ib_dereg_mr(mr);
-
-err_remove_uobj:
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+err_unreg:
+ ib_dereg_mr(mr);
+
err_put:
put_pd_read(pd);
@@ -1068,14 +1064,104 @@ err_free:
return ret;
}
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_rereg_mr cmd;
+ struct ib_uverbs_rereg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_pd *pd = NULL;
+ struct ib_mr *mr;
+ struct ib_pd *old_pd;
+ int ret;
+ struct ib_uobject *uobj;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof(cmd),
+ (unsigned long) cmd.response + sizeof(resp),
+ in_len - sizeof(cmd), out_len - sizeof(resp));
+
+ if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
+ return -EINVAL;
+
+ if ((cmd.flags & IB_MR_REREG_TRANS) &&
+ (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
+ (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
+ return -EINVAL;
+
+ uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
+ file->ucontext);
+
+ if (!uobj)
+ return -EINVAL;
+
+ mr = uobj->object;
+
+ if (cmd.flags & IB_MR_REREG_ACCESS) {
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ goto put_uobjs;
+ }
+
+ if (cmd.flags & IB_MR_REREG_PD) {
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto put_uobjs;
+ }
+ }
+
+ old_pd = mr->pd;
+ ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+ cmd.length, cmd.hca_va,
+ cmd.access_flags, pd, &udata);
+ if (!ret) {
+ if (cmd.flags & IB_MR_REREG_PD) {
+ atomic_inc(&pd->usecnt);
+ mr->pd = pd;
+ atomic_dec(&old_pd->usecnt);
+ }
+ } else {
+ goto put_uobj_pd;
+ }
+
+ memset(&resp, 0, sizeof(resp));
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+ else
+ ret = in_len;
+
+put_uobj_pd:
+ if (cmd.flags & IB_MR_REREG_PD)
+ put_pd_read(pd);
+
+put_uobjs:
+
+ put_uobj_write(mr->uobject);
+
+ return ret;
+}
+
ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_dereg_mr cmd;
struct ib_mr *mr;
struct ib_uobject *uobj;
- int ret = -EINVAL;
+ int ret = -EINVAL;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -1107,14 +1193,16 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_alloc_mw cmd;
struct ib_uverbs_alloc_mw_resp resp;
struct ib_uobject *uobj;
struct ib_pd *pd;
struct ib_mw *mw;
+ struct ib_udata udata;
int ret;
if (out_len < sizeof(resp))
@@ -1136,7 +1224,12 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
goto err_free;
}
- mw = pd->device->alloc_mw(pd, cmd.mw_type);
+ INIT_UDATA(&udata, buf + sizeof(cmd),
+ (unsigned long)cmd.response + sizeof(resp),
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof(resp));
+
+ mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
if (IS_ERR(mw)) {
ret = PTR_ERR(mw);
goto err_put;
@@ -1178,7 +1271,7 @@ err_copy:
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
err_unalloc:
- ib_dealloc_mw(mw);
+ uverbs_dealloc_mw(mw);
err_put:
put_pd_read(pd);
@@ -1189,13 +1282,14 @@ err_free:
}
ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
struct ib_uverbs_dealloc_mw cmd;
struct ib_mw *mw;
- struct ib_uobject *uobj;
- int ret = -EINVAL;
+ struct ib_uobject *uobj;
+ int ret = -EINVAL;
if (copy_from_user(&cmd, buf, sizeof(cmd)))
return -EFAULT;
@@ -1206,7 +1300,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
mw = uobj->object;
- ret = ib_dealloc_mw(mw);
+ ret = uverbs_dealloc_mw(mw);
if (!ret)
uobj->live = 0;
@@ -1227,8 +1321,9 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_create_comp_channel cmd;
struct ib_uverbs_create_comp_channel_resp resp;
@@ -1241,12 +1336,12 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = get_unused_fd();
+ ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0)
return ret;
resp.fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 0);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
if (IS_ERR(filp)) {
put_unused_fd(resp.fd);
return PTR_ERR(filp);
@@ -1263,40 +1358,34 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
return in_len;
}
-static ssize_t create_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len, void *vcmd, int ex,
- void __user *response)
+static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw,
+ struct ib_uverbs_ex_create_cq *cmd,
+ size_t cmd_sz,
+ int (*cb)(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *udata,
+ void *context),
+ void *context)
{
- struct ib_uverbs_create_cq *cmd;
- struct ib_uverbs_create_cq_ex *cmd_e;
- struct ib_uverbs_create_cq_resp resp;
- struct ib_udata udata;
struct ib_ucq_object *obj;
struct ib_uverbs_event_file *ev_file = NULL;
struct ib_cq *cq;
- struct ib_cq_init_attr attr;
- int cmd_sz;
int ret;
-
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- cmd = vcmd;
- cmd_e = vcmd;
- cmd_sz = ex ? sizeof(*cmd_e) : sizeof(*cmd);
- INIT_UDATA(&udata, buf + cmd_sz, response + sizeof(resp),
- in_len - sizeof(cmd), out_len - sizeof(resp));
+ struct ib_uverbs_ex_create_cq_resp resp;
+ struct ib_cq_init_attr attr = {};
if (cmd->comp_vector >= file->device->num_comp_vectors)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
obj = kmalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- init_uobj(&obj->uobject, cmd->user_handle, file->ucontext,
- &cq_lock_class);
+ init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class);
down_write(&obj->uobject.mutex);
if (cmd->comp_channel >= 0) {
@@ -1313,19 +1402,20 @@ static ssize_t create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
- memset(&attr, 0, sizeof(attr));
attr.cqe = cmd->cqe;
attr.comp_vector = cmd->comp_vector;
- if (ex && (cmd_e->comp_mask & IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS))
- attr.flags = cmd_e->create_flags;
- cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
- file->ucontext, &udata);
+
+ if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
+ attr.flags = cmd->flags;
+
+ cq = ib_dev->create_cq(ib_dev, &attr,
+ file->ucontext, uhw);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
}
- cq->device = file->device->ib_dev;
+ cq->device = ib_dev;
cq->uobject = &obj->uobject;
cq->comp_handler = ib_uverbs_comp_handler;
cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1338,13 +1428,15 @@ static ssize_t create_cq(struct ib_uverbs_file *file,
goto err_free;
memset(&resp, 0, sizeof resp);
- resp.cq_handle = obj->uobject.id;
- resp.cqe = cq->cqe;
+ resp.base.cq_handle = obj->uobject.id;
+ resp.base.cqe = cq->cqe;
- if (copy_to_user(response, &resp, sizeof(resp))) {
- ret = -EFAULT;
- goto err_copy;
- }
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+
+ ret = cb(file, obj, &resp, ucore, context);
+ if (ret)
+ goto err_cb;
mutex_lock(&file->mutex);
list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
@@ -1354,9 +1446,9 @@ static ssize_t create_cq(struct ib_uverbs_file *file,
up_write(&obj->uobject.mutex);
- return in_len;
+ return obj;
-err_copy:
+err_cb:
idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
err_free:
@@ -1368,24 +1460,112 @@ err_file:
err:
put_uobj_write(&obj->uobject);
- return ret;
+
+ return ERR_PTR(ret);
+}
+
+static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *ucore, void *context)
+{
+ if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
+ return -EFAULT;
+
+ return 0;
}
ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
- struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_ex_create_cq cmd_ex;
+ struct ib_uverbs_create_cq_resp resp;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ struct ib_ucq_object *obj;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
if (copy_from_user(&cmd, buf, sizeof(cmd)))
return -EFAULT;
- return create_cq(file, buf, in_len, out_len, &cmd,
- IB_USER_VERBS_CMD_BASIC,
- (void __user *) (unsigned long) cmd.response);
+ INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd), sizeof(resp));
+
+ INIT_UDATA(&uhw, buf + sizeof(cmd),
+ (unsigned long)cmd.response + sizeof(resp),
+ in_len - sizeof(cmd), out_len - sizeof(resp));
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.cqe = cmd.cqe;
+ cmd_ex.comp_vector = cmd.comp_vector;
+ cmd_ex.comp_channel = cmd.comp_channel;
+
+ obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
+ offsetof(typeof(cmd_ex), comp_channel) +
+ sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
+ NULL);
+
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ return in_len;
+}
+
+static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
+ struct ib_ucq_object *obj,
+ struct ib_uverbs_ex_create_cq_resp *resp,
+ struct ib_udata *ucore, void *context)
+{
+ if (ib_copy_to_udata(ucore, resp, resp->response_length))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_cq_resp resp;
+ struct ib_uverbs_ex_create_cq cmd;
+ struct ib_ucq_object *obj;
+ int err;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ if (ucore->outlen < (offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length)))
+ return -ENOSPC;
+
+ obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
+ min(ucore->inlen, sizeof(cmd)),
+ ib_uverbs_ex_create_cq_cb, NULL);
+
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ return 0;
}
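
[editor's note, not part of the patch] The reworked create_cq path sizes both directions of the ABI explicitly: the handler reads cmd->flags only when the caller's command is large enough to contain it (the offsetof + sizeof check against cmd_sz), and it reports in resp.response_length how much of the extended response it actually filled. The standalone userspace sketch below illustrates the command-side half of that pattern; struct and function names here are invented for illustration and are not the uverbs ABI.

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Made-up command struct: the last two fields arrived in a later revision. */
	struct demo_cmd {
		uint64_t user_handle;
		uint32_t cqe;
		uint32_t comp_vector;
		uint32_t flags;		/* newer field */
		uint32_t reserved;
	};

	/* Read only what the caller actually provided; default the rest to zero. */
	static void handle_demo_cmd(const void *buf, size_t cmd_sz)
	{
		struct demo_cmd cmd;

		memset(&cmd, 0, sizeof(cmd));
		memcpy(&cmd, buf, cmd_sz < sizeof(cmd) ? cmd_sz : sizeof(cmd));

		/* Honour 'flags' only if the caller's struct includes it. */
		if (cmd_sz >= offsetof(struct demo_cmd, flags) + sizeof(cmd.flags))
			printf("caller supplied flags: 0x%x\n", (unsigned)cmd.flags);
		else
			printf("old caller, flags default to 0\n");
	}

	int main(void)
	{
		struct demo_cmd cmd = { .user_handle = 1, .cqe = 64, .flags = 0x2 };

		handle_demo_cmd(&cmd, offsetof(struct demo_cmd, flags)); /* old ABI */
		handle_demo_cmd(&cmd, sizeof(cmd));			  /* new ABI */
		return 0;
	}
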
ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1449,8 +1629,9 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
}
ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_poll_cq cmd;
struct ib_uverbs_poll_cq_resp resp;
@@ -1458,7 +1639,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
u8 __user *data_ptr;
struct ib_cq *cq;
struct ib_wc wc;
- int ret;
+ int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -1500,6 +1681,7 @@ out_put:
}
ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1522,6 +1704,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1573,63 +1756,71 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
return in_len;
}
-ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+static int create_qp(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw,
+ struct ib_uverbs_ex_create_qp *cmd,
+ size_t cmd_sz,
+ int (*cb)(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *udata),
+ void *context)
{
- void __user *response;
- struct ib_udata udata;
- struct ib_uqp_object *obj;
- struct ib_device *device;
- struct ib_pd *pd = NULL;
- struct ib_xrcd *xrcd = NULL;
- struct ib_uobject *uninitialized_var(xrcd_uobj);
- struct ib_cq *scq = NULL, *rcq = NULL;
- struct ib_srq *srq = NULL;
- struct ib_qp *qp;
- struct ib_qp_init_attr attr;
- int ret;
- union {
- struct ib_uverbs_create_qp basic;
- } cmd_obj;
- struct ib_uverbs_create_qp *cmd;
- size_t cmd_size = 0;
- union {
- struct ib_uverbs_create_qp_resp basic;
- } resp_obj;
- struct ib_uverbs_create_qp_resp *resp;
- size_t resp_size = 0;
-
- cmd_size = sizeof(cmd_obj.basic);
- cmd = &cmd_obj.basic;
-
- resp_size = sizeof(resp_obj.basic);
- resp = &resp_obj.basic;
-
- if (out_len < resp_size)
- return -ENOSPC;
-
- if (copy_from_user(&cmd_obj, buf, cmd_size))
- return -EFAULT;
-
- response = (void __user *) (unsigned long) cmd->response;
+ struct ib_uqp_object *obj;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_qp *qp;
+ char *buf;
+ struct ib_qp_init_attr attr = {};
+ struct ib_uverbs_ex_create_qp_resp resp;
+ int ret;
+ struct ib_rwq_ind_table *ind_tbl = NULL;
+ bool has_sq = true;
- if (!disable_raw_qp_enforcement &&
- cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW))
+ if (cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW) != 0)
return -EPERM;
- INIT_UDATA(&udata, buf + cmd_size, response + resp_size,
- in_len - cmd_size, out_len - resp_size);
-
obj = kzalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
return -ENOMEM;
- init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class);
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext,
+ &qp_lock_class);
down_write(&obj->uevent.uobject.mutex);
+ if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) +
+ sizeof(cmd->rwq_ind_tbl_handle) &&
+ (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) {
+ ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle,
+ file->ucontext);
+ if (!ind_tbl) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ attr.rwq_ind_tbl = ind_tbl;
+ }
+
+ if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) +
+ sizeof(cmd->reserved1)) && cmd->reserved1) {
+ ret = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (ind_tbl && !cmd->max_send_wr)
+ has_sq = false;
if (cmd->qp_type == IB_QPT_XRC_TGT) {
- xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, &xrcd_uobj);
+ xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext,
+ &xrcd_uobj);
if (!xrcd) {
ret = -EINVAL;
goto err_put;
@@ -1641,41 +1832,47 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
cmd->max_recv_sge = 0;
} else {
if (cmd->is_srq) {
- srq = idr_read_srq(cmd->srq_handle, file->ucontext);
+ srq = idr_read_srq(cmd->srq_handle,
+ file->ucontext);
if (!srq || srq->srq_type != IB_SRQT_BASIC) {
- ret = -EINVAL;
- goto err_put;
- }
- }
-
- if (cmd->recv_cq_handle != cmd->send_cq_handle) {
- rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0);
- if (!rcq) {
- ret = -EINVAL;
+ ret = -EINVAL;
goto err_put;
+ }
}
+
+ if (!ind_tbl) {
+ if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+ rcq = idr_read_cq(cmd->recv_cq_handle,
+ file->ucontext, 0);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
}
}
- scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
- rcq = rcq ?: scq;
+ if (has_sq)
+ scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
+ if (!ind_tbl)
+ rcq = rcq ?: scq;
pd = idr_read_pd(cmd->pd_handle, file->ucontext);
- if (!pd || !scq) {
+ if (!pd || (!scq && has_sq)) {
ret = -EINVAL;
goto err_put;
- }
+ }
device = pd->device;
- }
+ }
- memset(&attr, 0, sizeof attr);
attr.event_handler = ib_uverbs_qp_event_handler;
attr.qp_context = file;
attr.send_cq = scq;
attr.recv_cq = rcq;
attr.srq = srq;
attr.xrcd = xrcd;
- attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR :
+ IB_SIGNAL_REQ_WR;
attr.qp_type = cmd->qp_type;
attr.create_flags = 0;
@@ -1689,10 +1886,31 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->uevent.event_list);
INIT_LIST_HEAD(&obj->mcast_list);
+ if (cmd_sz >= offsetof(typeof(*cmd), create_flags) +
+ sizeof(cmd->create_flags))
+ attr.create_flags = cmd->create_flags;
+
+ if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
+ IB_QP_CREATE_CROSS_CHANNEL |
+ IB_QP_CREATE_MANAGED_SEND |
+ IB_QP_CREATE_MANAGED_RECV |
+ IB_QP_CREATE_SCATTER_FCS)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ buf = (char *)cmd + sizeof(*cmd);
+ if (cmd_sz > sizeof(*cmd))
+ if (!(buf[0] == 0 && !memcmp(buf, buf + 1,
+ cmd_sz - sizeof(*cmd) - 1))) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
if (cmd->qp_type == IB_QPT_XRC_TGT)
qp = ib_create_qp(pd, &attr);
else
- qp = device->create_qp(pd, &attr, &udata);
+ qp = device->create_qp(pd, &attr, uhw);
if (IS_ERR(qp)) {
ret = PTR_ERR(qp);
@@ -1706,16 +1924,20 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
qp->send_cq = attr.send_cq;
qp->recv_cq = attr.recv_cq;
qp->srq = attr.srq;
+ qp->rwq_ind_tbl = ind_tbl;
qp->event_handler = attr.event_handler;
qp->qp_context = attr.qp_context;
qp->qp_type = attr.qp_type;
atomic_set(&qp->usecnt, 0);
atomic_inc(&pd->usecnt);
- atomic_inc(&attr.send_cq->usecnt);
+ if (attr.send_cq)
+ atomic_inc(&attr.send_cq->usecnt);
if (attr.recv_cq)
atomic_inc(&attr.recv_cq->usecnt);
if (attr.srq)
atomic_inc(&attr.srq->usecnt);
+ if (ind_tbl)
+ atomic_inc(&ind_tbl->usecnt);
}
qp->uobject = &obj->uevent.uobject;
@@ -1724,25 +1946,28 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
if (ret)
goto err_destroy;
- memset(&resp_obj, 0, sizeof(resp_obj));
- resp->qpn = qp->qp_num;
- resp->qp_handle = obj->uevent.uobject.id;
- resp->max_recv_sge = attr.cap.max_recv_sge;
- resp->max_send_sge = attr.cap.max_send_sge;
- resp->max_recv_wr = attr.cap.max_recv_wr;
- resp->max_send_wr = attr.cap.max_send_wr;
- resp->max_inline_data = attr.cap.max_inline_data;
-
- if (copy_to_user(response, &resp_obj, resp_size)) {
- ret = -EFAULT;
- goto err_copy;
- }
+ memset(&resp, 0, sizeof resp);
+ resp.base.qpn = qp->qp_num;
+ resp.base.qp_handle = obj->uevent.uobject.id;
+ resp.base.max_recv_sge = attr.cap.max_recv_sge;
+ resp.base.max_send_sge = attr.cap.max_send_sge;
+ resp.base.max_recv_wr = attr.cap.max_recv_wr;
+ resp.base.max_send_wr = attr.cap.max_send_wr;
+ resp.base.max_inline_data = attr.cap.max_inline_data;
+
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+
+ ret = cb(file, &resp, ucore);
+ if (ret)
+ goto err_cb;
if (xrcd) {
- obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+ uobject);
atomic_inc(&obj->uxrcd->refcnt);
put_xrcd_read(xrcd_uobj);
- }
+ }
if (pd)
put_pd_read(pd);
@@ -1752,6 +1977,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
put_cq_read(rcq);
if (srq)
put_srq_read(srq);
+ if (ind_tbl)
+ put_rwq_indirection_table_read(ind_tbl);
mutex_lock(&file->mutex);
list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
@@ -1761,9 +1988,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
up_write(&obj->uevent.uobject.mutex);
- return in_len;
-
-err_copy:
+ return 0;
+err_cb:
idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
err_destroy:
@@ -1780,12 +2006,123 @@ err_put:
put_cq_read(rcq);
if (srq)
put_srq_read(srq);
+ if (ind_tbl)
+ put_rwq_indirection_table_read(ind_tbl);
put_uobj_write(&obj->uevent.uobject);
return ret;
}
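
[editor's note, not part of the patch] create_qp() accepts a command longer than the structure it understands only when every trailing byte is zero; the buf[0] == 0 plus memcmp(buf, buf + 1, len - 1) test works because it forces each byte to equal its successor while pinning the first byte to zero. A minimal standalone illustration of that trick, with names invented here:

	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	/* True when all n bytes are zero: byte i must equal byte i + 1, and
	 * byte 0 is zero, so the whole region collapses to zero. */
	static bool region_is_zero(const unsigned char *buf, size_t n)
	{
		if (n == 0)
			return true;
		return buf[0] == 0 && memcmp(buf, buf + 1, n - 1) == 0;
	}

	int main(void)
	{
		unsigned char clean[8] = { 0 };
		unsigned char dirty[8] = { 0, 0, 0, 1, 0, 0, 0, 0 };

		printf("clean: %d\n", region_is_zero(clean, sizeof(clean)));
		printf("dirty: %d\n", region_is_zero(dirty, sizeof(dirty)));
		return 0;
	}
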
+static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *ucore)
+{
+ if (ib_copy_to_udata(ucore, &resp->base, sizeof(resp->base)))
+ return -EFAULT;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_qp cmd;
+ struct ib_uverbs_ex_create_qp cmd_ex;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ ssize_t resp_size = sizeof(struct ib_uverbs_create_qp_resp);
+ int err;
+
+ if (out_len < resp_size)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ INIT_UDATA(&ucore, buf, (unsigned long)cmd.response, sizeof(cmd),
+ resp_size);
+ INIT_UDATA(&uhw, buf + sizeof(cmd),
+ (unsigned long)cmd.response + resp_size,
+ in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - resp_size);
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.pd_handle = cmd.pd_handle;
+ cmd_ex.send_cq_handle = cmd.send_cq_handle;
+ cmd_ex.recv_cq_handle = cmd.recv_cq_handle;
+ cmd_ex.srq_handle = cmd.srq_handle;
+ cmd_ex.max_send_wr = cmd.max_send_wr;
+ cmd_ex.max_recv_wr = cmd.max_recv_wr;
+ cmd_ex.max_send_sge = cmd.max_send_sge;
+ cmd_ex.max_recv_sge = cmd.max_recv_sge;
+ cmd_ex.max_inline_data = cmd.max_inline_data;
+ cmd_ex.sq_sig_all = cmd.sq_sig_all;
+ cmd_ex.qp_type = cmd.qp_type;
+ cmd_ex.is_srq = cmd.is_srq;
+
+ err = create_qp(file, &ucore, &uhw, &cmd_ex,
+ offsetof(typeof(cmd_ex), is_srq) +
+ sizeof(cmd.is_srq), ib_uverbs_create_qp_cb,
+ NULL);
+
+ if (err)
+ return err;
+
+ return in_len;
+}
+
+static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file,
+ struct ib_uverbs_ex_create_qp_resp *resp,
+ struct ib_udata *ucore)
+{
+ if (ib_copy_to_udata(ucore, resp, resp->response_length))
+ return -EFAULT;
+
+ return 0;
+}
+
+int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_qp_resp resp;
+ struct ib_uverbs_ex_create_qp cmd = {0};
+ int err;
+
+ if (ucore->inlen < (offsetof(typeof(cmd), comp_mask) +
+ sizeof(cmd.comp_mask)))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ if (ucore->outlen < (offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length)))
+ return -ENOSPC;
+
+ err = create_qp(file, ucore, uhw, &cmd,
+ min(ucore->inlen, sizeof(cmd)),
+ ib_uverbs_ex_create_qp_cb, NULL);
+
+ if (err)
+ return err;
+
+ return 0;
+}
+
ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_open_qp cmd;
@@ -1834,7 +2171,7 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
if (IS_ERR(qp)) {
ret = PTR_ERR(qp);
goto err_put;
- }
+ }
qp->uobject = &obj->uevent.uobject;
@@ -1862,6 +2199,7 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
mutex_unlock(&file->mutex);
obj->uevent.uobject.live = 1;
+
up_write(&obj->uevent.uobject.mutex);
return in_len;
@@ -1879,8 +2217,9 @@ err_put:
}
ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_query_qp cmd;
struct ib_uverbs_query_qp_resp resp;
@@ -1992,45 +2331,31 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
}
}
-static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file,
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
- int out_len,
- enum uverbs_cmd_type cmd_type)
+ int out_len)
{
- struct ib_uverbs_modify_qp_ex cmd;
- struct ib_udata udata;
- struct ib_qp *qp;
- struct ib_qp_attr *attr;
- struct ib_qp_attr_ex *attrx;
- int ret;
- void *p;
- union ib_gid sgid;
- union ib_gid *dgid;
- u8 port_num;
-
- if (cmd_type == IB_USER_VERBS_CMD_BASIC) {
- p = &cmd;
- p += sizeof(cmd.comp_mask);
- if (copy_from_user(p, buf,
- sizeof(struct ib_uverbs_modify_qp)))
+ struct ib_uverbs_modify_qp cmd;
+ struct ib_udata udata;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- } else {
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
- return -EFAULT;
- }
INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
out_len);
- attrx = kzalloc(sizeof(*attrx), GFP_KERNEL);
- if (!attrx)
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
return -ENOMEM;
- attr = (struct ib_qp_attr *)attrx;
qp = idr_read_qp(cmd.qp_handle, file->ucontext);
if (!qp) {
- kfree(attrx);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
attr->qp_state = cmd.qp_state;
@@ -2078,77 +2403,39 @@ static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file,
attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate;
attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
- port_num = (cmd.attr_mask & IB_QP_PORT) ? cmd.port_num : qp->port_num;
- if ((cmd.attr_mask & IB_QP_AV) && port_num &&
- (rdma_port_get_link_layer(qp->device, port_num) ==
- IB_LINK_LAYER_ETHERNET)) {
- ret = ib_query_gid(qp->device, port_num,
- attr->ah_attr.grh.sgid_index, &sgid);
- if (ret)
- goto out;
- dgid = &attr->ah_attr.grh.dgid;
- if (rdma_link_local_addr((struct in6_addr *)dgid->raw)) {
- rdma_get_ll_mac((struct in6_addr *)dgid->raw,
- attr->ah_attr.dmac);
- rdma_get_ll_mac((struct in6_addr *)sgid.raw,
- attr->smac);
- attr->vlan_id = rdma_get_vlan_id(&sgid);
- } else {
- ret = rdma_addr_find_dmac_by_grh(&sgid, dgid,
- attr->ah_attr.dmac,
- &attr->vlan_id, -1U);
- if (ret)
- goto out;
- ret = rdma_addr_find_smac_by_sgid(&sgid, attr->smac,
- NULL, -1U);
- if (ret)
- goto out;
- }
- cmd.attr_mask |= IB_QP_SMAC;
- if (attr->vlan_id < 0xFFFF)
- cmd.attr_mask |= IB_QP_VID;
- }
- if (cmd_type == IB_USER_VERBS_CMD_EXTENDED) {
- if (cmd.comp_mask & IB_UVERBS_QP_ATTR_DCT_KEY)
- attrx->dct_key = cmd.dct_key;
- }
if (qp->real_qp == qp) {
+ ret = ib_resolve_eth_dmac(qp, attr, &cmd.attr_mask);
+ if (ret)
+ goto release_qp;
ret = qp->device->modify_qp(qp, attr,
modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata);
- if (!ret && (cmd.attr_mask & IB_QP_PORT))
- qp->port_num = attr->port_num;
} else {
ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask));
}
if (ret)
- goto out;
+ goto release_qp;
ret = in_len;
-out:
+release_qp:
put_qp_read(qp);
- kfree(attrx);
- return ret;
-}
+out:
+ kfree(attr);
-ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
-{
- return __uverbs_modify_qp(file, buf, in_len, out_len,
- IB_USER_VERBS_CMD_BASIC);
+ return ret;
}
ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
struct ib_uverbs_destroy_qp cmd;
struct ib_uverbs_destroy_qp_resp resp;
- struct ib_uobject *uobj;
+ struct ib_uobject *uobj;
struct ib_qp *qp;
struct ib_uqp_object *obj;
int ret = -EINVAL;
@@ -2200,18 +2487,26 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
return in_len;
}
+static void *alloc_wr(size_t wr_size, __u32 num_sge)
+{
+ return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) +
+ num_sge * sizeof (struct ib_sge), GFP_KERNEL);
+};
+
ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_post_send cmd;
struct ib_uverbs_post_send_resp resp;
struct ib_uverbs_send_wr *user_wr;
struct ib_send_wr *wr = NULL, *last, *next, *bad_wr;
- struct ib_qp *qp;
+ struct ib_qp *qp;
int i, sg_ind;
int is_ud;
ssize_t ret = -EINVAL;
+ size_t next_size;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -2247,17 +2542,90 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
goto out_put;
}
- next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
- user_wr->num_sge * sizeof (struct ib_sge),
- GFP_KERNEL);
- if (!next) {
- ret = -ENOMEM;
- goto out_put;
- }
+ if (is_ud) {
+ struct ib_ud_wr *ud;
+
+ if (user_wr->opcode != IB_WR_SEND &&
+ user_wr->opcode != IB_WR_SEND_WITH_IMM) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ next_size = sizeof(*ud);
+ ud = alloc_wr(next_size, user_wr->num_sge);
+ if (!ud) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ ud->ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext);
+ if (!ud->ah) {
+ kfree(ud);
+ ret = -EINVAL;
+ goto out_put;
+ }
+ ud->remote_qpn = user_wr->wr.ud.remote_qpn;
+ ud->remote_qkey = user_wr->wr.ud.remote_qkey;
+
+ next = &ud->wr;
+ } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE ||
+ user_wr->opcode == IB_WR_RDMA_READ) {
+ struct ib_rdma_wr *rdma;
+
+ next_size = sizeof(*rdma);
+ rdma = alloc_wr(next_size, user_wr->num_sge);
+ if (!rdma) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ rdma->remote_addr = user_wr->wr.rdma.remote_addr;
+ rdma->rkey = user_wr->wr.rdma.rkey;
+
+ next = &rdma->wr;
+ } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ struct ib_atomic_wr *atomic;
+
+ next_size = sizeof(*atomic);
+ atomic = alloc_wr(next_size, user_wr->num_sge);
+ if (!atomic) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ atomic->remote_addr = user_wr->wr.atomic.remote_addr;
+ atomic->compare_add = user_wr->wr.atomic.compare_add;
+ atomic->swap = user_wr->wr.atomic.swap;
+ atomic->rkey = user_wr->wr.atomic.rkey;
+
+ next = &atomic->wr;
+ } else if (user_wr->opcode == IB_WR_SEND ||
+ user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next_size = sizeof(*next);
+ next = alloc_wr(next_size, user_wr->num_sge);
+ if (!next) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ } else {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ if (user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey;
+ }
if (!last)
wr = next;
- else
+ else
last->next = next;
last = next;
@@ -2267,54 +2635,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
next->opcode = user_wr->opcode;
next->send_flags = user_wr->send_flags;
- if (is_ud) {
- next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
- file->ucontext);
- if (!next->wr.ud.ah) {
- ret = -EINVAL;
- goto out_put;
- }
- next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn;
- next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
- } else {
- switch (next->opcode) {
- case IB_WR_RDMA_WRITE_WITH_IMM:
- next->ex.imm_data =
- (__be32 __force) user_wr->ex.imm_data;
- case IB_WR_RDMA_WRITE:
- case IB_WR_RDMA_READ:
- next->wr.rdma.remote_addr =
- user_wr->wr.rdma.remote_addr;
- next->wr.rdma.rkey =
- user_wr->wr.rdma.rkey;
- break;
- case IB_WR_SEND_WITH_IMM:
- next->ex.imm_data =
- (__be32 __force) user_wr->ex.imm_data;
- break;
- case IB_WR_SEND_WITH_INV:
- next->ex.invalidate_rkey =
- user_wr->ex.invalidate_rkey;
- break;
- case IB_WR_ATOMIC_CMP_AND_SWP:
- case IB_WR_ATOMIC_FETCH_AND_ADD:
- next->wr.atomic.remote_addr =
- user_wr->wr.atomic.remote_addr;
- next->wr.atomic.compare_add =
- user_wr->wr.atomic.compare_add;
- next->wr.atomic.swap = user_wr->wr.atomic.swap;
- next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
- break;
- default:
- break;
- }
- }
-
if (next->num_sge) {
- next->sg_list = (void *) next +
- ALIGN(sizeof *next, sizeof (struct ib_sge));
+ next->sg_list = (void *)((char *)next +
+ ALIGN(next_size, sizeof(struct ib_sge)));
if (copy_from_user(next->sg_list,
- buf + sizeof cmd +
+ (const char *)buf + sizeof cmd +
cmd.wr_count * cmd.wqe_size +
sg_ind * sizeof (struct ib_sge),
next->num_sge * sizeof (struct ib_sge))) {
@@ -2332,7 +2657,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
for (next = wr; next; next = next->next) {
++resp.bad_wr;
if (next == bad_wr)
- break;
+ break;
}
if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -2343,8 +2668,8 @@ out_put:
put_qp_read(qp);
while (wr) {
- if (is_ud && wr->wr.ud.ah)
- put_ah_read(wr->wr.ud.ah);
+ if (is_ud && ud_wr(wr)->ah)
+ put_ah_read(ud_wr(wr)->ah);
next = wr->next;
kfree(wr);
wr = next;
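
[editor's note, not part of the patch] The per-opcode allocations above go through alloc_wr(), which reserves a single buffer holding the typed work request followed by its scatter/gather array at the first ib_sge-aligned offset; the sg_list pointer is later aimed at exactly that offset. The standalone sketch below mirrors that layout with simplified stand-in structs, not the ib_verbs definitions.

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

	struct demo_sge {
		uint64_t addr;
		uint32_t length;
		uint32_t lkey;
	};

	struct demo_wr {
		struct demo_wr *next;
		struct demo_sge *sg_list;
		int num_sge;
		int opcode;
	};

	static struct demo_wr *alloc_demo_wr(size_t wr_size, uint32_t num_sge)
	{
		/* One allocation holds the WR and, right after it, its SGE array. */
		struct demo_wr *wr = calloc(1, ALIGN(wr_size, sizeof(struct demo_sge)) +
					       num_sge * sizeof(struct demo_sge));
		if (!wr)
			return NULL;
		wr->num_sge = num_sge;
		wr->sg_list = (struct demo_sge *)((char *)wr +
						  ALIGN(wr_size, sizeof(struct demo_sge)));
		return wr;
	}

	int main(void)
	{
		struct demo_wr *wr = alloc_demo_wr(sizeof(*wr), 3);

		if (!wr)
			return 1;
		wr->sg_list[2].length = 4096;	/* the array is valid storage */
		printf("wr=%p sg_list=%p\n", (void *)wr, (void *)wr->sg_list);
		free(wr);
		return 0;
	}
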
@@ -2366,7 +2691,7 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
struct ib_recv_wr *wr = NULL, *last, *next;
int sg_ind;
int i;
- int ret;
+ int ret;
if (in_len < wqe_size * wr_count +
sge_count * sizeof (struct ib_uverbs_sge))
@@ -2389,9 +2714,9 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
}
if (user_wr->num_sge + sg_ind > sge_count) {
- ret = -EINVAL;
- goto err;
- }
+ ret = -EINVAL;
+ goto err;
+ }
next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
user_wr->num_sge * sizeof (struct ib_sge),
@@ -2399,7 +2724,7 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
if (!next) {
ret = -ENOMEM;
goto err;
- }
+ }
if (!last)
wr = next;
@@ -2412,10 +2737,10 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
next->num_sge = user_wr->num_sge;
if (next->num_sge) {
- next->sg_list = (void *) next +
- ALIGN(sizeof *next, sizeof (struct ib_sge));
+ next->sg_list = (void *)((char *)next +
+ ALIGN(sizeof *next, sizeof (struct ib_sge)));
if (copy_from_user(next->sg_list,
- buf + wr_count * wqe_size +
+ (const char *)buf + wr_count * wqe_size +
sg_ind * sizeof (struct ib_sge),
next->num_sge * sizeof (struct ib_sge))) {
ret = -EFAULT;
@@ -2442,6 +2767,7 @@ err:
}
ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2491,6 +2817,7 @@ out:
}
ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2540,8 +2867,9 @@ out:
}
ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_create_ah cmd;
struct ib_uverbs_create_ah_resp resp;
@@ -2564,7 +2892,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class);
down_write(&uobj->mutex);
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
if (!pd) {
ret = -EINVAL;
goto err;
@@ -2580,6 +2908,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
attr.grh.sgid_index = cmd.attr.grh.sgid_index;
attr.grh.hop_limit = cmd.attr.grh.hop_limit;
attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+ memset(&attr.dmac, 0, sizeof(attr.dmac));
memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
ah = ib_create_ah(pd, &attr);
@@ -2630,12 +2959,13 @@ err:
}
ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_destroy_ah cmd;
struct ib_ah *ah;
struct ib_uobject *uobj;
- int ret;
+ int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -2666,6 +2996,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2713,14 +3044,15 @@ out_put:
}
ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_detach_mcast cmd;
struct ib_uqp_object *obj;
struct ib_qp *qp;
struct ib_uverbs_mcast_entry *mcast;
- int ret = -EINVAL;
+ int ret = -EINVAL;
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
@@ -2749,474 +3081,529 @@ out_put:
return ret ? ret : in_len;
}
-static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
- struct ib_uverbs_create_xsrq *cmd,
- struct ib_udata *udata)
+static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec)
{
- struct ib_uverbs_create_srq_resp resp;
- struct ib_usrq_object *obj;
- struct ib_pd *pd;
- struct ib_srq *srq;
- struct ib_uobject *uninitialized_var(xrcd_uobj);
- struct ib_srq_init_attr attr;
- int ret;
+ /* Returns user space filter size, includes padding */
+ return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2;
+}
- obj = kmalloc(sizeof(*obj), GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
+static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size,
+ u16 ib_real_filter_sz)
+{
+ /*
+ * User space filter structures must be 64 bit aligned, otherwise this
+ * may pass, but we won't handle additional new attributes.
+ */
- init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class);
- down_write(&obj->uevent.uobject.mutex);
+ if (kern_filter_size > ib_real_filter_sz) {
+ if (memchr_inv((char *)kern_spec_filter +
+ ib_real_filter_sz, 0,
+ kern_filter_size - ib_real_filter_sz))
+ return -EINVAL;
+ return ib_real_filter_sz;
+ }
+ return kern_filter_size;
+}
- if (cmd->srq_type == IB_SRQT_XRC) {
- attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj);
- if (!attr.ext.xrc.xrcd) {
- ret = -EINVAL;
- goto err;
- }
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec)
+{
+ ssize_t actual_filter_sz;
+ ssize_t kern_filter_sz;
+ ssize_t ib_filter_sz;
+ void *kern_spec_mask;
+ void *kern_spec_val;
- obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
- atomic_inc(&obj->uxrcd->refcnt);
+ if (kern_spec->reserved)
+ return -EINVAL;
- attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0);
- if (!attr.ext.xrc.cq) {
- ret = -EINVAL;
- goto err_put_xrcd;
- }
+ ib_spec->type = kern_spec->type;
+
+ kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr);
+ /* User flow spec size must be aligned to 4 bytes */
+ if (kern_filter_sz != ALIGN(kern_filter_sz, 4))
+ return -EINVAL;
+
+ kern_spec_val = (char *)kern_spec +
+ sizeof(struct ib_uverbs_flow_spec_hdr);
+ kern_spec_mask = (char *)kern_spec_val + kern_filter_sz;
+
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ETH:
+ ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_eth);
+ memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_IPV4:
+ ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_ipv4);
+ memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_IPV6:
+ ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_ipv6);
+ memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz);
+
+ if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) ||
+ (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20))
+ return -EINVAL;
+ break;
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp);
+ memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ default:
+ return -EINVAL;
}
+ return 0;
+}
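
[editor's note, not part of the patch] kern_spec_filter_sz() halves the post-header payload because the user passes equal-sized value and mask blocks, and spec_filter_size() tolerates a filter longer than the kernel's definition only when the excess bytes are zero (memchr_inv() finding nothing non-zero). A hedged userspace sketch of that validation, with is_all_zero() standing in for memchr_inv() and invented struct names:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	struct demo_spec_hdr {
		uint32_t type;
		uint16_t size;		/* header + value bytes + mask bytes */
		uint16_t reserved;
	};

	/* Stand-in for the kernel's memchr_inv()-based zero check. */
	static bool is_all_zero(const unsigned char *p, size_t n)
	{
		while (n--)
			if (*p++)
				return false;
		return true;
	}

	/* Bytes of filter to copy, or -1 when the unknown tail is non-zero. */
	static long filter_size(const unsigned char *mask, size_t user_sz,
				size_t kernel_sz)
	{
		if (user_sz > kernel_sz) {
			if (!is_all_zero(mask + kernel_sz, user_sz - kernel_sz))
				return -1;
			return (long)kernel_sz;
		}
		return (long)user_sz;
	}

	int main(void)
	{
		struct demo_spec_hdr hdr = { .type = 1, .size = sizeof(hdr) + 16 };
		/* value and mask each take half of the bytes after the header */
		size_t user_filter_sz = (hdr.size - sizeof(hdr)) / 2;
		unsigned char mask[8] = { 0xff, 0xff, 0xff, 0xff };

		printf("copy %ld of %zu mask bytes\n",
		       filter_size(mask, user_filter_sz, 4), user_filter_sz);
		return 0;
	}
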
- pd = idr_read_pd(cmd->pd_handle, file->ucontext);
- if (!pd) {
- ret = -EINVAL;
- goto err_put_cq;
- }
+int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_wq cmd = {};
+ struct ib_uverbs_ex_create_wq_resp resp = {};
+ struct ib_uwq_object *obj;
+ int err = 0;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_wq *wq;
+ struct ib_wq_init_attr wq_init_attr = {};
+ size_t required_cmd_sz;
+ size_t required_resp_len;
- attr.event_handler = ib_uverbs_srq_event_handler;
- attr.srq_context = file;
- attr.srq_type = cmd->srq_type;
- attr.attr.max_wr = cmd->max_wr;
- attr.attr.max_sge = cmd->max_sge;
- attr.attr.srq_limit = cmd->srq_limit;
+ required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge);
+ required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn);
- obj->uevent.events_reported = 0;
- INIT_LIST_HEAD(&obj->uevent.event_list);
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
- srq = pd->device->create_srq(pd, &attr, udata);
- if (IS_ERR(srq)) {
- ret = PTR_ERR(srq);
- goto err_put;
- }
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
- srq->device = pd->device;
- srq->pd = pd;
- srq->srq_type = cmd->srq_type;
- srq->uobject = &obj->uevent.uobject;
- srq->event_handler = attr.event_handler;
- srq->srq_context = attr.srq_context;
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
- if (cmd->srq_type == IB_SRQT_XRC) {
- srq->ext.xrc.cq = attr.ext.xrc.cq;
- srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
- atomic_inc(&attr.ext.xrc.cq->usecnt);
- atomic_inc(&attr.ext.xrc.xrcd->usecnt);
- }
+ err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (err)
+ return err;
- atomic_inc(&pd->usecnt);
- atomic_set(&srq->usecnt, 0);
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
- obj->uevent.uobject.object = srq;
- ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
- if (ret)
- goto err_destroy;
+ obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
- memset(&resp, 0, sizeof resp);
- resp.srq_handle = obj->uevent.uobject.id;
- resp.max_wr = attr.attr.max_wr;
- resp.max_sge = attr.attr.max_sge;
- if (cmd->srq_type == IB_SRQT_XRC)
- resp.srqn = srq->ext.xrc.srq_num;
+ init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext,
+ &wq_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
- if (copy_to_user((void __user *) (unsigned long) cmd->response,
- &resp, sizeof resp)) {
- ret = -EFAULT;
- goto err_copy;
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq) {
+ err = -EINVAL;
+ goto err_put_pd;
}
- if (cmd->srq_type == IB_SRQT_XRC) {
- put_uobj_read(xrcd_uobj);
- put_cq_read(attr.ext.xrc.cq);
+ wq_init_attr.cq = cq;
+ wq_init_attr.max_sge = cmd.max_sge;
+ wq_init_attr.max_wr = cmd.max_wr;
+ wq_init_attr.wq_context = file;
+ wq_init_attr.wq_type = cmd.wq_type;
+ wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ wq = pd->device->create_wq(pd, &wq_init_attr, uhw);
+ if (IS_ERR(wq)) {
+ err = PTR_ERR(wq);
+ goto err_put_cq;
}
+
+ wq->uobject = &obj->uevent.uobject;
+ obj->uevent.uobject.object = wq;
+ wq->wq_type = wq_init_attr.wq_type;
+ wq->cq = cq;
+ wq->pd = pd;
+ wq->device = pd->device;
+ wq->wq_context = wq_init_attr.wq_context;
+ atomic_set(&wq->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&cq->usecnt);
+ wq->uobject = &obj->uevent.uobject;
+ obj->uevent.uobject.object = wq;
+ err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+ if (err)
+ goto destroy_wq;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.wq_handle = obj->uevent.uobject.id;
+ resp.max_sge = wq_init_attr.max_sge;
+ resp.max_wr = wq_init_attr.max_wr;
+ resp.wqn = wq->wq_num;
+ resp.response_length = required_resp_len;
+ err = ib_copy_to_udata(ucore,
+ &resp, resp.response_length);
+ if (err)
+ goto err_copy;
+
put_pd_read(pd);
+ put_cq_read(cq);
mutex_lock(&file->mutex);
- list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list);
mutex_unlock(&file->mutex);
obj->uevent.uobject.live = 1;
-
up_write(&obj->uevent.uobject.mutex);
-
return 0;
err_copy:
- idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
-
-err_destroy:
- ib_destroy_srq(srq);
-
-err_put:
- put_pd_read(pd);
-
+ idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+destroy_wq:
+ ib_destroy_wq(wq);
err_put_cq:
- if (cmd->srq_type == IB_SRQT_XRC)
- put_cq_read(attr.ext.xrc.cq);
-
-err_put_xrcd:
- if (cmd->srq_type == IB_SRQT_XRC) {
- atomic_dec(&obj->uxrcd->refcnt);
- put_uobj_read(xrcd_uobj);
- }
-
-err:
+ put_cq_read(cq);
+err_put_pd:
+ put_pd_read(pd);
+err_uobj:
put_uobj_write(&obj->uevent.uobject);
- return ret;
+
+ return err;
}
-ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
{
- struct ib_uverbs_create_srq cmd;
- struct ib_uverbs_create_xsrq xcmd;
- struct ib_uverbs_create_srq_resp resp;
- struct ib_udata udata;
- int ret;
+ struct ib_uverbs_ex_destroy_wq cmd = {};
+ struct ib_uverbs_ex_destroy_wq_resp resp = {};
+ struct ib_wq *wq;
+ struct ib_uobject *uobj;
+ struct ib_uwq_object *obj;
+ size_t required_cmd_sz;
+ size_t required_resp_len;
+ int ret;
- if (out_len < sizeof resp)
- return -ENOSPC;
+ required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle);
+ required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
- xcmd.response = cmd.response;
- xcmd.user_handle = cmd.user_handle;
- xcmd.srq_type = IB_SRQT_BASIC;
- xcmd.pd_handle = cmd.pd_handle;
- xcmd.max_wr = cmd.max_wr;
- xcmd.max_sge = cmd.max_sge;
- xcmd.srq_limit = cmd.srq_limit;
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
- ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
if (ret)
return ret;
- return in_len;
-}
-
-ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len, int out_len)
-{
- struct ib_uverbs_create_xsrq cmd;
- struct ib_uverbs_create_srq_resp resp;
- struct ib_udata udata;
- int ret;
-
- if (out_len < sizeof resp)
- return -ENOSPC;
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ resp.response_length = required_resp_len;
+ uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle,
+ file->ucontext);
+ if (!uobj)
+ return -EINVAL;
- INIT_UDATA(&udata, buf + sizeof cmd,
- (unsigned long) cmd.response + sizeof resp,
- in_len - sizeof cmd, out_len - sizeof resp);
+ wq = uobj->object;
+ obj = container_of(uobj, struct ib_uwq_object, uevent.uobject);
+ ret = ib_destroy_wq(wq);
+ if (!ret)
+ uobj->live = 0;
- ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ put_uobj_write(uobj);
if (ret)
return ret;
- return in_len;
-}
+ idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
-ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
-{
- struct ib_uverbs_modify_srq cmd;
- struct ib_udata udata;
- struct ib_srq *srq;
- struct ib_srq_attr attr;
- int ret;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
- out_len);
-
- srq = idr_read_srq(cmd.srq_handle, file->ucontext);
- if (!srq)
- return -EINVAL;
-
- attr.max_wr = cmd.max_wr;
- attr.srq_limit = cmd.srq_limit;
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
- ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+ ib_uverbs_release_uevent(file, &obj->uevent);
+ resp.events_reported = obj->uevent.events_reported;
+ put_uobj(uobj);
- put_srq_read(srq);
+ ret = ib_copy_to_udata(ucore, &resp, resp.response_length);
+ if (ret)
+ return ret;
- return ret ? ret : in_len;
+ return 0;
}
-ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
- const char __user *buf,
- int in_len, int out_len)
+int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
{
- struct ib_uverbs_query_srq cmd;
- struct ib_uverbs_query_srq_resp resp;
- struct ib_srq_attr attr;
- struct ib_srq *srq;
- int ret;
-
- if (out_len < sizeof resp)
- return -ENOSPC;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
+ struct ib_uverbs_ex_modify_wq cmd = {};
+ struct ib_wq *wq;
+ struct ib_wq_attr wq_attr = {};
+ size_t required_cmd_sz;
+ int ret;
- srq = idr_read_srq(cmd.srq_handle, file->ucontext);
- if (!srq)
+ required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state);
+ if (ucore->inlen < required_cmd_sz)
return -EINVAL;
- ret = ib_query_srq(srq, &attr);
-
- put_srq_read(srq);
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
if (ret)
- return ret;
+ return ret;
- memset(&resp, 0, sizeof resp);
+ if (!cmd.attr_mask)
+ return -EINVAL;
- resp.max_wr = attr.max_wr;
- resp.max_sge = attr.max_sge;
- resp.srq_limit = attr.srq_limit;
+ if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE))
+ return -EINVAL;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp))
- return -EFAULT;
+ wq = idr_read_wq(cmd.wq_handle, file->ucontext);
+ if (!wq)
+ return -EINVAL;
- return in_len;
+ wq_attr.curr_wq_state = cmd.curr_wq_state;
+ wq_attr.wq_state = cmd.wq_state;
+ ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw);
+ put_wq_read(wq);
+ return ret;
}
-ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
{
- struct ib_uverbs_destroy_srq cmd;
- struct ib_uverbs_destroy_srq_resp resp;
- struct ib_uobject *uobj;
- struct ib_srq *srq;
- struct ib_uevent_object *obj;
- int ret = -EINVAL;
- struct ib_usrq_object *us;
- enum ib_srq_type srq_type;
-
- if (copy_from_user(&cmd, buf, sizeof cmd))
- return -EFAULT;
-
- uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
- if (!uobj)
+ struct ib_uverbs_ex_create_rwq_ind_table cmd = {};
+ struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {};
+ struct ib_uobject *uobj;
+ int err = 0;
+ struct ib_rwq_ind_table_init_attr init_attr = {};
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ struct ib_wq **wqs = NULL;
+ u32 *wqs_handles = NULL;
+ struct ib_wq *wq = NULL;
+ int i, j, num_read_wqs;
+ u32 num_wq_handles;
+ u32 expected_in_size;
+ size_t required_cmd_sz_header;
+ size_t required_resp_len;
+
+ required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size);
+ required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num);
+
+ if (ucore->inlen < required_cmd_sz_header)
return -EINVAL;
- srq = uobj->object;
- obj = container_of(uobj, struct ib_uevent_object, uobject);
- srq_type = srq->srq_type;
-
- ret = ib_destroy_srq(srq);
- if (!ret)
- uobj->live = 0;
-
- put_uobj_write(uobj);
-
- if (ret)
- return ret;
-
- if (srq_type == IB_SRQT_XRC) {
- us = container_of(obj, struct ib_usrq_object, uevent);
- atomic_dec(&us->uxrcd->refcnt);
- }
-
- idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
-
- mutex_lock(&file->mutex);
- list_del(&uobj->list);
- mutex_unlock(&file->mutex);
- ib_uverbs_release_uevent(file, obj);
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
- memset(&resp, 0, sizeof resp);
- resp.events_reported = obj->events_reported;
+ err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header);
+ if (err)
+ return err;
- put_uobj(uobj);
+ ucore->inbuf = (const char *)ucore->inbuf + required_cmd_sz_header;
+ ucore->inlen -= required_cmd_sz_header;
- if (copy_to_user((void __user *) (unsigned long) cmd.response,
- &resp, sizeof resp))
- ret = -EFAULT;
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
- return ret ? ret : in_len;
-}
+ if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE)
+ return -EINVAL;
-ssize_t ib_uverbs_exp_create_dct(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
-{
- int in_len = ucore->inlen + uhw->inlen;
- int out_len = ucore->outlen + uhw->outlen;
- struct ib_uverbs_create_dct cmd;
- struct ib_uverbs_create_dct_resp resp;
- struct ib_udata udata;
- struct ib_udct_object *obj;
- struct ib_dct *dct;
- int ret;
- struct ib_dct_init_attr attr;
- struct ib_pd *pd = NULL;
- struct ib_cq *cq = NULL;
- struct ib_srq *srq = NULL;
+ num_wq_handles = 1 << cmd.log_ind_tbl_size;
+ expected_in_size = num_wq_handles * sizeof(__u32);
+ if (num_wq_handles == 1)
+ /* input size for wq handles is u64 aligned */
+ expected_in_size += sizeof(__u32);
- if (out_len < sizeof(resp))
- return -ENOSPC;
+ if (ucore->inlen < expected_in_size)
+ return -EINVAL;
- ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd));
- if (ret)
- return ret;
+ if (ucore->inlen > expected_in_size &&
+ !ib_is_udata_cleared(ucore, expected_in_size,
+ ucore->inlen - expected_in_size))
+ return -EOPNOTSUPP;
- obj = kmalloc(sizeof(*obj), GFP_KERNEL);
- if (!obj)
+ wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles),
+ GFP_KERNEL);
+ if (!wqs_handles)
return -ENOMEM;
- init_uobj(&obj->uobject, cmd.user_handle, file->ucontext,
- &dct_lock_class);
- down_write(&obj->uobject.mutex);
+ err = ib_copy_from_udata(wqs_handles, ucore,
+ num_wq_handles * sizeof(__u32));
+ if (err)
+ goto err_free;
- pd = idr_read_pd(cmd.pd_handle, file->ucontext);
- if (!pd) {
- ret = -EINVAL;
- goto err_pd;
+ wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL);
+ if (!wqs) {
+ err = -ENOMEM;
+ goto err_free;
}
- cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
- if (!cq) {
- ret = -EINVAL;
- goto err_put;
- }
+ for (num_read_wqs = 0; num_read_wqs < num_wq_handles;
+ num_read_wqs++) {
+ wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext);
+ if (!wq) {
+ err = -EINVAL;
+ goto put_wqs;
+ }
- srq = idr_read_srq(cmd.srq_handle, file->ucontext);
- if (!srq) {
- ret = -EINVAL;
- goto err_put;
+ wqs[num_read_wqs] = wq;
}
- attr.cq = cq;
- attr.access_flags = cmd.access_flags;
- attr.min_rnr_timer = cmd.min_rnr_timer;
- attr.srq = srq;
- attr.tclass = cmd.tclass;
- attr.flow_label = cmd.flow_label;
- attr.dc_key = cmd.dc_key;
- attr.mtu = cmd.mtu;
- attr.port = cmd.port;
- attr.pkey_index = cmd.pkey_index;
- attr.gid_index = cmd.gid_index;
- attr.hop_limit = cmd.hop_limit;
- attr.create_flags = cmd.create_flags;
-
- dct = ib_create_dct(pd, &attr, &udata);
- if (IS_ERR(dct)) {
- ret = PTR_ERR(dct);
- goto err_put;
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+ if (!uobj) {
+ err = -ENOMEM;
+ goto put_wqs;
}
- dct->device = file->device->ib_dev;
- dct->uobject = &obj->uobject;
-
- obj->uobject.object = dct;
- ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uobject);
- if (ret)
- goto err_dct;
-
- memset(&resp, 0, sizeof(resp));
- resp.dct_handle = obj->uobject.id;
- resp.dctn = dct->dct_num;
+ init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class);
+ down_write(&uobj->mutex);
+ init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
+ init_attr.ind_tbl = wqs;
+ rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw);
- ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp));
- if (ret)
- goto err_copy;
+ if (IS_ERR(rwq_ind_tbl)) {
+ err = PTR_ERR(rwq_ind_tbl);
+ goto err_uobj;
+ }
- mutex_lock(&file->mutex);
- list_add_tail(&obj->uobject.list, &file->ucontext->dct_list);
- mutex_unlock(&file->mutex);
+ rwq_ind_tbl->ind_tbl = wqs;
+ rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size;
+ rwq_ind_tbl->uobject = uobj;
+ uobj->object = rwq_ind_tbl;
+ rwq_ind_tbl->device = ib_dev;
+ atomic_set(&rwq_ind_tbl->usecnt, 0);
- obj->uobject.live = 1;
+ for (i = 0; i < num_wq_handles; i++)
+ atomic_inc(&wqs[i]->usecnt);
- put_srq_read(srq);
- put_cq_read(cq);
- put_pd_read(pd);
+ err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+ if (err)
+ goto destroy_ind_tbl;
- up_write(&obj->uobject.mutex);
+ resp.ind_tbl_handle = uobj->id;
+ resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num;
+ resp.response_length = required_resp_len;
- return in_len;
+ err = ib_copy_to_udata(ucore,
+ &resp, resp.response_length);
+ if (err)
+ goto err_copy;
-err_copy:
- idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uobject);
+ kfree(wqs_handles);
-err_dct:
- ib_destroy_dct(dct);
+ for (j = 0; j < num_read_wqs; j++)
+ put_wq_read(wqs[j]);
-err_put:
- if (srq)
- put_srq_read(srq);
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list);
+ mutex_unlock(&file->mutex);
- if (cq)
- put_cq_read(cq);
+ uobj->live = 1;
- put_pd_read(pd);
+ up_write(&uobj->mutex);
+ return 0;
-err_pd:
- put_uobj_write(&obj->uobject);
- return ret;
+err_copy:
+ idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+destroy_ind_tbl:
+ ib_destroy_rwq_ind_table(rwq_ind_tbl);
+err_uobj:
+ put_uobj_write(uobj);
+put_wqs:
+ for (j = 0; j < num_read_wqs; j++)
+ put_wq_read(wqs[j]);
+err_free:
+ kfree(wqs_handles);
+ kfree(wqs);
+ return err;
}
-ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
+int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
{
- int in_len = ucore->inlen + uhw->inlen;
- int out_len = ucore->outlen + uhw->outlen;
- struct ib_uverbs_destroy_dct cmd;
- struct ib_uverbs_destroy_dct_resp resp;
- struct ib_uobject *uobj;
- struct ib_dct *dct;
- struct ib_udct_object *obj;
- int ret;
+ struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {};
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ struct ib_uobject *uobj;
+ int ret;
+ struct ib_wq **ind_tbl;
+ size_t required_cmd_sz;
- if (out_len < sizeof(resp))
- return -ENOSPC;
+ required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle);
+
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
- ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd));
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
if (ret)
return ret;
- uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.user_handle, file->ucontext);
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle,
+ file->ucontext);
if (!uobj)
return -EINVAL;
+ rwq_ind_tbl = uobj->object;
+ ind_tbl = rwq_ind_tbl->ind_tbl;
- dct = uobj->object;
- obj = container_of(dct->uobject, struct ib_udct_object, uobject);
-
- ret = ib_destroy_dct(dct);
+ ret = ib_destroy_rwq_ind_table(rwq_ind_tbl);
if (!ret)
uobj->live = 0;
@@ -3225,138 +3612,25 @@ ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file,
if (ret)
return ret;
- idr_remove_uobj(&ib_uverbs_dct_idr, uobj);
+ idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
mutex_lock(&file->mutex);
list_del(&uobj->list);
mutex_unlock(&file->mutex);
- memset(&resp, 0, sizeof(resp));
-
put_uobj(uobj);
-
- ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp));
- if (ret)
- return ret;
-
- return in_len;
-}
-
-ssize_t ib_uverbs_exp_query_dct(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw)
-{
- int in_len = ucore->inlen + uhw->inlen;
- int out_len = ucore->outlen + uhw->outlen;
- struct ib_uverbs_query_dct cmd;
- struct ib_uverbs_query_dct_resp resp;
- struct ib_dct *dct;
- struct ib_dct_attr *attr;
- int err;
-
- if (out_len < sizeof(resp))
- return -ENOSPC;
-
- err = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd));
- if (err)
- return err;
-
- attr = kmalloc(sizeof(*attr), GFP_KERNEL);
- if (!attr) {
- err = -ENOMEM;
- goto out;
- }
-
- dct = idr_read_dct(cmd.dct_handle, file->ucontext);
- if (!dct) {
- err = -EINVAL;
- goto out;
- }
-
- err = ib_query_dct(dct, attr);
-
- put_dct_read(dct);
-
- if (err)
- goto out;
-
- memset(&resp, 0, sizeof(resp));
-
- resp.dc_key = attr->dc_key;
- resp.access_flags = attr->access_flags;
- resp.flow_label = attr->flow_label;
- resp.key_violations = attr->key_violations;
- resp.port = attr->port;
- resp.min_rnr_timer = attr->min_rnr_timer;
- resp.tclass = attr->tclass;
- resp.mtu = attr->mtu;
- resp.pkey_index = attr->pkey_index;
- resp.gid_index = attr->gid_index;
- resp.hop_limit = attr->hop_limit;
- resp.state = attr->state;
-
- err = ucore->ops->copy_to(ucore, &resp, sizeof(resp));
-
-out:
- kfree(attr);
-
- return err ? err : in_len;
-}
-
-/*
- * Experimental functions
- */
-
-static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
-
-static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
- union ib_flow_spec *ib_spec)
-{
- ib_spec->type = kern_spec->type;
-
- switch (ib_spec->type) {
- case IB_FLOW_SPEC_ETH:
- ib_spec->eth.size = sizeof(struct ib_flow_spec_eth);
- memcpy(&ib_spec->eth.val, &kern_spec->eth.val,
- sizeof(struct ib_flow_eth_filter));
- memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask,
- sizeof(struct ib_flow_eth_filter));
- break;
- case IB_FLOW_SPEC_IB:
- ib_spec->ib.size = sizeof(struct ib_flow_spec_ib);
- memcpy(&ib_spec->ib.val, &kern_spec->ib.val,
- sizeof(struct ib_flow_ib_filter));
- memcpy(&ib_spec->ib.mask, &kern_spec->ib.mask,
- sizeof(struct ib_flow_ib_filter));
- break;
- case IB_FLOW_SPEC_IPV4:
- ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4);
- memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val,
- sizeof(struct ib_flow_ipv4_filter));
- memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
- sizeof(struct ib_flow_ipv4_filter));
- break;
- case IB_FLOW_SPEC_TCP:
- case IB_FLOW_SPEC_UDP:
- ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
- memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val,
- sizeof(struct ib_flow_tcp_udp_filter));
- memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask,
- sizeof(struct ib_flow_tcp_udp_filter));
- break;
- default:
- return -EINVAL;
- }
- return 0;
+ kfree(ind_tbl);
+ return ret;
}
int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
struct ib_uverbs_create_flow cmd;
struct ib_uverbs_create_flow_resp resp;
- struct ib_uobject *uobj;
+ struct ib_uobject *uobj;
struct ib_flow *flow_id;
struct ib_uverbs_flow_attr *kern_flow_attr;
struct ib_flow_attr *flow_attr;
@@ -3366,6 +3640,9 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
void *ib_spec;
int i;
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
if (ucore->outlen < sizeof(resp))
return -ENOSPC;
@@ -3373,15 +3650,23 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
if (err)
return err;
- ucore->inbuf += sizeof(cmd);
+ ucore->inbuf = (const char *)ucore->inbuf + sizeof(cmd);
ucore->inlen -= sizeof(cmd);
if (cmd.comp_mask)
return -EINVAL;
- if (priv_check(curthread, PRIV_NET_RAW) && !disable_raw_qp_enforcement)
+ if (priv_check(curthread, PRIV_NET_RAW) != 0)
return -EPERM;
+ if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
+ return -EINVAL;
+
+ if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) &&
+ ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) ||
+ (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT)))
+ return -EINVAL;
+
if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
return -EINVAL;
@@ -3390,11 +3675,15 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
(cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
return -EINVAL;
+ if (cmd.flow_attr.reserved[0] ||
+ cmd.flow_attr.reserved[1])
+ return -EINVAL;
+
if (cmd.flow_attr.num_of_specs) {
- kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) +
- cmd.flow_attr.size, GFP_KERNEL);
+ kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+ GFP_KERNEL);
if (!kern_flow_attr)
- return -ENOMEM;
+ return -ENOMEM;
memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
@@ -3419,8 +3708,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
goto err_uobj;
}
- flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size,
- GFP_KERNEL);
+ flow_attr = kzalloc(sizeof(*flow_attr) + cmd.flow_attr.num_of_specs *
+ sizeof(union ib_flow_spec), GFP_KERNEL);
if (!flow_attr) {
err = -ENOMEM;
goto err_put;
@@ -3436,25 +3725,24 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
kern_spec = kern_flow_attr + 1;
ib_spec = flow_attr + 1;
for (i = 0; i < flow_attr->num_of_specs &&
- cmd.flow_attr.size >
- offsetof(struct ib_uverbs_flow_spec, reserved) &&
+ cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
cmd.flow_attr.size >=
((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
err = kern_spec_to_ib_spec(kern_spec, ib_spec);
if (err)
goto err_free;
flow_attr->size +=
- ((union ib_flow_spec *)ib_spec)->size;
- cmd.flow_attr.size -=
- ((struct ib_uverbs_flow_spec *)kern_spec)->size;
- kern_spec += ((struct ib_uverbs_flow_spec *)kern_spec)->size;
- ib_spec += ((union ib_flow_spec *)ib_spec)->size;
+ ((union ib_flow_spec *) ib_spec)->size;
+ cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+ kern_spec = (char *)kern_spec + ((struct ib_uverbs_flow_spec *) kern_spec)->size;
+ ib_spec = (char *)ib_spec + ((union ib_flow_spec *)ib_spec)->size;
}
if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
i, cmd.flow_attr.size);
+ err = -EINVAL;
goto err_free;
- }
+ }
flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
if (IS_ERR(flow_id)) {
err = PTR_ERR(flow_id);
@@ -3505,6 +3793,7 @@ err_free_attr:
}
int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -3513,10 +3802,16 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
struct ib_uobject *uobj;
int ret;
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
if (ret)
return ret;
+ if (cmd.comp_mask)
+ return -EINVAL;
+
uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
file->ucontext);
if (!uobj)
@@ -3540,375 +3835,417 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
return ret;
}
-ssize_t ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file,
- struct ib_udata *ucore, struct ib_udata *uhw)
+static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_uverbs_create_xsrq *cmd,
+ struct ib_udata *udata)
{
- const char __user *buf = ucore->inbuf;
- int in_len = ucore->inlen + uhw->inlen;
- int out_len = ucore->outlen + uhw->outlen;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_usrq_object *obj;
+ struct ib_pd *pd;
+ struct ib_srq *srq;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_srq_init_attr attr;
+ int ret;
- return __uverbs_modify_qp(file, buf, in_len, out_len,
- IB_USER_VERBS_CMD_EXTENDED);
-}
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj);
+ if (!attr.ext.xrc.xrcd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+
+ attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0);
+ if (!attr.ext.xrc.cq) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+ }
+
+ pd = idr_read_pd(cmd->pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_put_cq;
+ }
+
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ attr.srq_context = file;
+ attr.srq_type = cmd->srq_type;
+ attr.attr.max_wr = cmd->max_wr;
+ attr.attr.max_sge = cmd->max_sge;
+ attr.attr.srq_limit = cmd->srq_limit;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+
+ srq = pd->device->create_srq(pd, &attr, udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err_put;
+ }
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->srq_type = cmd->srq_type;
+ srq->uobject = &obj->uevent.uobject;
+ srq->event_handler = attr.event_handler;
+ srq->srq_context = attr.srq_context;
-ssize_t ib_uverbs_exp_create_cq(struct ib_uverbs_file *file,
- struct ib_udata *ucore, struct ib_udata *uhw)
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.cq = attr.ext.xrc.cq;
+ srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
+ atomic_inc(&attr.ext.xrc.cq->usecnt);
+ atomic_inc(&attr.ext.xrc.xrcd->usecnt);
+ }
+
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+
+ obj->uevent.uobject.object = srq;
+ ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.srq_handle = obj->uevent.uobject.id;
+ resp.max_wr = attr.attr.max_wr;
+ resp.max_sge = attr.attr.max_sge;
+ if (cmd->srq_type == IB_SRQT_XRC)
+ resp.srqn = srq->ext.xrc.srq_num;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd->response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ put_uobj_read(xrcd_uobj);
+ put_cq_read(attr.ext.xrc.cq);
+ }
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+
+ up_write(&obj->uevent.uobject.mutex);
+
+ return 0;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_srq(srq);
+
+err_put:
+ put_pd_read(pd);
+
+err_put_cq:
+ if (cmd->srq_type == IB_SRQT_XRC)
+ put_cq_read(attr.ext.xrc.cq);
+
+err_put_xrcd:
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ atomic_dec(&obj->uxrcd->refcnt);
+ put_uobj_read(xrcd_uobj);
+ }
+
+err:
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
- const char __user *buf = ucore->inbuf;
- int in_len = ucore->inlen + uhw->inlen;
- int out_len = ucore->outlen + uhw->outlen;
- struct ib_uverbs_create_cq_ex cmd;
+ struct ib_uverbs_create_srq cmd;
+ struct ib_uverbs_create_xsrq xcmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ int ret;
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- return create_cq(file, buf, in_len, out_len, &cmd,
- IB_USER_VERBS_CMD_EXTENDED, ucore->outbuf);
+ xcmd.response = cmd.response;
+ xcmd.user_handle = cmd.user_handle;
+ xcmd.srq_type = IB_SRQT_BASIC;
+ xcmd.pd_handle = cmd.pd_handle;
+ xcmd.max_wr = cmd.max_wr;
+ xcmd.max_sge = cmd.max_sge;
+ xcmd.srq_limit = cmd.srq_limit;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof resp);
+
+ ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
+ if (ret)
+ return ret;
+
+ return in_len;
}
-ssize_t ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file,
- struct ib_udata *ucore, struct ib_udata *uhw)
+ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len, int out_len)
{
- const char __user *buf = ucore->inbuf;
- int in_len = ucore->inlen + uhw->inlen;
- struct ib_uverbs_modify_cq_ex cmd;
- struct ib_cq *cq;
- struct ib_cq_attr attr;
+ struct ib_uverbs_create_xsrq cmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
int ret;
- if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
- if (!cq)
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr),
+ out_len - sizeof resp);
+
+ ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
+ if (ret)
+ return ret;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_srq cmd;
+ struct ib_udata udata;
+ struct ib_srq *srq;
+ struct ib_srq_attr attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+ out_len);
+
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
return -EINVAL;
- attr.moderation.cq_count = cmd.cq_count;
- attr.moderation.cq_period = cmd.cq_period;
- attr.cq_cap_flags = cmd.cq_cap_flags;
+ attr.max_wr = cmd.max_wr;
+ attr.srq_limit = cmd.srq_limit;
- ret = ib_modify_cq(cq, &attr, cmd.attr_mask);
+ ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
- put_cq_read(cq);
+ put_srq_read(srq);
return ret ? ret : in_len;
}
-
-ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file,
- struct ib_udata *ucore, struct ib_udata *uhw)
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf,
+ int in_len, int out_len)
{
- struct ib_uverbs_exp_query_device_resp resp;
- struct ib_exp_device_attr exp_attr;
- int ret;
+ struct ib_uverbs_query_srq cmd;
+ struct ib_uverbs_query_srq_resp resp;
+ struct ib_srq_attr attr;
+ struct ib_srq *srq;
+ int ret;
- if (ucore->outlen + uhw->outlen < sizeof(resp))
+ if (out_len < sizeof resp)
return -ENOSPC;
- memset(&resp, 0, sizeof(resp));
- memset(&exp_attr, 0, sizeof(exp_attr));
- ret = ib_exp_query_device(file->device->ib_dev, &exp_attr);
- if (ret)
- return ret;
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
- ib_uverbs_query_device_assign(&resp.base, &exp_attr.base, file);
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
+ return -EINVAL;
- resp.comp_mask = 0;
- resp.device_cap_flags2 = 0;
+ ret = ib_query_srq(srq, &attr);
- /*
- * Handle regular attr fields
- */
- if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) {
- resp.timestamp_mask = exp_attr.base.timestamp_mask;
- resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK;
- }
+ put_srq_read(srq);
- if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) {
- resp.hca_core_clock = exp_attr.base.hca_core_clock;
- resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK;
- }
+ if (ret)
+ return ret;
- /*
- * Handle experimental attr fields
- */
- if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) {
- resp.device_cap_flags2 = exp_attr.device_cap_flags2;
- resp.comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2;
- }
+ memset(&resp, 0, sizeof resp);
- if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) {
- resp.dc_rd_req = exp_attr.dc_rd_req;
- resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD;
- }
+ resp.max_wr = attr.max_wr;
+ resp.max_sge = attr.max_sge;
+ resp.srq_limit = attr.srq_limit;
- if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_RES_RD) {
- resp.dc_rd_res = exp_attr.dc_rd_res;
- resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD;
- }
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
- if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) {
- resp.inline_recv_sz = exp_attr.inline_recv_sz;
- resp.comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ;
- }
+ return in_len;
+}
- if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) {
- resp.max_rss_tbl_sz = exp_attr.max_rss_tbl_sz;
- resp.comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ;
- }
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_srq cmd;
+ struct ib_uverbs_destroy_srq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_srq *srq;
+ struct ib_uevent_object *obj;
+ int ret = -EINVAL;
+ struct ib_usrq_object *us;
+ enum ib_srq_type srq_type;
- if (copy_to_user(ucore->outbuf, &resp, sizeof(resp)))
+ if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- return ucore->inlen + uhw->inlen;
-}
+ uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ srq = uobj->object;
+ obj = container_of(uobj, struct ib_uevent_object, uobject);
+ srq_type = srq->srq_type;
-ssize_t ib_uverbs_exp_create_qp(struct ib_uverbs_file *file,
- struct ib_udata *ucore, struct ib_udata *uhw)
-{
- struct ib_uqp_object *obj;
- struct ib_device *device;
- struct ib_pd *pd = NULL;
- struct ib_xrcd *xrcd = NULL;
- struct ib_uobject *uninitialized_var(xrcd_uobj);
- struct ib_cq *scq = NULL, *rcq = NULL;
- struct ib_srq *srq = NULL;
- struct ib_qp *qp;
- struct ib_exp_qp_init_attr attr;
- int ret;
- struct ib_uverbs_exp_create_qp cmd_exp;
- struct ib_uverbs_exp_create_qp_resp resp_exp;
- struct ib_qp *parentqp = NULL;
+ ret = ib_destroy_srq(srq);
+ if (!ret)
+ uobj->live = 0;
- memset(&cmd_exp, 0, sizeof(cmd_exp));
+ put_uobj_write(uobj);
- ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp));
if (ret)
return ret;
- if (!disable_raw_qp_enforcement &&
- cmd_exp.qp_type == IB_QPT_RAW_PACKET && priv_check(curthread,
- PRIV_NET_RAW))
- return -EPERM;
+ if (srq_type == IB_SRQT_XRC) {
+ us = container_of(obj, struct ib_usrq_object, uevent);
+ atomic_dec(&us->uxrcd->refcnt);
+ }
- obj = kzalloc(sizeof(*obj), GFP_KERNEL);
- if (!obj)
- return -ENOMEM;
+ idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
- init_uobj(&obj->uevent.uobject, cmd_exp.user_handle, file->ucontext,
- &qp_lock_class);
- down_write(&obj->uevent.uobject.mutex);
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
- if (cmd_exp.qp_type == IB_QPT_XRC_TGT) {
- xrcd = idr_read_xrcd(cmd_exp.pd_handle, file->ucontext, &xrcd_uobj);
- if (!xrcd) {
- ret = -EINVAL;
- goto err_put;
- }
- device = xrcd->device;
- } else {
- if (cmd_exp.qp_type == IB_QPT_XRC_INI) {
- cmd_exp.max_recv_wr = 0;
- cmd_exp.max_recv_sge = 0;
- } else {
- if (cmd_exp.is_srq) {
- srq = idr_read_srq(cmd_exp.srq_handle, file->ucontext);
- if (!srq || srq->srq_type != IB_SRQT_BASIC) {
- ret = -EINVAL;
- goto err_put;
- }
- }
+ ib_uverbs_release_uevent(file, obj);
- if (cmd_exp.recv_cq_handle != cmd_exp.send_cq_handle) {
- rcq = idr_read_cq(cmd_exp.recv_cq_handle, file->ucontext, 0);
- if (!rcq) {
- ret = -EINVAL;
- goto err_put;
- }
- }
- }
+ memset(&resp, 0, sizeof resp);
+ resp.events_reported = obj->events_reported;
- scq = idr_read_cq(cmd_exp.send_cq_handle, file->ucontext, !!rcq);
- rcq = rcq ?: scq;
- pd = idr_read_pd(cmd_exp.pd_handle, file->ucontext);
- if (!pd || !scq) {
- ret = -EINVAL;
- goto err_put;
- }
+ put_uobj(uobj);
- device = pd->device;
- }
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
- memset(&attr, 0, sizeof(attr));
- attr.event_handler = ib_uverbs_qp_event_handler;
- attr.qp_context = file;
- attr.send_cq = scq;
- attr.recv_cq = rcq;
- attr.srq = srq;
- attr.xrcd = xrcd;
- attr.sq_sig_type = cmd_exp.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
- attr.qp_type = cmd_exp.qp_type;
- attr.create_flags = 0;
+ return ret ? ret : in_len;
+}
- attr.cap.max_send_wr = cmd_exp.max_send_wr;
- attr.cap.max_recv_wr = cmd_exp.max_recv_wr;
- attr.cap.max_send_sge = cmd_exp.max_send_sge;
- attr.cap.max_recv_sge = cmd_exp.max_recv_sge;
- attr.cap.max_inline_data = cmd_exp.max_inline_data;
-
- if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS)
- attr.create_flags |= cmd_exp.qp_cap_flags &
- (IB_QP_CREATE_CROSS_CHANNEL |
- IB_QP_CREATE_MANAGED_SEND |
- IB_QP_CREATE_MANAGED_RECV);
-
- if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) {
- struct ib_uverbs_qpg *qpg;
- if (cmd_exp.qp_type != IB_QPT_RAW_PACKET &&
- cmd_exp.qp_type != IB_QPT_UD) {
- ret = -EINVAL;
- goto err_put;
- }
- qpg = &cmd_exp.qpg;
- switch (qpg->qpg_type) {
- case IB_QPG_PARENT:
- attr.parent_attrib.rss_child_count =
- qpg->parent_attrib.rss_child_count;
- attr.parent_attrib.tss_child_count =
- qpg->parent_attrib.tss_child_count;
- break;
- case IB_QPG_CHILD_RX:
- case IB_QPG_CHILD_TX:
- parentqp = idr_read_qp(qpg->parent_handle,
- file->ucontext);
- if (!parentqp) {
- ret = -EINVAL;
- goto err_put;
- }
- attr.qpg_parent = parentqp;
- break;
- default:
- ret = -EINVAL;
- goto err_put;
- }
- attr.qpg_type = qpg->qpg_type;
- }
+int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_query_device_resp resp = { {0} };
+ struct ib_uverbs_ex_query_device cmd;
+ struct ib_device_attr attr = {0};
+ int err;
- if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV)
- attr.max_inl_recv = cmd_exp.max_inl_recv;
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
- obj->uevent.events_reported = 0;
- INIT_LIST_HEAD(&obj->uevent.event_list);
- INIT_LIST_HEAD(&obj->mcast_list);
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
- if (cmd_exp.qp_type == IB_QPT_XRC_TGT)
- qp = ib_create_qp(pd, (struct ib_qp_init_attr *)&attr);
- else
- qp = device->exp_create_qp(pd, &attr, uhw);
+ if (cmd.comp_mask)
+ return -EINVAL;
- if (IS_ERR(qp)) {
- ret = PTR_ERR(qp);
- goto err_put;
- }
+ if (cmd.reserved)
+ return -EINVAL;
- if (cmd_exp.qp_type != IB_QPT_XRC_TGT) {
- qp->real_qp = qp;
- qp->device = device;
- qp->pd = pd;
- qp->send_cq = attr.send_cq;
- qp->recv_cq = attr.recv_cq;
- qp->srq = attr.srq;
- qp->event_handler = attr.event_handler;
- qp->qp_context = attr.qp_context;
- qp->qp_type = attr.qp_type;
- atomic_set(&qp->usecnt, 0);
- atomic_inc(&pd->usecnt);
- atomic_inc(&attr.send_cq->usecnt);
- if (attr.recv_cq)
- atomic_inc(&attr.recv_cq->usecnt);
- if (attr.srq)
- atomic_inc(&attr.srq->usecnt);
- }
- qp->uobject = &obj->uevent.uobject;
+ resp.response_length = offsetof(typeof(resp), odp_caps);
- obj->uevent.uobject.object = qp;
- ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
- if (ret)
- goto err_destroy;
+ if (ucore->outlen < resp.response_length)
+ return -ENOSPC;
- memset(&resp_exp, 0, sizeof(resp_exp));
- resp_exp.qpn = qp->qp_num;
- resp_exp.qp_handle = obj->uevent.uobject.id;
- resp_exp.max_recv_sge = attr.cap.max_recv_sge;
- resp_exp.max_send_sge = attr.cap.max_send_sge;
- resp_exp.max_recv_wr = attr.cap.max_recv_wr;
- resp_exp.max_send_wr = attr.cap.max_send_wr;
- resp_exp.max_inline_data = attr.cap.max_inline_data;
+ err = ib_dev->query_device(ib_dev, &attr, uhw);
+ if (err)
+ return err;
- if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) {
- resp_exp.comp_mask |= IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV;
- resp_exp.max_inl_recv = attr.max_inl_recv;
- }
+ copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
- ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp));
- if (ret)
- goto err_copy;
+ if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
+ goto end;
- if (xrcd) {
- obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
- atomic_inc(&obj->uxrcd->refcnt);
- put_xrcd_read(xrcd_uobj);
- }
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ resp.odp_caps.general_caps = attr.odp_caps.general_caps;
+ resp.odp_caps.per_transport_caps.rc_odp_caps =
+ attr.odp_caps.per_transport_caps.rc_odp_caps;
+ resp.odp_caps.per_transport_caps.uc_odp_caps =
+ attr.odp_caps.per_transport_caps.uc_odp_caps;
+ resp.odp_caps.per_transport_caps.ud_odp_caps =
+ attr.odp_caps.per_transport_caps.ud_odp_caps;
+#endif
+ resp.response_length += sizeof(resp.odp_caps);
- if (pd)
- put_pd_read(pd);
- if (scq)
- put_cq_read(scq);
- if (rcq && rcq != scq)
- put_cq_read(rcq);
- if (srq)
- put_srq_read(srq);
- if (parentqp)
- put_qp_read(parentqp);
+ if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask))
+ goto end;
- mutex_lock(&file->mutex);
- list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
- mutex_unlock(&file->mutex);
+ resp.timestamp_mask = attr.timestamp_mask;
+ resp.response_length += sizeof(resp.timestamp_mask);
- obj->uevent.uobject.live = 1;
+ if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock))
+ goto end;
- up_write(&obj->uevent.uobject.mutex);
+ resp.hca_core_clock = attr.hca_core_clock;
+ resp.response_length += sizeof(resp.hca_core_clock);
- return ucore->inlen + uhw->inlen;
+ if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex))
+ goto end;
-err_copy:
- idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+ resp.device_cap_flags_ex = attr.device_cap_flags;
+ resp.response_length += sizeof(resp.device_cap_flags_ex);
-err_destroy:
- ib_destroy_qp(qp);
+ if (ucore->outlen < resp.response_length + sizeof(resp.rss_caps))
+ goto end;
-err_put:
- if (xrcd)
- put_xrcd_read(xrcd_uobj);
- if (pd)
- put_pd_read(pd);
- if (scq)
- put_cq_read(scq);
- if (rcq && rcq != scq)
- put_cq_read(rcq);
- if (srq)
- put_srq_read(srq);
- if (parentqp)
- put_qp_read(parentqp);
+ resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts;
+ resp.rss_caps.max_rwq_indirection_tables =
+ attr.rss_caps.max_rwq_indirection_tables;
+ resp.rss_caps.max_rwq_indirection_table_size =
+ attr.rss_caps.max_rwq_indirection_table_size;
- put_uobj_write(&obj->uevent.uobject);
- return ret;
-}
+ resp.response_length += sizeof(resp.rss_caps);
-int ib_exp_query_device(struct ib_device *device,
- struct ib_exp_device_attr *device_attr)
-{
- return device->exp_query_device(device, device_attr);
+ if (ucore->outlen < resp.response_length + sizeof(resp.max_wq_type_rq))
+ goto end;
+
+ resp.max_wq_type_rq = attr.max_wq_type_rq;
+ resp.response_length += sizeof(resp.max_wq_type_rq);
+end:
+ err = ib_copy_to_udata(ucore, &resp, resp.response_length);
+ return err;
}
-EXPORT_SYMBOL(ib_exp_query_device);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_main.c b/sys/ofed/drivers/infiniband/core/uverbs_main.c
index 95e04f5d70556..d1c38a25d8c64 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs_main.c
+++ b/sys/ofed/drivers/infiniband/core/uverbs_main.c
@@ -43,12 +43,11 @@
#include <linux/file.h>
#include <linux/cdev.h>
#include <linux/slab.h>
-#include <linux/ktime.h>
-#include <linux/rbtree.h>
-#include <linux/math64.h>
#include <asm/uaccess.h>
+#include <rdma/ib.h>
+
#include "uverbs.h"
MODULE_AUTHOR("Roland Dreier");
@@ -63,31 +62,6 @@ enum {
#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
-static int uverbs_copy_from_udata_ex(void *dest, struct ib_udata *udata, size_t len)
-{
- return copy_from_user(dest, udata->inbuf, min(udata->inlen, len)) ? -EFAULT : 0;
-}
-
-static int uverbs_copy_to_udata_ex(struct ib_udata *udata, void *src, size_t len)
-{
- return copy_to_user(udata->outbuf, src, min(udata->outlen, len)) ? -EFAULT : 0;
-}
-
-static struct ib_udata_ops uverbs_copy_ex = {
- .copy_from = uverbs_copy_from_udata_ex,
- .copy_to = uverbs_copy_to_udata_ex
-};
-
-#define INIT_UDATA_EX(udata, ibuf, obuf, ilen, olen) \
- do { \
- (udata)->ops = &uverbs_copy_ex; \
- (udata)->inbuf = (void __user *)(unsigned long)(ibuf); \
- (udata)->outbuf = (void __user *)(unsigned long)(obuf); \
- (udata)->inlen = (ilen); \
- (udata)->outlen = (olen); \
- } while (0)
-
-
static struct class *uverbs_class;
DEFINE_SPINLOCK(ib_uverbs_idr_lock);
@@ -100,44 +74,47 @@ DEFINE_IDR(ib_uverbs_qp_idr);
DEFINE_IDR(ib_uverbs_srq_idr);
DEFINE_IDR(ib_uverbs_xrcd_idr);
DEFINE_IDR(ib_uverbs_rule_idr);
-DEFINE_IDR(ib_uverbs_dct_idr);
+DEFINE_IDR(ib_uverbs_wq_idr);
+DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr);
static DEFINE_SPINLOCK(map_lock);
static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len) = {
- [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
- [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device,
- [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port,
- [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd,
- [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
- [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
- [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
+ [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
+ [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device,
+ [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port,
+ [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd,
+ [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
+ [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
+ [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr,
+ [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
[IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw,
[IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw,
[IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
- [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
- [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
- [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq,
- [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq,
- [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq,
- [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp,
- [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp,
- [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp,
- [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp,
- [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send,
- [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv,
- [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv,
- [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah,
- [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah,
- [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast,
- [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast,
- [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq,
- [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
- [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
- [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+ [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
+ [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
+ [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq,
+ [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq,
+ [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq,
+ [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp,
+ [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp,
+ [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp,
+ [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp,
+ [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send,
+ [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv,
+ [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv,
+ [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah,
+ [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah,
+ [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast,
+ [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast,
+ [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq,
+ [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
+ [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
+ [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
[IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd,
[IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd,
[IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq,
@@ -145,36 +122,48 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
};
static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw) = {
[IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
[IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
-};
-
-static ssize_t (*uverbs_exp_cmd_table[])(struct ib_uverbs_file *file,
- struct ib_udata *ucore,
- struct ib_udata *uhw) = {
- [IB_USER_VERBS_EXP_CMD_CREATE_QP] = ib_uverbs_exp_create_qp,
- [IB_USER_VERBS_EXP_CMD_MODIFY_CQ] = ib_uverbs_exp_modify_cq,
- [IB_USER_VERBS_EXP_CMD_MODIFY_QP] = ib_uverbs_exp_modify_qp,
- [IB_USER_VERBS_EXP_CMD_CREATE_CQ] = ib_uverbs_exp_create_cq,
- [IB_USER_VERBS_EXP_CMD_QUERY_DEVICE] = ib_uverbs_exp_query_device,
- [IB_USER_VERBS_EXP_CMD_CREATE_DCT] = ib_uverbs_exp_create_dct,
- [IB_USER_VERBS_EXP_CMD_DESTROY_DCT] = ib_uverbs_exp_destroy_dct,
- [IB_USER_VERBS_EXP_CMD_QUERY_DCT] = ib_uverbs_exp_query_dct,
+ [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device,
+ [IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq,
+ [IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp,
+ [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq,
+ [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq,
+ [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq,
+ [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table,
+ [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table,
};
static void ib_uverbs_add_one(struct ib_device *device);
-static void ib_uverbs_remove_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
-static void ib_uverbs_release_dev(struct kref *ref)
+int uverbs_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd = mw->pd;
+ int ret;
+
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+ return ret;
+}
+
+static void ib_uverbs_release_dev(struct kobject *kobj)
{
struct ib_uverbs_device *dev =
- container_of(ref, struct ib_uverbs_device, ref);
+ container_of(kobj, struct ib_uverbs_device, kobj);
- complete(&dev->comp);
+ cleanup_srcu_struct(&dev->disassociate_srcu);
+ kfree(dev);
}
+static struct kobj_type ib_uverbs_dev_ktype = {
+ .release = ib_uverbs_release_dev,
+};
+
static void ib_uverbs_release_event_file(struct kref *ref)
{
struct ib_uverbs_event_file *file =
@@ -237,10 +226,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
struct ib_ucontext *context)
{
struct ib_uobject *uobj, *tmp;
- int err;
-
- if (!context)
- return 0;
context->closing = 1;
@@ -257,13 +242,10 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
struct ib_mw *mw = uobj->object;
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
- err = ib_dealloc_mw(mw);
- if (err) {
- pr_info("user_verbs: couldn't deallocate MW during cleanup.\n");
- pr_info("user_verbs: the system may have become unstable.\n");
- }
+ uverbs_dealloc_mw(mw);
kfree(uobj);
}
+
list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) {
struct ib_flow *flow_id = uobj->object;
@@ -278,28 +260,32 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
container_of(uobj, struct ib_uqp_object, uevent.uobject);
idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
-
- ib_uverbs_detach_umcast(qp, uqp);
- err = ib_destroy_qp(qp);
- if (err)
- pr_info("destroying uverbs qp failed: err %d\n", err);
-
+ if (qp == qp->real_qp)
+ ib_uverbs_detach_umcast(qp, uqp);
+ ib_destroy_qp(qp);
ib_uverbs_release_uevent(file, &uqp->uevent);
kfree(uqp);
}
- list_for_each_entry_safe(uobj, tmp, &context->dct_list, list) {
- struct ib_dct *dct = uobj->object;
- struct ib_udct_object *udct =
- container_of(uobj, struct ib_udct_object, uobject);
+ list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) {
+ struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object;
+ struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
- idr_remove_uobj(&ib_uverbs_dct_idr, uobj);
+ idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+ ib_destroy_rwq_ind_table(rwq_ind_tbl);
+ kfree(ind_tbl);
+ kfree(uobj);
+ }
- err = ib_destroy_dct(dct);
- if (err)
- pr_info("destroying uverbs dct failed: err %d\n", err);
+ list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) {
+ struct ib_wq *wq = uobj->object;
+ struct ib_uwq_object *uwq =
+ container_of(uobj, struct ib_uwq_object, uevent.uobject);
- kfree(udct);
+ idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
+ ib_destroy_wq(wq);
+ ib_uverbs_release_uevent(file, &uwq->uevent);
+ kfree(uwq);
}
list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
@@ -308,9 +294,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
container_of(uobj, struct ib_uevent_object, uobject);
idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
- err = ib_destroy_srq(srq);
- if (err)
- pr_info("destroying uverbs srq failed: err %d\n", err);
+ ib_destroy_srq(srq);
ib_uverbs_release_uevent(file, uevent);
kfree(uevent);
}
@@ -322,10 +306,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
container_of(uobj, struct ib_ucq_object, uobject);
idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
- err = ib_destroy_cq(cq);
- if (err)
- pr_info("destroying uverbs cq failed: err %d\n", err);
-
+ ib_destroy_cq(cq);
ib_uverbs_release_ucq(file, ev_file, ucq);
kfree(ucq);
}
@@ -334,11 +315,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
struct ib_mr *mr = uobj->object;
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
- err = ib_dereg_mr(mr);
- if (err) {
- pr_info("user_verbs: couldn't deregister an MR during cleanup.\n");
- pr_info("user_verbs: the system may have become unstable.\n");
- }
+ ib_dereg_mr(mr);
kfree(uobj);
}
@@ -362,16 +339,32 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
kfree(uobj);
}
+ put_pid(context->tgid);
+
return context->device->dealloc_ucontext(context);
}
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+ complete(&dev->comp);
+}
+
static void ib_uverbs_release_file(struct kref *ref)
{
struct ib_uverbs_file *file =
container_of(ref, struct ib_uverbs_file, ref);
+ struct ib_device *ib_dev;
+ int srcu_key;
- module_put(file->device->ib_dev->owner);
- kref_put(&file->device->ref, ib_uverbs_release_dev);
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (ib_dev && !ib_dev->disassociate_ucontext)
+ module_put(ib_dev->owner);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+
+ if (atomic_dec_and_test(&file->device->refcount))
+ ib_uverbs_comp_dev(file->device);
kfree(file);
}
@@ -393,9 +386,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
return -EAGAIN;
if (wait_event_interruptible(file->poll_wait,
- !list_empty(&file->event_list)))
+ (!list_empty(&file->event_list) ||
+ /* The barriers built into wait_event_interruptible()
+			 * and wake_up() guarantee this will see the null set
+ * without using RCU
+ */
+ !file->uverbs_file->device->ib_dev)))
return -ERESTARTSYS;
+	/* If the device was disassociated and no event exists, set an error */
+ if (list_empty(&file->event_list) &&
+ !file->uverbs_file->device->ib_dev)
+ return -EIO;
+
spin_lock_irq(&file->lock);
}
@@ -437,7 +440,6 @@ static unsigned int ib_uverbs_event_poll(struct file *filp,
unsigned int pollflags = 0;
struct ib_uverbs_event_file *file = filp->private_data;
- file->filp = filp;
poll_wait(filp, &file->poll_wait, wait);
spin_lock_irq(&file->lock);
@@ -459,8 +461,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_event_file *file = filp->private_data;
struct ib_uverbs_event *entry, *tmp;
+ int closed_already = 0;
+ mutex_lock(&file->uverbs_file->device->lists_mutex);
spin_lock_irq(&file->lock);
+ closed_already = file->is_closed;
file->is_closed = 1;
list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
if (entry->counter)
@@ -468,11 +473,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
kfree(entry);
}
spin_unlock_irq(&file->lock);
-
- if (file->is_async) {
- ib_unregister_event_handler(&file->uverbs_file->event_handler);
- kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+ if (!closed_already) {
+ list_del(&file->list);
+ if (file->is_async)
+ ib_unregister_event_handler(&file->uverbs_file->
+ event_handler);
}
+ mutex_unlock(&file->uverbs_file->device->lists_mutex);
+
+ kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
kref_put(&file->ref, ib_uverbs_release_event_file);
return 0;
@@ -480,7 +489,7 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
static const struct file_operations uverbs_event_fops = {
.owner = THIS_MODULE,
- .read = ib_uverbs_event_read,
+ .read = ib_uverbs_event_read,
.poll = ib_uverbs_event_poll,
.release = ib_uverbs_event_close,
.fasync = ib_uverbs_event_fasync,
@@ -519,8 +528,7 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
spin_unlock_irqrestore(&file->lock, flags);
wake_up_interruptible(&file->poll_wait);
- if (file->filp)
- selwakeup(&file->filp->f_selinfo);
+ linux_poll_wakeup(file->filp);
kill_fasync(&file->async_queue, SIGIO, POLL_IN);
}
@@ -546,6 +554,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
entry->desc.async.element = element;
entry->desc.async.event_type = event;
+ entry->desc.async.reserved = 0;
entry->counter = counter;
list_add_tail(&entry->list, &file->async_file->event_list);
@@ -554,8 +563,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
spin_unlock_irqrestore(&file->async_file->lock, flags);
wake_up_interruptible(&file->async_file->poll_wait);
- if (file->async_file->filp)
- selwakeup(&file->async_file->filp->f_selinfo);
+ linux_poll_wakeup(file->async_file->filp);
kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN);
}
@@ -573,6 +581,10 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
{
struct ib_uevent_object *uobj;
+	/* for XRC target QPs, check that the QP is live */
+ if (!event->element.qp->uobject || !event->element.qp->uobject->live)
+ return;
+
uobj = container_of(event->element.qp->uobject,
struct ib_uevent_object, uobject);
@@ -581,6 +593,16 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
&uobj->events_reported);
}
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj = container_of(event->element.wq->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
{
struct ib_uevent_object *uobj;
@@ -603,13 +625,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
NULL, NULL);
}
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+ kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+ file->async_file = NULL;
+}
+
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async)
{
struct ib_uverbs_event_file *ev_file;
struct file *filp;
+ int ret;
- ev_file = kzalloc(sizeof *ev_file, GFP_KERNEL);
+ ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
if (!ev_file)
return ERR_PTR(-ENOMEM);
@@ -618,7 +648,9 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
INIT_LIST_HEAD(&ev_file->event_list);
init_waitqueue_head(&ev_file->poll_wait);
ev_file->uverbs_file = uverbs_file;
- ev_file->is_async = is_async;
+ kref_get(&ev_file->uverbs_file->ref);
+ ev_file->async_queue = NULL;
+ ev_file->is_closed = 0;
/*
* fops_get() can't fail here, because we're coming from a
@@ -626,14 +658,43 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
* module reference.
*/
filp = alloc_file(FMODE_READ, fops_get(&uverbs_event_fops));
-
- if (IS_ERR(filp)) {
- kfree(ev_file);
- } else {
+ if (IS_ERR(filp))
+ goto err_put_refs;
filp->private_data = ev_file;
+ ev_file->filp = filp;
+
+ mutex_lock(&uverbs_file->device->lists_mutex);
+ list_add_tail(&ev_file->list,
+ &uverbs_file->device->uverbs_events_file_list);
+ mutex_unlock(&uverbs_file->device->lists_mutex);
+
+ if (is_async) {
+ WARN_ON(uverbs_file->async_file);
+ uverbs_file->async_file = ev_file;
+ kref_get(&uverbs_file->async_file->ref);
+ INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+ ib_dev,
+ ib_uverbs_event_handler);
+ ret = ib_register_event_handler(&uverbs_file->event_handler);
+ if (ret)
+ goto err_put_file;
+
+		/* At this point the async file setup is complete */
+ ev_file->is_async = 1;
}
return filp;
+
+err_put_file:
+ fput(filp);
+ kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
+ uverbs_file->async_file = NULL;
+ return ERR_PTR(ret);
+
+err_put_refs:
+ kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+ kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+ return filp;
}
/*
@@ -665,221 +726,34 @@ out:
return ev_file;
}
-static const char *verbs_cmd_str(__u32 cmd)
+static int verify_command_mask(struct ib_device *ib_dev, __u32 command)
{
- switch (cmd) {
- case IB_USER_VERBS_CMD_GET_CONTEXT:
- return "GET_CONTEXT";
- case IB_USER_VERBS_CMD_QUERY_DEVICE:
- return "QUERY_DEVICE";
- case IB_USER_VERBS_CMD_QUERY_PORT:
- return "QUERY_PORT";
- case IB_USER_VERBS_CMD_ALLOC_PD:
- return "ALLOC_PD";
- case IB_USER_VERBS_CMD_DEALLOC_PD:
- return "DEALLOC_PD";
- case IB_USER_VERBS_CMD_REG_MR:
- return "REG_MR";
- case IB_USER_VERBS_CMD_DEREG_MR:
- return "DEREG_MR";
- case IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL:
- return "CREATE_COMP_CHANNEL";
- case IB_USER_VERBS_CMD_CREATE_CQ:
- return "CREATE_CQ";
- case IB_USER_VERBS_CMD_RESIZE_CQ:
- return "RESIZE_CQ";
- case IB_USER_VERBS_CMD_POLL_CQ:
- return "POLL_CQ";
- case IB_USER_VERBS_CMD_REQ_NOTIFY_CQ:
- return "REQ_NOTIFY_CQ";
- case IB_USER_VERBS_CMD_DESTROY_CQ:
- return "DESTROY_CQ";
- case IB_USER_VERBS_CMD_CREATE_QP:
- return "CREATE_QP";
- case IB_USER_VERBS_CMD_QUERY_QP:
- return "QUERY_QP";
- case IB_USER_VERBS_CMD_MODIFY_QP:
- return "MODIFY_QP";
- case IB_USER_VERBS_CMD_DESTROY_QP:
- return "DESTROY_QP";
- case IB_USER_VERBS_CMD_POST_SEND:
- return "POST_SEND";
- case IB_USER_VERBS_CMD_POST_RECV:
- return "POST_RECV";
- case IB_USER_VERBS_CMD_POST_SRQ_RECV:
- return "POST_SRQ_RECV";
- case IB_USER_VERBS_CMD_CREATE_AH:
- return "CREATE_AH";
- case IB_USER_VERBS_CMD_DESTROY_AH:
- return "DESTROY_AH";
- case IB_USER_VERBS_CMD_ATTACH_MCAST:
- return "ATTACH_MCAST";
- case IB_USER_VERBS_CMD_DETACH_MCAST:
- return "DETACH_MCAST";
- case IB_USER_VERBS_CMD_CREATE_SRQ:
- return "CREATE_SRQ";
- case IB_USER_VERBS_CMD_MODIFY_SRQ:
- return "MODIFY_SRQ";
- case IB_USER_VERBS_CMD_QUERY_SRQ:
- return "QUERY_SRQ";
- case IB_USER_VERBS_CMD_DESTROY_SRQ:
- return "DESTROY_SRQ";
- case IB_USER_VERBS_CMD_OPEN_XRCD:
- return "OPEN_XRCD";
- case IB_USER_VERBS_CMD_CLOSE_XRCD:
- return "CLOSE_XRCD";
- case IB_USER_VERBS_CMD_CREATE_XSRQ:
- return "CREATE_XSRQ";
- case IB_USER_VERBS_CMD_OPEN_QP:
- return "OPEN_QP";
- }
-
- return "Unknown command";
-}
-
-enum {
- COMMAND_INFO_MASK = 0x1000,
-};
-
-static ssize_t ib_uverbs_exp_handle_cmd(struct ib_uverbs_file *file,
- const char __user *buf,
- struct ib_device *dev,
- struct ib_uverbs_cmd_hdr *hdr,
- size_t count,
- int legacy_ex_cmd)
-{
- struct ib_udata ucore;
- struct ib_udata uhw;
- struct ib_uverbs_ex_cmd_hdr ex_hdr;
- __u32 command = hdr->command - IB_USER_VERBS_EXP_CMD_FIRST;
-
- if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
-
- if (command >= ARRAY_SIZE(uverbs_exp_cmd_table) ||
- !uverbs_exp_cmd_table[command])
- return -EINVAL;
-
- if (!file->ucontext)
- return -EINVAL;
-
- if (!(dev->uverbs_exp_cmd_mask & (1ull << command)))
- return -ENOSYS;
-
- if (legacy_ex_cmd) {
- struct ib_uverbs_ex_cmd_hdr_legacy hxl;
- struct ib_uverbs_ex_cmd_resp1_legacy resp1;
- __u64 response;
- ssize_t ret;
-
- if (count < sizeof(hxl))
- return -EINVAL;
-
- if (copy_from_user(&hxl, buf, sizeof(hxl)))
- return -EFAULT;
-
- if (((hxl.in_words + hxl.provider_in_words) * 4) != count)
- return -EINVAL;
-
- count -= sizeof(hxl);
- buf += sizeof(hxl);
- if (hxl.out_words || hxl.provider_out_words) {
- if (count < sizeof(resp1))
- return -EINVAL;
- if (copy_from_user(&resp1, buf, sizeof(resp1)))
- return -EFAULT;
- response = resp1.response;
- if (!response)
- return -EINVAL;
-
- /*
- * Change user buffer to comply with new extension format.
- */
- if (sizeof(resp1.comp_mask) != sizeof(resp1.response))
- return -EFAULT;
- buf += sizeof(resp1.comp_mask);
- if (copy_to_user(__DECONST(void __user *, buf), &resp1.comp_mask,
- sizeof(resp1.response)))
- return -EFAULT;
-
- } else {
- response = 0;
- }
-
- INIT_UDATA_EX(&ucore,
- (hxl.in_words) ? buf : 0,
- response,
- hxl.in_words * 4,
- hxl.out_words * 4);
-
- INIT_UDATA_EX(&uhw,
- (hxl.provider_in_words) ? buf + ucore.inlen : 0,
- (hxl.provider_out_words) ? response + ucore.outlen : 0,
- hxl.provider_in_words * 4,
- hxl.provider_out_words * 4);
-
- ret = uverbs_exp_cmd_table[command](file, &ucore, &uhw);
- /*
- * UnChange user buffer
- */
- if (response && copy_to_user(__DECONST(void __user *, buf), &resp1.response, sizeof(resp1.response)))
- return -EFAULT;
-
- return ret;
- } else {
- if (count < (sizeof(hdr) + sizeof(ex_hdr)))
- return -EINVAL;
-
- if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
- return -EFAULT;
-
- buf += sizeof(hdr) + sizeof(ex_hdr);
-
- if ((hdr->in_words + ex_hdr.provider_in_words) * 8 != count)
- return -EINVAL;
+ u64 mask;
- if (ex_hdr.response) {
- if (!hdr->out_words && !ex_hdr.provider_out_words)
- return -EINVAL;
- } else {
- if (hdr->out_words || ex_hdr.provider_out_words)
- return -EINVAL;
- }
-
- INIT_UDATA_EX(&ucore,
- (hdr->in_words) ? buf : 0,
- (unsigned long)ex_hdr.response,
- hdr->in_words * 8,
- hdr->out_words * 8);
+ if (command <= IB_USER_VERBS_CMD_OPEN_QP)
+ mask = ib_dev->uverbs_cmd_mask;
+ else
+ mask = ib_dev->uverbs_ex_cmd_mask;
- INIT_UDATA_EX(&uhw,
- (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0,
- (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0,
- ex_hdr.provider_in_words * 8,
- ex_hdr.provider_out_words * 8);
+ if (mask & ((u64)1 << command))
+ return 0;
- return uverbs_exp_cmd_table[command](file, &ucore, &uhw);
- }
+ return -1;
}
static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct ib_uverbs_file *file = filp->private_data;
- struct ib_device *dev = file->device->ib_dev;
+ struct ib_device *ib_dev;
struct ib_uverbs_cmd_hdr hdr;
- struct timespec ts1;
- struct timespec ts2;
- ktime_t t1, t2, delta;
- s64 ds;
- ssize_t ret;
- u64 dividend;
- u32 divisor;
- __u32 flags;
__u32 command;
- int legacy_ex_cmd = 0;
- size_t written_count = count;
+ __u32 flags;
+ int srcu_key;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!ib_safe_file_access(filp)))
+ return -EACCES;
if (count < sizeof hdr)
return -EINVAL;
@@ -887,171 +761,157 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof hdr))
return -EFAULT;
- /*
- * For BWD compatibility change old style extension verbs commands
- * to their equivalent experimental command.
- */
- if ((hdr.command >= IB_USER_VERBS_LEGACY_CMD_FIRST) &&
- (hdr.command <= IB_USER_VERBS_LEGACY_EX_CMD_LAST)) {
- hdr.command += IB_USER_VERBS_EXP_CMD_FIRST -
- IB_USER_VERBS_LEGACY_CMD_FIRST;
- legacy_ex_cmd = 1;
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+ IB_USER_VERBS_CMD_COMMAND_MASK)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+ if (verify_command_mask(ib_dev, command)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!file->ucontext &&
+ command != IB_USER_VERBS_CMD_GET_CONTEXT) {
+ ret = -EINVAL;
+ goto out;
}
flags = (hdr.command &
IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
- command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
- ktime_get_ts(&ts1);
- if (!flags && (command >= IB_USER_VERBS_EXP_CMD_FIRST)) {
- ret = ib_uverbs_exp_handle_cmd(file, buf, dev, &hdr, count, legacy_ex_cmd);
- } else if (!flags) {
+ if (!flags) {
if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
- !uverbs_cmd_table[command])
- return -EINVAL;
-
- if (!file->ucontext &&
- command != IB_USER_VERBS_CMD_GET_CONTEXT)
- return -EINVAL;
+ !uverbs_cmd_table[command]) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!(dev->uverbs_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (hdr.in_words * 4 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (hdr.in_words * 4 != count)
- return -EINVAL;
+ ret = uverbs_cmd_table[command](file, ib_dev,
+ buf + sizeof(hdr),
+ hdr.in_words * 4,
+ hdr.out_words * 4);
- ret = uverbs_cmd_table[command](file,
- buf + sizeof(hdr),
- hdr.in_words * 4,
- hdr.out_words * 4);
} else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+ struct ib_uverbs_ex_cmd_hdr ex_hdr;
struct ib_udata ucore;
struct ib_udata uhw;
- struct ib_uverbs_ex_cmd_hdr ex_hdr;
-
- if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
+ size_t written_count = count;
if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
- !uverbs_ex_cmd_table[command])
- return -EINVAL;
-
- if (!file->ucontext)
- return -EINVAL;
+ !uverbs_ex_cmd_table[command]) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (!(dev->uverbs_ex_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (!file->ucontext) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (count < (sizeof(hdr) + sizeof(ex_hdr)))
- return -EINVAL;
+ if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
- return -EFAULT;
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
+ ret = -EFAULT;
+ goto out;
+ }
count -= sizeof(hdr) + sizeof(ex_hdr);
buf += sizeof(hdr) + sizeof(ex_hdr);
- if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
- return -EINVAL;
+ if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ex_hdr.cmd_hdr_reserved) {
+ ret = -EINVAL;
+ goto out;
+ }
if (ex_hdr.response) {
- if (!hdr.out_words && !ex_hdr.provider_out_words)
- return -EINVAL;
+ if (!hdr.out_words && !ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!access_ok(VERIFY_WRITE,
+ (void __user *) (unsigned long) ex_hdr.response,
+ (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else {
- if (hdr.out_words || ex_hdr.provider_out_words)
- return -EINVAL;
+ if (hdr.out_words || ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
}
- INIT_UDATA_EX(&ucore,
- (hdr.in_words) ? buf : 0,
- (unsigned long)ex_hdr.response,
- hdr.in_words * 8,
- hdr.out_words * 8);
-
- INIT_UDATA_EX(&uhw,
- (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0,
- (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0,
- ex_hdr.provider_in_words * 8,
- ex_hdr.provider_out_words * 8);
-
- ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw);
-
- if (ret)
- return ret;
-
- return written_count;
-
+ INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
+ hdr.in_words * 8, hdr.out_words * 8);
+
+ INIT_UDATA_BUF_OR_NULL(&uhw,
+ buf + ucore.inlen,
+ (unsigned long) ex_hdr.response + ucore.outlen,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
+
+ ret = uverbs_ex_cmd_table[command](file,
+ ib_dev,
+ &ucore,
+ &uhw);
+ if (!ret)
+ ret = written_count;
} else {
- return -EFAULT;
+ ret = -ENOSYS;
}
- if ((dev->cmd_perf & (COMMAND_INFO_MASK - 1)) == hdr.command) {
- ktime_get_ts(&ts2);
- t1 = timespec_to_ktime(ts1);
- t2 = timespec_to_ktime(ts2);
- delta = ktime_sub(t2, t1);
- ds = ktime_to_ns(delta);
- spin_lock(&dev->cmd_perf_lock);
- dividend = dev->cmd_avg * dev->cmd_n + ds;
- ++dev->cmd_n;
- divisor = dev->cmd_n;
- do_div(dividend, divisor);
- dev->cmd_avg = dividend;
- spin_unlock(&dev->cmd_perf_lock);
- if (dev->cmd_perf & COMMAND_INFO_MASK) {
- pr_info("%s: %s execution time = %lld nsec\n",
- file->device->ib_dev->name,
- verbs_cmd_str(hdr.command),
- (long long)ds);
- }
- }
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
return ret;
}
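
A minimal sketch (not part of this change) of the byte layout that the extended-command branch of ib_uverbs_write() above validates before building ucore and uhw; the helper name below is illustrative only:

/*
 * Extended uverbs command layout.  All *_words fields count 8-byte words.
 *
 * request (the write(2) buffer, "count" bytes total):
 *   [ struct ib_uverbs_cmd_hdr ][ struct ib_uverbs_ex_cmd_hdr ]
 *   [ core input: hdr.in_words * 8 ][ provider input: ex_hdr.provider_in_words * 8 ]
 *
 * response (the user buffer at ex_hdr.response):
 *   [ core output: hdr.out_words * 8 ][ provider output: ex_hdr.provider_out_words * 8 ]
 *
 * ucore covers the two core slices, uhw the two provider (driver-private)
 * slices.
 */
static size_t
example_ex_cmd_req_len(const struct ib_uverbs_cmd_hdr *hdr,
    const struct ib_uverbs_ex_cmd_hdr *ex_hdr)
{
	/* Must equal "count" or the handler above returns -EINVAL. */
	return (sizeof(*hdr) + sizeof(*ex_hdr) +
	    (size_t)hdr->in_words * 8 +
	    (size_t)ex_hdr->provider_in_words * 8);
}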
static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *ib_dev;
+ int ret = 0;
+ int srcu_key;
- if (!file->ucontext)
- return -ENODEV;
- else
- return file->device->ib_dev->mmap(file->ucontext, vma);
-}
-/* XXX Not supported in FreeBSD */
-#if 0
-static unsigned long ib_uverbs_get_unmapped_area(struct file *filp,
- unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- struct ib_uverbs_file *file = filp->private_data;
-
- if (!file->ucontext)
- return -ENODEV;
- else {
- if (!file->device->ib_dev->get_unmapped_area)
- return current->mm->get_unmapped_area(filp, addr, len,
- pgoff, flags);
-
- return file->device->ib_dev->get_unmapped_area(filp, addr, len,
- pgoff, flags);
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
}
-}
-#endif
-
-static long ib_uverbs_ioctl(struct file *filp,
- unsigned int cmd, unsigned long arg)
-{
- struct ib_uverbs_file *file = filp->private_data;
-
- if (!file->device->ib_dev->ioctl)
- return -ENOTSUPP;
if (!file->ucontext)
- return -ENODEV;
+ ret = -ENODEV;
else
- /* provider should provide it's own locking mechanism */
- return file->device->ib_dev->ioctl(file->ucontext, cmd, arg);
+ ret = ib_dev->mmap(file->ucontext, vma);
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
}
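
Both ib_uverbs_write() and ib_uverbs_mmap() above follow the same SRCU read-side pattern; a minimal sketch (not part of this change, helper name illustrative):

/*
 * Dereference the possibly-disassociated ib_device under SRCU and run a
 * callback against it.  Once the device has been hot-unplugged the
 * pointer reads as NULL and the caller gets -EIO.
 */
static int
example_with_live_device(struct ib_uverbs_file *file,
    int (*fn)(struct ib_device *))
{
	struct ib_device *ib_dev;
	int srcu_key;
	int ret;

	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
	ib_dev = srcu_dereference(file->device->ib_dev,
				  &file->device->disassociate_srcu);
	if (!ib_dev)
		ret = -EIO;	/* device disassociated */
	else
		ret = fn(ib_dev);
	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
	return ret;
}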
/*
@@ -1068,23 +928,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
{
struct ib_uverbs_device *dev;
struct ib_uverbs_file *file;
+ struct ib_device *ib_dev;
int ret;
+ int module_dependent;
+ int srcu_key;
dev = container_of(inode->i_cdev->si_drv1, struct ib_uverbs_device, cdev);
- if (dev)
- kref_get(&dev->ref);
- else
+ if (!atomic_inc_not_zero(&dev->refcount))
return -ENXIO;
- if (!try_module_get(dev->ib_dev->owner)) {
- ret = -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ mutex_lock(&dev->lists_mutex);
+ ib_dev = srcu_dereference(dev->ib_dev,
+ &dev->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
goto err;
}
- file = kmalloc(sizeof *file, GFP_KERNEL);
+	/* In case the IB device supports disassociate_ucontext, there is no
+	 * hard dependency between the uverbs device and its low-level device.
+	 */
+ module_dependent = !(ib_dev->disassociate_ucontext);
+
+ if (module_dependent) {
+ if (!try_module_get(ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+ }
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
if (!file) {
ret = -ENOMEM;
- goto err_module;
+ if (module_dependent)
+ goto err_module;
+
+ goto err;
}
file->device = dev;
@@ -1092,54 +972,71 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
file->async_file = NULL;
kref_init(&file->ref);
mutex_init(&file->mutex);
+ mutex_init(&file->cleanup_mutex);
filp->private_data = file;
+ kobject_get(&dev->kobj);
+ list_add_tail(&file->list, &dev->uverbs_file_list);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
return nonseekable_open(inode, filp);
err_module:
- module_put(dev->ib_dev->owner);
+ module_put(ib_dev->owner);
err:
- kref_put(&dev->ref, ib_uverbs_release_dev);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+ if (atomic_dec_and_test(&dev->refcount))
+ ib_uverbs_comp_dev(dev);
+
return ret;
}
static int ib_uverbs_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_uverbs_device *dev = file->device;
- ib_uverbs_cleanup_ucontext(file, file->ucontext);
+ mutex_lock(&file->cleanup_mutex);
+ if (file->ucontext) {
+ ib_uverbs_cleanup_ucontext(file, file->ucontext);
+ file->ucontext = NULL;
+ }
+ mutex_unlock(&file->cleanup_mutex);
+
+ mutex_lock(&file->device->lists_mutex);
+ if (!file->is_closed) {
+ list_del(&file->list);
+ file->is_closed = 1;
+ }
+ mutex_unlock(&file->device->lists_mutex);
if (file->async_file)
kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
kref_put(&file->ref, ib_uverbs_release_file);
+ kobject_put(&dev->kobj);
return 0;
}
static const struct file_operations uverbs_fops = {
- .owner = THIS_MODULE,
- .write = ib_uverbs_write,
- .open = ib_uverbs_open,
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .open = ib_uverbs_open,
.release = ib_uverbs_close,
.llseek = no_llseek,
- .unlocked_ioctl = ib_uverbs_ioctl,
};
static const struct file_operations uverbs_mmap_fops = {
- .owner = THIS_MODULE,
- .write = ib_uverbs_write,
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
.mmap = ib_uverbs_mmap,
- .open = ib_uverbs_open,
+ .open = ib_uverbs_open,
.release = ib_uverbs_close,
.llseek = no_llseek,
-/* XXX Not supported in FreeBSD */
-#if 0
- .get_unmapped_area = ib_uverbs_get_unmapped_area,
-#endif
- .unlocked_ioctl = ib_uverbs_ioctl,
};
static struct ib_client uverbs_client = {
@@ -1151,45 +1048,46 @@ static struct ib_client uverbs_client = {
static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
char *buf)
{
+ int ret = -ENODEV;
+ int srcu_key;
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
- return sprintf(buf, "%s\n", dev->ib_dev->name);
-}
-static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
-
-static ssize_t show_dev_ref_cnt(struct device *device,
- struct device_attribute *attr, char *buf)
-{
- struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%s\n", ib_dev->name);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
- if (!dev)
- return -ENODEV;
-
- return sprintf(buf, "%d\n", atomic_read(&dev->ref.refcount));
+ return ret;
}
-static DEVICE_ATTR(ref_cnt, S_IRUGO, show_dev_ref_cnt, NULL);
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
static ssize_t show_dev_abi_version(struct device *device,
struct device_attribute *attr, char *buf)
{
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
- return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+ return ret;
}
static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
-static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf)
-{
- return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION);
-}
-
-static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_VERBS_ABI_VERSION));
static dev_t overflow_maj;
static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
@@ -1207,7 +1105,7 @@ static int find_overflow_devnum(void)
ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
"infiniband_verbs");
if (ret) {
- printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+ pr_err("user_verbs: couldn't register dynamic device number\n");
return ret;
}
}
@@ -1218,52 +1116,13 @@ static int find_overflow_devnum(void)
return ret;
}
-#include <linux/pci.h>
-
-static ssize_t
-show_dev_device(struct device *device, struct device_attribute *attr, char *buf)
-{
- struct ib_uverbs_device *dev = dev_get_drvdata(device);
-
- if (!dev || !dev->ib_dev->dma_device)
- return -ENODEV;
-
- return sprintf(buf, "0x%04x\n",
- ((struct pci_dev *)dev->ib_dev->dma_device)->device);
-}
-static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL);
-
-static ssize_t
-show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf)
-{
- struct ib_uverbs_device *dev = dev_get_drvdata(device);
-
- if (!dev || !dev->ib_dev->dma_device)
- return -ENODEV;
-
- return sprintf(buf, "0x%04x\n",
- ((struct pci_dev *)dev->ib_dev->dma_device)->vendor);
-}
-
-static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL);
-
-struct attribute *device_attrs[] =
-{
- &dev_attr_device.attr,
- &dev_attr_vendor.attr,
- NULL
-};
-
-static struct attribute_group device_group = {
- .name = "device",
- .attrs = device_attrs
-};
static void ib_uverbs_add_one(struct ib_device *device)
{
int devnum;
dev_t base;
struct ib_uverbs_device *uverbs_dev;
+ int ret;
if (!device->alloc_ucontext)
return;
@@ -1272,10 +1131,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
if (!uverbs_dev)
return;
- kref_init(&uverbs_dev->ref);
+ ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+ if (ret) {
+ kfree(uverbs_dev);
+ return;
+ }
+
+ atomic_set(&uverbs_dev->refcount, 1);
init_completion(&uverbs_dev->comp);
uverbs_dev->xrcd_tree = RB_ROOT;
mutex_init(&uverbs_dev->xrcd_tree_mutex);
+ kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+ mutex_init(&uverbs_dev->lists_mutex);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
spin_lock(&map_lock);
devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -1283,7 +1152,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
spin_unlock(&map_lock);
devnum = find_overflow_devnum();
if (devnum < 0)
- goto err;
+ goto err;
spin_lock(&map_lock);
uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
@@ -1296,12 +1165,13 @@ static void ib_uverbs_add_one(struct ib_device *device)
}
spin_unlock(&map_lock);
- uverbs_dev->ib_dev = device;
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
uverbs_dev->num_comp_vectors = device->num_comp_vectors;
cdev_init(&uverbs_dev->cdev, NULL);
uverbs_dev->cdev.owner = THIS_MODULE;
uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
if (cdev_add(&uverbs_dev->cdev, base, 1))
goto err_cdev;
@@ -1314,12 +1184,8 @@ static void ib_uverbs_add_one(struct ib_device *device)
if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
goto err_class;
- if (device_create_file(uverbs_dev->dev, &dev_attr_ref_cnt))
- goto err_class;
if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
goto err_class;
- if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group))
- goto err_class;
ib_set_client_data(device, &uverbs_client, uverbs_dev);
@@ -1336,32 +1202,123 @@ err_cdev:
clear_bit(devnum, overflow_map);
err:
- kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
wait_for_completion(&uverbs_dev->comp);
- kfree(uverbs_dev);
+ kobject_put(&uverbs_dev->kobj);
return;
}
-static void ib_uverbs_remove_one(struct ib_device *device)
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+ struct ib_device *ib_dev)
+{
+ struct ib_uverbs_file *file;
+ struct ib_uverbs_event_file *event_file;
+ struct ib_event event;
+
+	/* Wait for pending and running commands to terminate */
+ synchronize_srcu(&uverbs_dev->disassociate_srcu);
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.element.port_num = 0;
+ event.device = ib_dev;
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+ struct ib_ucontext *ucontext;
+ file = list_first_entry(&uverbs_dev->uverbs_file_list,
+ struct ib_uverbs_file, list);
+ file->is_closed = 1;
+ list_del(&file->list);
+ kref_get(&file->ref);
+ mutex_unlock(&uverbs_dev->lists_mutex);
+
+ ib_uverbs_event_handler(&file->event_handler, &event);
+
+ mutex_lock(&file->cleanup_mutex);
+ ucontext = file->ucontext;
+ file->ucontext = NULL;
+ mutex_unlock(&file->cleanup_mutex);
+
+ /* At this point ib_uverbs_close cannot be running
+ * ib_uverbs_cleanup_ucontext
+ */
+ if (ucontext) {
+ /* We must release the mutex before going ahead and
+ * calling disassociate_ucontext. disassociate_ucontext
+ * might end up indirectly calling uverbs_close,
+ * for example due to freeing the resources
+			 * (e.g. mmput).
+ */
+ ib_dev->disassociate_ucontext(ucontext);
+ ib_uverbs_cleanup_ucontext(file, ucontext);
+ }
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ kref_put(&file->ref, ib_uverbs_release_file);
+ }
+
+ while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+ event_file = list_first_entry(&uverbs_dev->
+ uverbs_events_file_list,
+ struct ib_uverbs_event_file,
+ list);
+ spin_lock_irq(&event_file->lock);
+ event_file->is_closed = 1;
+ spin_unlock_irq(&event_file->lock);
+
+ list_del(&event_file->list);
+ if (event_file->is_async) {
+ ib_unregister_event_handler(&event_file->uverbs_file->
+ event_handler);
+ event_file->uverbs_file->event_handler.device = NULL;
+ }
+
+ wake_up_interruptible(&event_file->poll_wait);
+ linux_poll_wakeup(event_file->filp);
+ kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
+}
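
ib_uverbs_free_hw_resources() is the write side of that SRCU scheme; a sketch (not part of this change) of the ordering it relies on, mirroring ib_uverbs_remove_one() below:

static void
example_disassociate(struct ib_uverbs_device *uverbs_dev,
    struct ib_device *ib_dev)
{
	/* New readers now observe a NULL ib_dev and fail with -EIO. */
	rcu_assign_pointer(uverbs_dev->ib_dev, NULL);

	/* ib_uverbs_free_hw_resources() starts with synchronize_srcu(),
	 * so every reader that still saw a valid device has drained
	 * before any HW resource is torn down. */
	ib_uverbs_free_hw_resources(uverbs_dev, ib_dev);
}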
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+ struct ib_uverbs_device *uverbs_dev = client_data;
+ int wait_clients = 1;
if (!uverbs_dev)
return;
- sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group);
dev_set_drvdata(uverbs_dev->dev, NULL);
device_destroy(uverbs_class, uverbs_dev->cdev.dev);
cdev_del(&uverbs_dev->cdev);
if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
- clear_bit(uverbs_dev->devnum, dev_map);
+ clear_bit(uverbs_dev->devnum, dev_map);
else
clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
- kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
- wait_for_completion(&uverbs_dev->comp);
- kfree(uverbs_dev);
+ if (device->disassociate_ucontext) {
+		/* We disassociate the HW resources and return immediately.
+		 * User space will see an EIO errno for all future access.
+		 * Upon return, the ib_device may be freed internally and is
+		 * no longer valid.
+		 * The uverbs_device stays available until all clients close
+		 * their files; then its reference count drops to zero and
+		 * its resources are freed.
+		 * Note: at this point no new files can be opened, since the
+		 * cdev has been deleted.  Active clients can, however, still
+		 * issue commands and close their open files.
+		 */
+ rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+ ib_uverbs_free_hw_resources(uverbs_dev, device);
+ wait_clients = 0;
+ }
+
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
+ if (wait_clients)
+ wait_for_completion(&uverbs_dev->comp);
+ kobject_put(&uverbs_dev->kobj);
}
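
Device lifetime now rests on a plain atomic refcount plus a completion instead of the old kref; a sketch (not part of this change, helper names illustrative; ib_uverbs_comp_dev() presumably completes dev->comp, as the wait_for_completion() calls suggest):

static int
example_get_uverbs_dev(struct ib_uverbs_device *dev)
{
	/* Fails once teardown has started (refcount already hit zero). */
	return atomic_inc_not_zero(&dev->refcount);
}

static void
example_put_uverbs_dev(struct ib_uverbs_device *dev)
{
	/* The last reference signals the completion so a waiting
	 * ib_uverbs_remove_one() can finish. */
	if (atomic_dec_and_test(&dev->refcount))
		ib_uverbs_comp_dev(dev);
}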
static char *uverbs_devnode(struct device *dev, umode_t *mode)
@@ -1378,28 +1335,28 @@ static int __init ib_uverbs_init(void)
ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
"infiniband_verbs");
if (ret) {
- printk(KERN_ERR "user_verbs: couldn't register device number\n");
+ pr_err("user_verbs: couldn't register device number\n");
goto out;
}
uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
if (IS_ERR(uverbs_class)) {
ret = PTR_ERR(uverbs_class);
- printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+ pr_err("user_verbs: couldn't create class infiniband_verbs\n");
goto out_chrdev;
}
uverbs_class->devnode = uverbs_devnode;
- ret = class_create_file(uverbs_class, &class_attr_abi_version);
+ ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
if (ret) {
- printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+ pr_err("user_verbs: couldn't create abi_version attribute\n");
goto out_class;
}
ret = ib_register_client(&uverbs_client);
if (ret) {
- printk(KERN_ERR "user_verbs: couldn't register client\n");
+ pr_err("user_verbs: couldn't register client\n");
goto out_class;
}
@@ -1431,5 +1388,5 @@ static void __exit ib_uverbs_cleanup(void)
idr_destroy(&ib_uverbs_srq_idr);
}
-module_init(ib_uverbs_init);
+module_init_order(ib_uverbs_init, SI_ORDER_THIRD);
module_exit(ib_uverbs_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
index a541882f63fc9..db8654779aaea 100644
--- a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
+++ b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
@@ -30,7 +30,6 @@
* SOFTWARE.
*/
-#include <linux/module.h>
#include <rdma/ib_marshall.h>
void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
@@ -140,5 +139,10 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
dst->packet_life_time = src->packet_life_time;
dst->preference = src->preference;
dst->packet_life_time_selector = src->packet_life_time_selector;
+
+ memset(dst->dmac, 0, sizeof(dst->dmac));
+ dst->net = NULL;
+ dst->ifindex = 0;
+ dst->gid_type = IB_GID_TYPE_IB;
}
EXPORT_SYMBOL(ib_copy_path_rec_from_user);
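
The explicit defaults added above matter because callers commonly unmarshal into an on-stack record, so members without a user-space counterpart must not be left as stack garbage; a sketch (not part of this change; assumes <linux/etherdevice.h> for is_zero_ether_addr(), helper name illustrative):

static void
example_check_path_rec_defaults(const struct ib_sa_path_rec *rec)
{
	/* Holds right after ib_copy_path_rec_from_user(). */
	WARN_ON(!is_zero_ether_addr(rec->dmac));
	WARN_ON(rec->net != NULL || rec->ifindex != 0);
	WARN_ON(rec->gid_type != IB_GID_TYPE_IB);
}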
diff --git a/sys/ofed/drivers/infiniband/core/verbs.c b/sys/ofed/drivers/infiniband/core/verbs.c
deleted file mode 100644
index 7756b882896c2..0000000000000
--- a/sys/ofed/drivers/infiniband/core/verbs.c
+++ /dev/null
@@ -1,1538 +0,0 @@
-/*
- * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
- * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
- * Copyright (c) 2004 Intel Corporation. All rights reserved.
- * Copyright (c) 2004 Topspin Corporation. All rights reserved.
- * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_cache.h>
-#include <rdma/ib_addr.h>
-
-int ib_rate_to_mult(enum ib_rate rate)
-{
- switch (rate) {
- case IB_RATE_2_5_GBPS: return 1;
- case IB_RATE_5_GBPS: return 2;
- case IB_RATE_10_GBPS: return 4;
- case IB_RATE_20_GBPS: return 8;
- case IB_RATE_30_GBPS: return 12;
- case IB_RATE_40_GBPS: return 16;
- case IB_RATE_60_GBPS: return 24;
- case IB_RATE_80_GBPS: return 32;
- case IB_RATE_120_GBPS: return 48;
- default: return -1;
- }
-}
-EXPORT_SYMBOL(ib_rate_to_mult);
-
-enum ib_rate mult_to_ib_rate(int mult)
-{
- switch (mult) {
- case 1: return IB_RATE_2_5_GBPS;
- case 2: return IB_RATE_5_GBPS;
- case 4: return IB_RATE_10_GBPS;
- case 8: return IB_RATE_20_GBPS;
- case 12: return IB_RATE_30_GBPS;
- case 16: return IB_RATE_40_GBPS;
- case 24: return IB_RATE_60_GBPS;
- case 32: return IB_RATE_80_GBPS;
- case 48: return IB_RATE_120_GBPS;
- default: return IB_RATE_PORT_CURRENT;
- }
-}
-EXPORT_SYMBOL(mult_to_ib_rate);
-
-int ib_rate_to_mbps(enum ib_rate rate)
-{
- switch (rate) {
- case IB_RATE_2_5_GBPS: return 2500;
- case IB_RATE_5_GBPS: return 5000;
- case IB_RATE_10_GBPS: return 10000;
- case IB_RATE_20_GBPS: return 20000;
- case IB_RATE_30_GBPS: return 30000;
- case IB_RATE_40_GBPS: return 40000;
- case IB_RATE_60_GBPS: return 60000;
- case IB_RATE_80_GBPS: return 80000;
- case IB_RATE_120_GBPS: return 120000;
- case IB_RATE_14_GBPS: return 14062;
- case IB_RATE_56_GBPS: return 56250;
- case IB_RATE_112_GBPS: return 112500;
- case IB_RATE_168_GBPS: return 168750;
- case IB_RATE_25_GBPS: return 25781;
- case IB_RATE_100_GBPS: return 103125;
- case IB_RATE_200_GBPS: return 206250;
- case IB_RATE_300_GBPS: return 309375;
- default: return -1;
- }
-}
-EXPORT_SYMBOL(ib_rate_to_mbps);
-
-enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type)
-{
- switch (node_type) {
- case RDMA_NODE_IB_CA:
- case RDMA_NODE_IB_SWITCH:
- case RDMA_NODE_IB_ROUTER:
- return RDMA_TRANSPORT_IB;
- case RDMA_NODE_RNIC:
- return RDMA_TRANSPORT_IWARP;
- case RDMA_NODE_MIC:
- return RDMA_TRANSPORT_SCIF;
- default:
- BUG();
- return 0;
- }
-}
-EXPORT_SYMBOL(rdma_node_get_transport);
-
-enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
-{
- if (device->get_link_layer)
- return device->get_link_layer(device, port_num);
-
- switch (rdma_node_get_transport(device->node_type)) {
- case RDMA_TRANSPORT_IB:
- return IB_LINK_LAYER_INFINIBAND;
- case RDMA_TRANSPORT_IWARP:
- return IB_LINK_LAYER_ETHERNET;
- case RDMA_TRANSPORT_SCIF:
- return IB_LINK_LAYER_SCIF;
- default:
- return IB_LINK_LAYER_UNSPECIFIED;
- }
-}
-EXPORT_SYMBOL(rdma_port_get_link_layer);
-
-/* Protection domains */
-
-struct ib_pd *ib_alloc_pd(struct ib_device *device)
-{
- struct ib_pd *pd;
-
- pd = device->alloc_pd(device, NULL, NULL);
-
- if (!IS_ERR(pd)) {
- pd->device = device;
- pd->uobject = NULL;
- atomic_set(&pd->usecnt, 0);
- }
-
- return pd;
-}
-EXPORT_SYMBOL(ib_alloc_pd);
-
-int ib_dealloc_pd(struct ib_pd *pd)
-{
- if (atomic_read(&pd->usecnt))
- return -EBUSY;
-
- return pd->device->dealloc_pd(pd);
-}
-EXPORT_SYMBOL(ib_dealloc_pd);
-
-/* Address handles */
-
-struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
-{
- struct ib_ah *ah;
-
- ah = pd->device->create_ah(pd, ah_attr);
-
- if (!IS_ERR(ah)) {
- ah->device = pd->device;
- ah->pd = pd;
- ah->uobject = NULL;
- atomic_inc(&pd->usecnt);
- }
-
- return ah;
-}
-EXPORT_SYMBOL(ib_create_ah);
-
-int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
- struct ib_grh *grh, struct ib_ah_attr *ah_attr)
-{
- u32 flow_class;
- u16 gid_index;
- int ret;
- int is_eth = (rdma_port_get_link_layer(device, port_num) ==
- IB_LINK_LAYER_ETHERNET);
-
- memset(ah_attr, 0, sizeof *ah_attr);
- if (is_eth) {
- if (!(wc->wc_flags & IB_WC_GRH))
- return -EPROTOTYPE;
-
- if (wc->wc_flags & IB_WC_WITH_SMAC &&
- wc->wc_flags & IB_WC_WITH_VLAN) {
- memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
- ah_attr->vlan_id = wc->vlan_id;
- } else {
- u32 scope_id = rdma_get_ipv6_scope_id(device, port_num);
- ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
- ah_attr->dmac, &ah_attr->vlan_id,
- scope_id);
- if (ret)
- return ret;
- }
- } else {
- ah_attr->vlan_id = 0xffff;
- }
-
-
- ah_attr->dlid = wc->slid;
- ah_attr->sl = wc->sl;
- ah_attr->src_path_bits = wc->dlid_path_bits;
- ah_attr->port_num = port_num;
-
- if (wc->wc_flags & IB_WC_GRH) {
- ah_attr->ah_flags = IB_AH_GRH;
- ah_attr->grh.dgid = grh->sgid;
-
- ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
- &gid_index);
- if (ret)
- return ret;
-
- ah_attr->grh.sgid_index = (u8) gid_index;
- flow_class = be32_to_cpu(grh->version_tclass_flow);
- ah_attr->grh.flow_label = flow_class & 0xFFFFF;
- ah_attr->grh.hop_limit = 0xFF;
- ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
- }
- return 0;
-}
-EXPORT_SYMBOL(ib_init_ah_from_wc);
-
-struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
- struct ib_grh *grh, u8 port_num)
-{
- struct ib_ah_attr ah_attr;
- int ret;
-
- ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
- if (ret)
- return ERR_PTR(ret);
-
- return ib_create_ah(pd, &ah_attr);
-}
-EXPORT_SYMBOL(ib_create_ah_from_wc);
-
-int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
- return ah->device->modify_ah ?
- ah->device->modify_ah(ah, ah_attr) :
- -ENOSYS;
-}
-EXPORT_SYMBOL(ib_modify_ah);
-
-int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
- return ah->device->query_ah ?
- ah->device->query_ah(ah, ah_attr) :
- -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_ah);
-
-int ib_destroy_ah(struct ib_ah *ah)
-{
- struct ib_pd *pd;
- int ret;
-
- pd = ah->pd;
- ret = ah->device->destroy_ah(ah);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_destroy_ah);
-
-/* Shared receive queues */
-
-struct ib_srq *ib_create_srq(struct ib_pd *pd,
- struct ib_srq_init_attr *srq_init_attr)
-{
- struct ib_srq *srq;
-
- if (!pd->device->create_srq)
- return ERR_PTR(-ENOSYS);
-
- srq = pd->device->create_srq(pd, srq_init_attr, NULL);
-
- if (!IS_ERR(srq)) {
- srq->device = pd->device;
- srq->pd = pd;
- srq->uobject = NULL;
- srq->event_handler = srq_init_attr->event_handler;
- srq->srq_context = srq_init_attr->srq_context;
- srq->srq_type = srq_init_attr->srq_type;
- if (srq->srq_type == IB_SRQT_XRC) {
- srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
- srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq;
- atomic_inc(&srq->ext.xrc.xrcd->usecnt);
- atomic_inc(&srq->ext.xrc.cq->usecnt);
- }
- atomic_inc(&pd->usecnt);
- atomic_set(&srq->usecnt, 0);
- }
-
- return srq;
-}
-EXPORT_SYMBOL(ib_create_srq);
-
-int ib_modify_srq(struct ib_srq *srq,
- struct ib_srq_attr *srq_attr,
- enum ib_srq_attr_mask srq_attr_mask)
-{
- return srq->device->modify_srq ?
- srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
- -ENOSYS;
-}
-EXPORT_SYMBOL(ib_modify_srq);
-
-int ib_query_srq(struct ib_srq *srq,
- struct ib_srq_attr *srq_attr)
-{
- return srq->device->query_srq ?
- srq->device->query_srq(srq, srq_attr) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_srq);
-
-int ib_query_values(struct ib_device *device,
- int q_values, struct ib_device_values *values)
-{
- return device->query_values ?
- device->query_values(device, q_values, values) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_values);
-
-int ib_destroy_srq(struct ib_srq *srq)
-{
- struct ib_pd *pd;
- enum ib_srq_type srq_type;
- struct ib_xrcd *uninitialized_var(xrcd);
- struct ib_cq *uninitialized_var(cq);
- int ret;
-
- if (atomic_read(&srq->usecnt))
- return -EBUSY;
-
- pd = srq->pd;
- srq_type = srq->srq_type;
- if (srq_type == IB_SRQT_XRC) {
- xrcd = srq->ext.xrc.xrcd;
- cq = srq->ext.xrc.cq;
- }
-
- ret = srq->device->destroy_srq(srq);
- if (!ret) {
- atomic_dec(&pd->usecnt);
- if (srq_type == IB_SRQT_XRC) {
- atomic_dec(&xrcd->usecnt);
- atomic_dec(&cq->usecnt);
- }
- }
-
- return ret;
-}
-EXPORT_SYMBOL(ib_destroy_srq);
-
-/* Queue pairs */
-
-static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
-{
- struct ib_qp *qp = context;
- unsigned long flags;
-
- /* The code below must be synced with deletions of existing qps (ib_close_qp) --
- * because a qp from the list may be closed during the scan, resulting in a kernel Oops.
- */
- spin_lock_irqsave(&qp->device->event_handler_lock, flags);
- list_for_each_entry(event->element.qp, &qp->open_list, open_list)
- if (event->element.qp->event_handler)
- event->element.qp->event_handler(event, event->element.qp->qp_context);
- spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
-}
-
-static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
-{
- mutex_lock(&xrcd->tgt_qp_mutex);
- list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
- mutex_unlock(&xrcd->tgt_qp_mutex);
-}
-
-static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
- void (*event_handler)(struct ib_event *, void *),
- void *qp_context)
-{
- struct ib_qp *qp;
- unsigned long flags;
-
- qp = kzalloc(sizeof *qp, GFP_KERNEL);
- if (!qp)
- return ERR_PTR(-ENOMEM);
-
- qp->real_qp = real_qp;
- atomic_inc(&real_qp->usecnt);
- qp->device = real_qp->device;
- qp->event_handler = event_handler;
- qp->qp_context = qp_context;
- qp->qp_num = real_qp->qp_num;
- qp->qp_type = real_qp->qp_type;
-
- spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
- list_add(&qp->open_list, &real_qp->open_list);
- spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
-
- return qp;
-}
-
-struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
- struct ib_qp_open_attr *qp_open_attr)
-{
- struct ib_qp *qp, *real_qp;
-
- if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
- return ERR_PTR(-EINVAL);
-
- qp = ERR_PTR(-EINVAL);
- mutex_lock(&xrcd->tgt_qp_mutex);
- list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
- if (real_qp->qp_num == qp_open_attr->qp_num) {
- qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
- qp_open_attr->qp_context);
- break;
- }
- }
- mutex_unlock(&xrcd->tgt_qp_mutex);
- return qp;
-}
-EXPORT_SYMBOL(ib_open_qp);
-
-struct ib_qp *ib_create_qp(struct ib_pd *pd,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct ib_qp *qp, *real_qp;
- struct ib_device *device;
-
- device = pd ? pd->device : qp_init_attr->xrcd->device;
- qp = device->create_qp(pd, qp_init_attr, NULL);
-
- if (!IS_ERR(qp)) {
- qp->device = device;
- qp->real_qp = qp;
- qp->uobject = NULL;
- qp->qp_type = qp_init_attr->qp_type;
-
- atomic_set(&qp->usecnt, 0);
- if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
- qp->event_handler = __ib_shared_qp_event_handler;
- qp->qp_context = qp;
- qp->pd = NULL;
- qp->send_cq = qp->recv_cq = NULL;
- qp->srq = NULL;
- qp->xrcd = qp_init_attr->xrcd;
- atomic_inc(&qp_init_attr->xrcd->usecnt);
- INIT_LIST_HEAD(&qp->open_list);
-
- real_qp = qp;
- qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
- qp_init_attr->qp_context);
- if (!IS_ERR(qp))
- __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
- else
- real_qp->device->destroy_qp(real_qp);
- } else {
- qp->event_handler = qp_init_attr->event_handler;
- qp->qp_context = qp_init_attr->qp_context;
- if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
- qp->recv_cq = NULL;
- qp->srq = NULL;
- } else {
- qp->recv_cq = qp_init_attr->recv_cq;
- atomic_inc(&qp_init_attr->recv_cq->usecnt);
- qp->srq = qp_init_attr->srq;
- if (qp->srq)
- atomic_inc(&qp_init_attr->srq->usecnt);
- }
-
- qp->pd = pd;
- qp->send_cq = qp_init_attr->send_cq;
- qp->xrcd = NULL;
-
- atomic_inc(&pd->usecnt);
- atomic_inc(&qp_init_attr->send_cq->usecnt);
- }
- }
-
- return qp;
-}
-EXPORT_SYMBOL(ib_create_qp);
-
-static const struct {
- int valid;
- enum ib_qp_attr_mask req_param[IB_QPT_MAX];
- enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX];
- enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
- enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX];
-} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
- [IB_QPS_RESET] = {
- [IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_INIT] = {
- .valid = 1,
- .req_param = {
- [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_QKEY),
- [IB_QPT_RAW_PACKET] = IB_QP_PORT,
- [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS |
- IB_QP_DC_KEY),
- [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- },
- .opt_param = {
- [IB_QPT_UD] = IB_QP_GROUP_RSS,
- [IB_QPT_RAW_PACKET] = IB_QP_GROUP_RSS
- }
- },
- },
- [IB_QPS_INIT] = {
- [IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_ERR] = { .valid = 1 },
- [IB_QPS_INIT] = {
- .valid = 1,
- .opt_param = {
- [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
- IB_QP_PORT |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- }
- },
- [IB_QPS_RTR] = {
- .valid = 1,
- .req_param = {
- [IB_QPT_UC] = (IB_QP_AV |
- IB_QP_PATH_MTU |
- IB_QP_DEST_QPN |
- IB_QP_RQ_PSN),
- [IB_QPT_RC] = (IB_QP_AV |
- IB_QP_PATH_MTU |
- IB_QP_DEST_QPN |
- IB_QP_RQ_PSN |
- IB_QP_MAX_DEST_RD_ATOMIC |
- IB_QP_MIN_RNR_TIMER),
- [IB_QPT_DC_INI] = (IB_QP_PATH_MTU |
- IB_QP_MAX_DEST_RD_ATOMIC |
- IB_QP_MIN_RNR_TIMER),
- [IB_QPT_XRC_INI] = (IB_QP_AV |
- IB_QP_PATH_MTU |
- IB_QP_DEST_QPN |
- IB_QP_RQ_PSN),
- [IB_QPT_XRC_TGT] = (IB_QP_AV |
- IB_QP_PATH_MTU |
- IB_QP_DEST_QPN |
- IB_QP_RQ_PSN |
- IB_QP_MAX_DEST_RD_ATOMIC |
- IB_QP_MIN_RNR_TIMER),
- },
- .req_param_add_eth = {
- [IB_QPT_RC] = (IB_QP_SMAC),
- [IB_QPT_UC] = (IB_QP_SMAC),
- [IB_QPT_XRC_INI] = (IB_QP_SMAC),
- [IB_QPT_XRC_TGT] = (IB_QP_SMAC)
- },
- .opt_param = {
- [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX),
- [IB_QPT_RC] = (IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX),
- [IB_QPT_DC_INI] = (IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX),
- [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX),
- [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX),
- [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_RAW_PACKET] = IB_QP_AV,
- },
- .opt_param_add_eth = {
- [IB_QPT_RC] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID),
- [IB_QPT_UC] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID),
- [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID),
- [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC |
- IB_QP_VID |
- IB_QP_ALT_VID)
- }
- }
- },
- [IB_QPS_RTR] = {
- [IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_ERR] = { .valid = 1 },
- [IB_QPS_RTS] = {
- .valid = 1,
- .req_param = {
- [IB_QPT_UD] = IB_QP_SQ_PSN,
- [IB_QPT_UC] = IB_QP_SQ_PSN,
- [IB_QPT_RC] = (IB_QP_TIMEOUT |
- IB_QP_RETRY_CNT |
- IB_QP_RNR_RETRY |
- IB_QP_SQ_PSN |
- IB_QP_MAX_QP_RD_ATOMIC),
- [IB_QPT_DC_INI] = (IB_QP_TIMEOUT |
- IB_QP_RETRY_CNT |
- IB_QP_RNR_RETRY |
- IB_QP_MAX_QP_RD_ATOMIC),
- [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT |
- IB_QP_RETRY_CNT |
- IB_QP_RNR_RETRY |
- IB_QP_SQ_PSN |
- IB_QP_MAX_QP_RD_ATOMIC),
- [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT |
- IB_QP_SQ_PSN),
- [IB_QPT_SMI] = IB_QP_SQ_PSN,
- [IB_QPT_GSI] = IB_QP_SQ_PSN,
- },
- .opt_param = {
- [IB_QPT_UD] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_RC] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_MIN_RNR_TIMER |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_DC_INI] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_MIN_RNR_TIMER |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_MIN_RNR_TIMER |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_SMI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- }
- }
- },
- [IB_QPS_RTS] = {
- [IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_ERR] = { .valid = 1 },
- [IB_QPS_RTS] = {
- .valid = 1,
- .opt_param = {
- [IB_QPT_UD] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_CUR_STATE |
- IB_QP_ACCESS_FLAGS |
- IB_QP_ALT_PATH |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_RC] = (IB_QP_CUR_STATE |
- IB_QP_ACCESS_FLAGS |
- IB_QP_ALT_PATH |
- IB_QP_PATH_MIG_STATE |
- IB_QP_MIN_RNR_TIMER),
- [IB_QPT_DC_INI] = (IB_QP_CUR_STATE |
- IB_QP_ACCESS_FLAGS |
- IB_QP_ALT_PATH |
- IB_QP_PATH_MIG_STATE |
- IB_QP_MIN_RNR_TIMER),
- [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
- IB_QP_ACCESS_FLAGS |
- IB_QP_ALT_PATH |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
- IB_QP_ACCESS_FLAGS |
- IB_QP_ALT_PATH |
- IB_QP_PATH_MIG_STATE |
- IB_QP_MIN_RNR_TIMER),
- [IB_QPT_SMI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- }
- },
- [IB_QPS_SQD] = {
- .valid = 1,
- .opt_param = {
- [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
- [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
- [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
- [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
- [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
- [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
- [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
- }
- },
- },
- [IB_QPS_SQD] = {
- [IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_ERR] = { .valid = 1 },
- [IB_QPS_RTS] = {
- .valid = 1,
- .opt_param = {
- [IB_QPT_UD] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_RC] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_MIN_RNR_TIMER |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_MIN_RNR_TIMER |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_SMI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- }
- },
- [IB_QPS_SQD] = {
- .valid = 1,
- .opt_param = {
- [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_AV |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_RC] = (IB_QP_PORT |
- IB_QP_AV |
- IB_QP_TIMEOUT |
- IB_QP_RETRY_CNT |
- IB_QP_RNR_RETRY |
- IB_QP_MAX_QP_RD_ATOMIC |
- IB_QP_MAX_DEST_RD_ATOMIC |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX |
- IB_QP_MIN_RNR_TIMER |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC_INI] = (IB_QP_PORT |
- IB_QP_AV |
- IB_QP_TIMEOUT |
- IB_QP_RETRY_CNT |
- IB_QP_RNR_RETRY |
- IB_QP_MAX_QP_RD_ATOMIC |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_XRC_TGT] = (IB_QP_PORT |
- IB_QP_AV |
- IB_QP_TIMEOUT |
- IB_QP_MAX_DEST_RD_ATOMIC |
- IB_QP_ALT_PATH |
- IB_QP_ACCESS_FLAGS |
- IB_QP_PKEY_INDEX |
- IB_QP_MIN_RNR_TIMER |
- IB_QP_PATH_MIG_STATE),
- [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
- IB_QP_QKEY),
- }
- }
- },
- [IB_QPS_SQE] = {
- [IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_ERR] = { .valid = 1 },
- [IB_QPS_RTS] = {
- .valid = 1,
- .opt_param = {
- [IB_QPT_UD] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_UC] = (IB_QP_CUR_STATE |
- IB_QP_ACCESS_FLAGS),
- [IB_QPT_SMI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- [IB_QPT_GSI] = (IB_QP_CUR_STATE |
- IB_QP_QKEY),
- }
- }
- },
- [IB_QPS_ERR] = {
- [IB_QPS_RESET] = { .valid = 1 },
- [IB_QPS_ERR] = { .valid = 1 }
- }
-};
-
-int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
- enum ib_qp_type type, enum ib_qp_attr_mask mask,
- enum rdma_link_layer ll)
-{
- enum ib_qp_attr_mask req_param, opt_param;
-
- if (cur_state < 0 || cur_state > IB_QPS_ERR ||
- next_state < 0 || next_state > IB_QPS_ERR)
- return 0;
-
- if (mask & IB_QP_CUR_STATE &&
- cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
- cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
- return 0;
-
- if (!qp_state_table[cur_state][next_state].valid)
- return 0;
-
- req_param = qp_state_table[cur_state][next_state].req_param[type];
- opt_param = qp_state_table[cur_state][next_state].opt_param[type];
-
- if (ll == IB_LINK_LAYER_ETHERNET) {
- req_param |= qp_state_table[cur_state][next_state].
- req_param_add_eth[type];
- opt_param |= qp_state_table[cur_state][next_state].
- opt_param_add_eth[type];
- }
-
- if ((mask & req_param) != req_param)
- return 0;
-
- if (mask & ~(req_param | opt_param | IB_QP_STATE))
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL(ib_modify_qp_is_ok);
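
For reference, the table-driven checker above is typically consumed as in the following sketch (not part of this change; helper name and values illustrative):

static int
example_check_transition(struct ib_qp *qp, struct ib_qp_attr *attr,
    int attr_mask, enum ib_qp_state cur_state)
{
	enum ib_qp_state new_state;
	enum rdma_link_layer ll;

	new_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state;
	ll = rdma_port_get_link_layer(qp->device, qp->port_num);

	/* Returns 1 when the (cur_state, new_state, mask) combination is
	 * legal for this QP type on this link layer, 0 otherwise. */
	if (!ib_modify_qp_is_ok(cur_state, new_state, qp->qp_type,
	    attr_mask, ll))
		return -EINVAL;
	return 0;
}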
-
-int ib_modify_qp(struct ib_qp *qp,
- struct ib_qp_attr *qp_attr,
- int qp_attr_mask)
-{
- int ret;
-
- ret = qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
- if (!ret && (qp_attr_mask & IB_QP_PORT))
- qp->port_num = qp_attr->port_num;
-
- return ret;
-}
-EXPORT_SYMBOL(ib_modify_qp);
-
-int ib_query_qp(struct ib_qp *qp,
- struct ib_qp_attr *qp_attr,
- int qp_attr_mask,
- struct ib_qp_init_attr *qp_init_attr)
-{
- return qp->device->query_qp ?
- qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
- -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_qp);
-
-int ib_close_qp(struct ib_qp *qp)
-{
- struct ib_qp *real_qp;
- unsigned long flags;
-
- real_qp = qp->real_qp;
- if (real_qp == qp)
- return -EINVAL;
-
- spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
- list_del(&qp->open_list);
- spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
-
- atomic_dec(&real_qp->usecnt);
- kfree(qp);
-
- return 0;
-}
-EXPORT_SYMBOL(ib_close_qp);
-
-static int __ib_destroy_shared_qp(struct ib_qp *qp)
-{
- struct ib_xrcd *xrcd;
- struct ib_qp *real_qp;
- int ret;
-
- real_qp = qp->real_qp;
- xrcd = real_qp->xrcd;
-
- mutex_lock(&xrcd->tgt_qp_mutex);
- ib_close_qp(qp);
- if (atomic_read(&real_qp->usecnt) == 0)
- list_del(&real_qp->xrcd_list);
- else
- real_qp = NULL;
- mutex_unlock(&xrcd->tgt_qp_mutex);
-
- if (real_qp) {
- ret = ib_destroy_qp(real_qp);
- if (!ret)
- atomic_dec(&xrcd->usecnt);
- else
- __ib_insert_xrcd_qp(xrcd, real_qp);
- }
-
- return 0;
-}
-
-int ib_destroy_qp(struct ib_qp *qp)
-{
- struct ib_pd *pd;
- struct ib_cq *scq, *rcq;
- struct ib_srq *srq;
- int ret;
-
- if (atomic_read(&qp->usecnt))
- return -EBUSY;
-
- if (qp->real_qp != qp)
- return __ib_destroy_shared_qp(qp);
-
- pd = qp->pd;
- scq = qp->send_cq;
- rcq = qp->recv_cq;
- srq = qp->srq;
-
- ret = qp->device->destroy_qp(qp);
- if (!ret) {
- if (pd)
- atomic_dec(&pd->usecnt);
- if (scq)
- atomic_dec(&scq->usecnt);
- if (rcq)
- atomic_dec(&rcq->usecnt);
- if (srq)
- atomic_dec(&srq->usecnt);
- }
-
- return ret;
-}
-EXPORT_SYMBOL(ib_destroy_qp);
-
-/* Completion queues */
-
-struct ib_cq *ib_create_cq(struct ib_device *device,
- ib_comp_handler comp_handler,
- void (*event_handler)(struct ib_event *, void *),
- void *cq_context, int cqe, int comp_vector)
-{
- struct ib_cq *cq;
- struct ib_cq_init_attr attr = {
- .cqe = cqe,
- .comp_vector = comp_vector,
- .flags = 0,
- };
-
- cq = device->create_cq(device, &attr, NULL, NULL);
-
- if (!IS_ERR(cq)) {
- cq->device = device;
- cq->uobject = NULL;
- cq->comp_handler = comp_handler;
- cq->event_handler = event_handler;
- cq->cq_context = cq_context;
- atomic_set(&cq->usecnt, 0);
- }
-
- return cq;
-}
-EXPORT_SYMBOL(ib_create_cq);
-
-int ib_modify_cq(struct ib_cq *cq,
- struct ib_cq_attr *cq_attr,
- int cq_attr_mask)
-{
- return cq->device->modify_cq ?
- cq->device->modify_cq(cq, cq_attr, cq_attr_mask) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_modify_cq);
-
-int ib_destroy_cq(struct ib_cq *cq)
-{
- if (atomic_read(&cq->usecnt))
- return -EBUSY;
-
- return cq->device->destroy_cq(cq);
-}
-EXPORT_SYMBOL(ib_destroy_cq);
-
-int ib_resize_cq(struct ib_cq *cq, int cqe)
-{
- return cq->device->resize_cq ?
- cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_resize_cq);
-
-/* Memory regions */
-
-struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
-{
- struct ib_mr *mr;
- int err;
-
- err = ib_check_mr_access(mr_access_flags);
- if (err)
- return ERR_PTR(err);
-
- mr = pd->device->get_dma_mr(pd, mr_access_flags);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_get_dma_mr);
-
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_mr *mr;
- int err;
-
- err = ib_check_mr_access(mr_access_flags);
- if (err)
- return ERR_PTR(err);
-
- if (!pd->device->reg_phys_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_reg_phys_mr);
-
-int ib_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_pd *old_pd;
- int ret;
-
- ret = ib_check_mr_access(mr_access_flags);
- if (ret)
- return ret;
-
- if (!mr->device->rereg_phys_mr)
- return -ENOSYS;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- old_pd = mr->pd;
-
- ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
- phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
- atomic_dec(&old_pd->usecnt);
- atomic_inc(&pd->usecnt);
- }
-
- return ret;
-}
-EXPORT_SYMBOL(ib_rereg_phys_mr);
-
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
- return mr->device->query_mr ?
- mr->device->query_mr(mr, mr_attr) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_mr);
-
-int ib_dereg_mr(struct ib_mr *mr)
-{
- struct ib_pd *pd;
- int ret;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- pd = mr->pd;
- ret = mr->device->dereg_mr(mr);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_dereg_mr);
-
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
- struct ib_mr_init_attr *mr_init_attr)
-{
- struct ib_mr *mr;
-
- if (!pd->device->create_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->create_mr(pd, mr_init_attr);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_create_mr);
-
-int ib_destroy_mr(struct ib_mr *mr)
-{
- struct ib_pd *pd;
- int ret;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- pd = mr->pd;
- ret = mr->device->destroy_mr(mr);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_destroy_mr);
-
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
-{
- struct ib_mr *mr;
-
- if (!pd->device->alloc_fast_reg_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
-
-struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
- int max_page_list_len)
-{
- struct ib_fast_reg_page_list *page_list;
-
- if (!device->alloc_fast_reg_page_list)
- return ERR_PTR(-ENOSYS);
-
- page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
-
- if (!IS_ERR(page_list)) {
- page_list->device = device;
- page_list->max_page_list_len = max_page_list_len;
- }
-
- return page_list;
-}
-EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
-
-void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
-{
- page_list->device->free_fast_reg_page_list(page_list);
-}
-EXPORT_SYMBOL(ib_free_fast_reg_page_list);
-
-/* Memory windows */
-
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
- struct ib_mw *mw;
-
- if (!pd->device->alloc_mw)
- return ERR_PTR(-ENOSYS);
-
- mw = pd->device->alloc_mw(pd, type);
- if (!IS_ERR(mw)) {
- mw->device = pd->device;
- mw->pd = pd;
- mw->uobject = NULL;
- mw->type = type;
- atomic_inc(&pd->usecnt);
- }
-
- return mw;
-}
-EXPORT_SYMBOL(ib_alloc_mw);
-
-int ib_dealloc_mw(struct ib_mw *mw)
-{
- struct ib_pd *pd;
- int ret;
-
- pd = mw->pd;
- ret = mw->device->dealloc_mw(mw);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_dealloc_mw);
-
-/* "Fast" memory regions */
-
-struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
- int mr_access_flags,
- struct ib_fmr_attr *fmr_attr)
-{
- struct ib_fmr *fmr;
-
- if (!pd->device->alloc_fmr)
- return ERR_PTR(-ENOSYS);
-
- fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
- if (!IS_ERR(fmr)) {
- fmr->device = pd->device;
- fmr->pd = pd;
- atomic_inc(&pd->usecnt);
- }
-
- return fmr;
-}
-EXPORT_SYMBOL(ib_alloc_fmr);
-
-int ib_unmap_fmr(struct list_head *fmr_list)
-{
- struct ib_fmr *fmr;
-
- if (list_empty(fmr_list))
- return 0;
-
- fmr = list_entry(fmr_list->next, struct ib_fmr, list);
- return fmr->device->unmap_fmr(fmr_list);
-}
-EXPORT_SYMBOL(ib_unmap_fmr);
-
-int ib_dealloc_fmr(struct ib_fmr *fmr)
-{
- struct ib_pd *pd;
- int ret;
-
- pd = fmr->pd;
- ret = fmr->device->dealloc_fmr(fmr);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_dealloc_fmr);
-
-/* Multicast groups */
-
-int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
-{
- int ret;
-
- if (!qp->device->attach_mcast)
- return -ENOSYS;
-
- switch (rdma_node_get_transport(qp->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) &&
- qp->qp_type != IB_QPT_RAW_PACKET)
- return -EINVAL;
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
- if (qp->qp_type != IB_QPT_RAW_PACKET)
- return -EINVAL;
- break;
- }
-
- ret = qp->device->attach_mcast(qp, gid, lid);
- if (!ret)
- atomic_inc(&qp->usecnt);
- return ret;
-}
-EXPORT_SYMBOL(ib_attach_mcast);
-
-int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
-{
- int ret;
-
- if (!qp->device->detach_mcast)
- return -ENOSYS;
-
- switch (rdma_node_get_transport(qp->device->node_type)) {
- case RDMA_TRANSPORT_IB:
- if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) &&
- qp->qp_type != IB_QPT_RAW_PACKET)
- return -EINVAL;
- break;
- case RDMA_TRANSPORT_IWARP:
- case RDMA_TRANSPORT_SCIF:
-
- if (qp->qp_type != IB_QPT_RAW_PACKET)
- return -EINVAL;
- break;
- }
-
- ret = qp->device->detach_mcast(qp, gid, lid);
- if (!ret)
- atomic_dec(&qp->usecnt);
- return ret;
-}
-EXPORT_SYMBOL(ib_detach_mcast);
-
-struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
-{
- struct ib_xrcd *xrcd;
-
- if (!device->alloc_xrcd)
- return ERR_PTR(-ENOSYS);
-
- xrcd = device->alloc_xrcd(device, NULL, NULL);
- if (!IS_ERR(xrcd)) {
- xrcd->device = device;
- xrcd->inode = NULL;
- atomic_set(&xrcd->usecnt, 0);
- mutex_init(&xrcd->tgt_qp_mutex);
- INIT_LIST_HEAD(&xrcd->tgt_qp_list);
- }
-
- return xrcd;
-}
-EXPORT_SYMBOL(ib_alloc_xrcd);
-
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
-{
- struct ib_qp *qp;
- int ret;
-
- if (atomic_read(&xrcd->usecnt))
- return -EBUSY;
-
- while (!list_empty(&xrcd->tgt_qp_list)) {
- qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
- ret = ib_destroy_qp(qp);
- if (ret)
- return ret;
- }
-
- return xrcd->device->dealloc_xrcd(xrcd);
-}
-EXPORT_SYMBOL(ib_dealloc_xrcd);
-
-struct ib_flow *ib_create_flow(struct ib_qp *qp,
- struct ib_flow_attr *flow_attr,
- int domain)
-{
- struct ib_flow *flow_id;
- if (!qp->device->create_flow)
- return ERR_PTR(-ENOSYS);
-
- flow_id = qp->device->create_flow(qp, flow_attr, domain);
- if (!IS_ERR(flow_id))
- atomic_inc(&qp->usecnt);
- return flow_id;
-}
-EXPORT_SYMBOL(ib_create_flow);
-
-int ib_destroy_flow(struct ib_flow *flow_id)
-{
- int err;
- struct ib_qp *qp;
-
- if (!flow_id)
- return -EINVAL;
- qp = flow_id->qp;
- if (!qp->device->destroy_flow)
- return -ENOSYS;
- err = qp->device->destroy_flow(flow_id);
- if (!err)
- atomic_dec(&qp->usecnt);
- return err;
-}
-EXPORT_SYMBOL(ib_destroy_flow);
-
-struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr,
- struct ib_udata *udata)
-{
- struct ib_dct *dct;
-
- if (!pd->device->exp_create_dct)
- return ERR_PTR(-ENOSYS);
-
- dct = pd->device->exp_create_dct(pd, attr, udata);
- if (!IS_ERR(dct)) {
- dct->pd = pd;
- dct->srq = attr->srq;
- dct->cq = attr->cq;
- atomic_inc(&dct->srq->usecnt);
- atomic_inc(&dct->cq->usecnt);
- atomic_inc(&dct->pd->usecnt);
- }
-
- return dct;
-}
-EXPORT_SYMBOL(ib_create_dct);
-
-int ib_destroy_dct(struct ib_dct *dct)
-{
- int err;
-
- if (!dct->device->exp_destroy_dct)
- return -ENOSYS;
-
- err = dct->device->exp_destroy_dct(dct);
- if (!err) {
- atomic_dec(&dct->srq->usecnt);
- atomic_dec(&dct->cq->usecnt);
- atomic_dec(&dct->pd->usecnt);
- }
-
- return err;
-}
-EXPORT_SYMBOL(ib_destroy_dct);
-
-int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr)
-{
- if (!dct->device->exp_query_dct)
- return -ENOSYS;
-
- return dct->device->exp_query_dct(dct, attr);
-}
-EXPORT_SYMBOL(ib_query_dct);
-
-int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
- struct ib_mr_status *mr_status)
-{
- return mr->device->check_mr_status ?
- mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_check_mr_status);
diff --git a/sys/ofed/drivers/infiniband/debug/Makefile b/sys/ofed/drivers/infiniband/debug/Makefile
deleted file mode 100644
index e9d9f4b8fd21e..0000000000000
--- a/sys/ofed/drivers/infiniband/debug/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-EXTRA_CFLAGS := $(subst $(KERNEL_MEMTRACK_CFLAGS),,$(EXTRA_CFLAGS))
-
-obj-m += memtrack.o
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.c b/sys/ofed/drivers/infiniband/debug/memtrack.c
deleted file mode 100644
index cfbc4872a2b24..0000000000000
--- a/sys/ofed/drivers/infiniband/debug/memtrack.c
+++ /dev/null
@@ -1,960 +0,0 @@
-/*
- This software is available to you under a choice of one of two
- licenses. You may choose to be licensed under the terms of the GNU
- General Public License (GPL) Version 2, available at
- <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
- license, available in the LICENSE.TXT file accompanying this
- software. These details are also available at
- <http://openib.org/license.html>.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
-*/
-
-#define LINUXKPI_PARAM_PREFIX memtrack_
-
-#define C_MEMTRACK_C
-
-#ifdef kmalloc
- #undef kmalloc
-#endif
-#ifdef kmemdup
- #undef kmemdup
-#endif
-#ifdef kfree
- #undef kfree
-#endif
-#ifdef vmalloc
- #undef vmalloc
-#endif
-#ifdef vzalloc
- #undef vzalloc
-#endif
-#ifdef vzalloc_node
- #undef vzalloc_node
-#endif
-#ifdef vfree
- #undef vfree
-#endif
-#ifdef kmem_cache_alloc
- #undef kmem_cache_alloc
-#endif
-#ifdef kmem_cache_free
- #undef kmem_cache_free
-#endif
-#ifdef ioremap
- #undef ioremap
-#endif
-#ifdef io_mapping_create_wc
- #undef io_mapping_create_wc
-#endif
-#ifdef io_mapping_free
- #undef io_mapping_free
-#endif
-#ifdef ioremap_nocache
- #undef ioremap_nocache
-#endif
-#ifdef iounmap
- #undef iounmap
-#endif
-#ifdef alloc_pages
- #undef alloc_pages
-#endif
-#ifdef free_pages
- #undef free_pages
-#endif
-#ifdef get_page
- #undef get_page
-#endif
-#ifdef put_page
- #undef put_page
-#endif
-#ifdef create_workqueue
- #undef create_workqueue
-#endif
-#ifdef create_rt_workqueue
- #undef create_rt_workqueue
-#endif
-#ifdef create_freezeable_workqueue
- #undef create_freezeable_workqueue
-#endif
-#ifdef create_singlethread_workqueue
- #undef create_singlethread_workqueue
-#endif
-#ifdef destroy_workqueue
- #undef destroy_workqueue
-#endif
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-#include <asm/uaccess.h>
-#include <linux/proc_fs.h>
-#include <linux/random.h>
-#include "memtrack.h"
-
-#include <linux/moduleparam.h>
-
-
-MODULE_AUTHOR("Mellanox Technologies LTD.");
-MODULE_DESCRIPTION("Memory allocations tracking");
-MODULE_LICENSE("GPL");
-
-#define MEMTRACK_HASH_SZ ((1<<15)-19) /* prime: http://www.utm.edu/research/primes/lists/2small/0bit.html */
-#define MAX_FILENAME_LEN 31
-
-#define memtrack_spin_lock(spl, flags) spin_lock_irqsave(spl, flags)
-#define memtrack_spin_unlock(spl, flags) spin_unlock_irqrestore(spl, flags)
-
-/* if a bit is set then the corresponding allocation is tracked.
- bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */
-static unsigned long track_mask = -1; /* effectively everything */
-module_param(track_mask, ulong, 0444);
-MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked");
-
-/* if a bit is set then the corresponding allocation is strictly tracked.
- That is, before inserting the whole range is checked to not overlap any
- of the allocations already in the database */
-static unsigned long strict_track_mask = 0; /* no strict tracking */
-module_param(strict_track_mask, ulong, 0444);
-MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking");
-
-/* Sets the frequency of allocation failure injections;
-   if set to 0, all allocations should succeed */
-static unsigned int inject_freq = 0;
-module_param(inject_freq, uint, 0644);
-MODULE_PARM_DESC(inject_freq, "Error injection frequency, default is 0 (disabled)");
-
-static int random_mem = 1;
-module_param(random_mem, int, 0644);
-MODULE_PARM_DESC(random_mem, "When set, randomize allocated memory, default is 1 (enabled)");
-
-struct memtrack_meminfo_t {
- unsigned long addr;
- unsigned long size;
- unsigned long line_num;
- unsigned long dev;
- unsigned long addr2;
- int direction;
- struct memtrack_meminfo_t *next;
- struct list_head list; /* used to link all items from a certain type together */
- char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. packing */
- char ext_info[32];
-};
-
-static struct kmem_cache *meminfo_cache;
-
-struct tracked_obj_desc_t {
- struct memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ];
- spinlock_t hash_lock;
- unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */
- struct list_head tracked_objs_head; /* head of list of all objects */
- int strict_track; /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */
-};
-
-static struct tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES];
-
-static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = {
- "kmalloc",
- "vmalloc",
- "kmem_cache_alloc",
- "io_remap",
- "create_workqueue",
- "alloc_pages",
- "ib_dma_map_single",
- "ib_dma_map_page",
- "ib_dma_map_sg"
-};
-
-static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = {
- "kfree",
- "vfree",
- "kmem_cache_free",
- "io_unmap",
-	"destroy_workqueue",
- "free_pages",
- "ib_dma_unmap_single",
- "ib_dma_unmap_page",
- "ib_dma_unmap_sg"
-};
-
-static inline const char *memtype_alloc_str(enum memtrack_memtype_t memtype)
-{
- switch (memtype) {
- case MEMTRACK_KMALLOC:
- case MEMTRACK_VMALLOC:
- case MEMTRACK_KMEM_OBJ:
- case MEMTRACK_IOREMAP:
- case MEMTRACK_WORK_QUEUE:
- case MEMTRACK_PAGE_ALLOC:
- case MEMTRACK_DMA_MAP_SINGLE:
- case MEMTRACK_DMA_MAP_PAGE:
- case MEMTRACK_DMA_MAP_SG:
- return rsc_names[memtype];
- default:
- return "(Unknown allocation type)";
- }
-}
-
-static inline const char *memtype_free_str(enum memtrack_memtype_t memtype)
-{
- switch (memtype) {
- case MEMTRACK_KMALLOC:
- case MEMTRACK_VMALLOC:
- case MEMTRACK_KMEM_OBJ:
- case MEMTRACK_IOREMAP:
- case MEMTRACK_WORK_QUEUE:
- case MEMTRACK_PAGE_ALLOC:
- case MEMTRACK_DMA_MAP_SINGLE:
- case MEMTRACK_DMA_MAP_PAGE:
- case MEMTRACK_DMA_MAP_SG:
- return rsc_free_names[memtype];
- default:
- return "(Unknown allocation type)";
- }
-}
-
-/*
- * overlap_a_b
- */
-static inline int overlap_a_b(unsigned long a_start, unsigned long a_end,
- unsigned long b_start, unsigned long b_end)
-{
- if ((b_start > a_end) || (a_start > b_end))
- return 0;
-
- return 1;
-}
-
-/*
- * check_overlap
- */
-static void check_overlap(enum memtrack_memtype_t memtype,
- struct memtrack_meminfo_t *mem_info_p,
- struct tracked_obj_desc_t *obj_desc_p)
-{
- struct list_head *pos, *next;
- struct memtrack_meminfo_t *cur;
- unsigned long start_a, end_a, start_b, end_b;
-
- start_a = mem_info_p->addr;
- end_a = mem_info_p->addr + mem_info_p->size - 1;
-
- list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) {
- cur = list_entry(pos, struct memtrack_meminfo_t, list);
-
- start_b = cur->addr;
- end_b = cur->addr + cur->size - 1;
-
- if (overlap_a_b(start_a, end_a, start_b, end_b))
- printk(KERN_ERR "%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n",
- memtype_alloc_str(memtype), mem_info_p->addr,
- mem_info_p->addr + mem_info_p->size - 1, cur->addr,
- cur->addr + cur->size - 1);
- }
-}
-
-/* Invoke on memory allocation */
-void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev,
- unsigned long addr, unsigned long size, unsigned long addr2,
- int direction, const char *filename,
- const unsigned long line_num, int alloc_flags)
-{
- unsigned long hash_val;
- struct memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p;
- struct tracked_obj_desc_t *obj_desc_p;
- unsigned long flags;
-
- if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
- return;
- }
-
- if (!tracked_objs_arr[memtype]) {
- /* object is not tracked */
- return;
- }
- obj_desc_p = tracked_objs_arr[memtype];
-
- hash_val = addr % MEMTRACK_HASH_SZ;
-
- new_mem_info_p = (struct memtrack_meminfo_t *)kmem_cache_alloc(meminfo_cache, alloc_flags);
- if (new_mem_info_p == NULL) {
- printk(KERN_ERR "%s: Failed allocating kmem_cache item for new mem_info. "
- "Lost tracking on allocation at %s:%lu...\n", __func__,
- filename, line_num);
- return;
- }
- /* save allocation properties */
- new_mem_info_p->addr = addr;
- new_mem_info_p->size = size;
- new_mem_info_p->dev = dev;
- new_mem_info_p->addr2 = addr2;
- new_mem_info_p->direction = direction;
-
- new_mem_info_p->line_num = line_num;
- *new_mem_info_p->ext_info = '\0';
-	/* Make sure that we will print out the path tail if the given filename is longer
-	 * than MAX_FILENAME_LEN. (Otherwise, we would not see the name of the actual file
-	 * in the printout -- only the path head!)
-	 */
- if (strlen(filename) > MAX_FILENAME_LEN)
- strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN);
- else
- strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN);
-
- new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */
-
- memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
- /* make sure given memory location is not already allocated */
- if ((memtype != MEMTRACK_DMA_MAP_SINGLE) && (memtype != MEMTRACK_DMA_MAP_PAGE) &&
- (memtype != MEMTRACK_DMA_MAP_SG)) {
-
- /* make sure given memory location is not already allocated */
- cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
- while (cur_mem_info_p != NULL) {
- if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) {
- /* Found given address in the database */
- printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n",
- __func__, filename, line_num,
- memtype_alloc_str(memtype), addr,
- cur_mem_info_p->filename,
- cur_mem_info_p->line_num);
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- kmem_cache_free(meminfo_cache, new_mem_info_p);
- return;
- }
- cur_mem_info_p = cur_mem_info_p->next;
- }
- }
- /* not found - we can put in the hash bucket */
- /* link as first */
- new_mem_info_p->next = obj_desc_p->mem_hash[hash_val];
- obj_desc_p->mem_hash[hash_val] = new_mem_info_p;
- if (obj_desc_p->strict_track)
- check_overlap(memtype, new_mem_info_p, obj_desc_p);
- obj_desc_p->count += size;
- list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head);
-
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- return;
-}
-EXPORT_SYMBOL(memtrack_alloc);
-
-/* Invoke on memory free */
-void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev,
- unsigned long addr, unsigned long size, int direction,
- const char *filename, const unsigned long line_num)
-{
- unsigned long hash_val;
- struct memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p;
- struct tracked_obj_desc_t *obj_desc_p;
- unsigned long flags;
-
- if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
- return;
- }
-
- if (!tracked_objs_arr[memtype]) {
- /* object is not tracked */
- return;
- }
- obj_desc_p = tracked_objs_arr[memtype];
-
- hash_val = addr % MEMTRACK_HASH_SZ;
-
- memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
- /* find mem_info of given memory location */
- prev_mem_info_p = NULL;
- cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
- while (cur_mem_info_p != NULL) {
- if ((cur_mem_info_p->addr == addr) && (cur_mem_info_p->dev == dev)) {
- /* Found given address in the database */
- if ((memtype == MEMTRACK_DMA_MAP_SINGLE) || (memtype == MEMTRACK_DMA_MAP_PAGE) ||
- (memtype == MEMTRACK_DMA_MAP_SG)) {
- if (direction != cur_mem_info_p->direction)
- printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad direction for addr 0x%lX: alloc:0x%x, free:0x%x (allocated in %s::%lu)\n",
- __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->direction, direction,
- cur_mem_info_p->filename, cur_mem_info_p->line_num);
-
- if (size != cur_mem_info_p->size)
- printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s bad size for addr 0x%lX: size:%lu, free:%lu (allocated in %s::%lu)\n",
- __func__, filename, line_num, memtype_free_str(memtype), addr, cur_mem_info_p->size, size,
- cur_mem_info_p->filename, cur_mem_info_p->line_num);
- }
-
- /* Remove from the bucket/list */
- if (prev_mem_info_p == NULL)
- obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */
- else
- prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */
-
- list_del(&cur_mem_info_p->list);
-
- obj_desc_p->count -= cur_mem_info_p->size;
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- kmem_cache_free(meminfo_cache, cur_mem_info_p);
- return;
- }
- prev_mem_info_p = cur_mem_info_p;
- cur_mem_info_p = cur_mem_info_p->next;
- }
-
- /* not found */
- printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX, device=0x%lX\n",
- __func__, filename, line_num, memtype_free_str(memtype), addr, dev);
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- return;
-}
-EXPORT_SYMBOL(memtrack_free);
-
-/*
- * This function recognizes allocations which
- * may be released by the kernel (e.g. skb) and
- * are therefore not trackable by memtrack.
- * The allocations are recognized by the name
- * of their calling function.
- */
-int is_non_trackable_alloc_func(const char *func_name)
-{
- static const char * const str_str_arr[] = {
-		/* functions containing these strings are considered non-trackable */
- "skb",
- };
- static const char * const str_str_excep_arr[] = {
- /* functions which are exception to the str_str_arr table */
- "ipoib_cm_skb_too_long"
- };
- static const char * const str_cmp_arr[] = {
- /* functions that allocate SKBs */
- "mlx4_en_alloc_frags",
- "mlx4_en_alloc_frag",
- "mlx4_en_init_allocator",
- "mlx4_en_free_frag",
- "mlx4_en_free_rx_desc",
- "mlx4_en_destroy_allocator",
- "mlx4_en_complete_rx_desc",
- /* vnic skb functions */
- "free_single_frag",
- "vnic_alloc_rx_skb",
- "vnic_rx_skb",
- "vnic_alloc_frag",
- "vnic_empty_rx_entry",
- "vnic_init_allocator",
- "vnic_destroy_allocator",
- "sdp_post_recv",
- "sdp_rx_ring_purge",
- "sdp_post_srcavail",
- "sk_stream_alloc_page",
- "update_send_head",
- "sdp_bcopy_get",
- "sdp_destroy_resources",
-
- /* function that allocate memory for RDMA device context */
- "ib_alloc_device"
- };
- size_t str_str_arr_size = sizeof(str_str_arr)/sizeof(char *);
- size_t str_str_excep_size = sizeof(str_str_excep_arr)/sizeof(char *);
- size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *);
-
- int i, j;
-
- for (i = 0; i < str_str_arr_size; ++i)
- if (strstr(func_name, str_str_arr[i])) {
- for (j = 0; j < str_str_excep_size; ++j)
- if (!strcmp(func_name, str_str_excep_arr[j]))
- return 0;
- return 1;
- }
- for (i = 0; i < str_cmp_arr_size; ++i)
- if (!strcmp(func_name, str_cmp_arr[i]))
- return 1;
- return 0;
-}
-EXPORT_SYMBOL(is_non_trackable_alloc_func);
-
-/*
- * In some cases we need to free memory
- * that we defined as "non trackable" (see
- * is_non_trackable_alloc_func).
- * This function recognizes such releases
- * by the name of their calling function.
- */
-int is_non_trackable_free_func(const char *func_name)
-{
-
- static const char * const str_cmp_arr[] = {
- /* function that deallocate memory for RDMA device context */
- "ib_dealloc_device"
- };
- size_t str_cmp_arr_size = sizeof(str_cmp_arr)/sizeof(char *);
-
- int i;
-
- for (i = 0; i < str_cmp_arr_size; ++i)
- if (!strcmp(func_name, str_cmp_arr[i]))
- return 1;
- return 0;
-}
-EXPORT_SYMBOL(is_non_trackable_free_func);
-
-
-/* Workaround - check whether the calling function is
-   '__ib_umem_release' or 'ib_umem_get'.
-   In that case we do not track the memory there,
-   because the kernel is the one that allocated it.
-   Return value:
-   1 - if the function name matches, else 0 */
-int is_umem_put_page(const char *func_name)
-{
- const char func_str[18] = "__ib_umem_release";
- /* In case of error flow put_page is called as part of ib_umem_get */
- const char func_str1[12] = "ib_umem_get";
-
- return ((strstr(func_name, func_str) != NULL) ||
- (strstr(func_name, func_str1) != NULL)) ? 1 : 0;
-}
-EXPORT_SYMBOL(is_umem_put_page);
-
-/* Check page order size
-   When freeing a page allocation, verify that the size
-   being freed matches the size that was allocated */
-int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr,
- unsigned long size, const char *filename,
- const unsigned long line_num)
-{
- unsigned long hash_val;
- struct memtrack_meminfo_t *cur_mem_info_p;
- struct tracked_obj_desc_t *obj_desc_p;
- unsigned long flags;
- int ret = 0;
-
- if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
- return 1;
- }
-
- if (!tracked_objs_arr[memtype]) {
- /* object is not tracked */
- return 1;
- }
- obj_desc_p = tracked_objs_arr[memtype];
-
- hash_val = addr % MEMTRACK_HASH_SZ;
-
- memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
- /* find mem_info of given memory location */
- cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
- while (cur_mem_info_p != NULL) {
- if (cur_mem_info_p->addr == addr) {
- /* Found given address in the database - check size */
- if (cur_mem_info_p->size != size) {
- printk(KERN_ERR "mtl size inconsistency: %s: %s::%lu: try to %s at address=0x%lX with size %lu while was created with size %lu\n",
- __func__, filename, line_num, memtype_free_str(memtype),
- addr, size, cur_mem_info_p->size);
- snprintf(cur_mem_info_p->ext_info, sizeof(cur_mem_info_p->ext_info),
- "invalid free size %lu\n", size);
- ret = 1;
- }
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- return ret;
- }
- cur_mem_info_p = cur_mem_info_p->next;
- }
-
-	/* not found - this function gives no indication in that case;
-	   it only checks the size/order.
-	   The 'free' function reports such an inconsistency */
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- return 1;
-}
-EXPORT_SYMBOL(memtrack_check_size);
-
-/* Search the current database for a specific addr.
-   An error msg is printed if the result is unexpected.
-   Return value: 0 - if addr exists, else 1 */
-int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist,
- const char *filename, const unsigned long line_num)
-{
- unsigned long hash_val;
- struct memtrack_meminfo_t *cur_mem_info_p;
- struct tracked_obj_desc_t *obj_desc_p;
- unsigned long flags;
-
- if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk(KERN_ERR "%s: Invalid memory type (%d)\n", __func__, memtype);
- return 1;
- }
-
- if (!tracked_objs_arr[memtype]) {
- /* object is not tracked */
- return 0;
- }
- obj_desc_p = tracked_objs_arr[memtype];
-
- hash_val = addr % MEMTRACK_HASH_SZ;
-
- memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
- /* find mem_info of given memory location */
- cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
- while (cur_mem_info_p != NULL) {
- if (cur_mem_info_p->addr == addr) {
- /* Found given address in the database - exiting */
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- return 0;
- }
- cur_mem_info_p = cur_mem_info_p->next;
- }
-
- /* not found */
- if (expect_exist)
- printk(KERN_ERR "mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n",
- __func__, filename, line_num, memtype_free_str(memtype), addr);
-
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- return 1;
-}
-EXPORT_SYMBOL(memtrack_is_new_addr);
-
-/* Return current page reference counter */
-int memtrack_get_page_ref_count(unsigned long addr)
-{
- unsigned long hash_val;
- struct memtrack_meminfo_t *cur_mem_info_p;
- struct tracked_obj_desc_t *obj_desc_p;
- unsigned long flags;
- /* This function is called only for page allocation */
- enum memtrack_memtype_t memtype = MEMTRACK_PAGE_ALLOC;
-	int ref_count = 0;
-
- if (!tracked_objs_arr[memtype]) {
- /* object is not tracked */
-		return ref_count;
- }
- obj_desc_p = tracked_objs_arr[memtype];
-
- hash_val = addr % MEMTRACK_HASH_SZ;
-
- memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
- /* find mem_info of given memory location */
- cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
- while (cur_mem_info_p != NULL) {
- if (cur_mem_info_p->addr == addr) {
- /* Found given address in the database - check ref-count */
- struct page *page = (struct page *)(cur_mem_info_p->addr);
-			ref_count = atomic_read(&page->_count);
-			memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
-			return ref_count;
- }
- cur_mem_info_p = cur_mem_info_p->next;
- }
-
- /* not found */
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
-	return ref_count;
-}
-EXPORT_SYMBOL(memtrack_get_page_ref_count);
-
-/* Report current allocations status (for all memory types) */
-static void memtrack_report(void)
-{
- enum memtrack_memtype_t memtype;
- unsigned long cur_bucket;
- struct memtrack_meminfo_t *cur_mem_info_p;
- int serial = 1;
- struct tracked_obj_desc_t *obj_desc_p;
- unsigned long flags;
- unsigned long detected_leaks = 0;
-
- printk(KERN_INFO "%s: Currently known allocations:\n", __func__);
- for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) {
- if (tracked_objs_arr[memtype]) {
- printk(KERN_INFO "%d) %s:\n", serial, memtype_alloc_str(memtype));
- obj_desc_p = tracked_objs_arr[memtype];
- /* Scan all buckets to find existing allocations */
- /* TBD: this may be optimized by holding a linked list of all hash items */
- for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) {
- memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */
- cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket];
- while (cur_mem_info_p != NULL) { /* scan bucket */
- printk(KERN_INFO "%s::%lu: %s(%lu)==%lX dev=%lX %s\n",
- cur_mem_info_p->filename,
- cur_mem_info_p->line_num,
- memtype_alloc_str(memtype),
- cur_mem_info_p->size,
- cur_mem_info_p->addr,
- cur_mem_info_p->dev,
- cur_mem_info_p->ext_info);
- cur_mem_info_p = cur_mem_info_p->next;
- ++ detected_leaks;
- } /* while cur_mem_info_p */
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- } /* for cur_bucket */
- serial++;
- }
- } /* for memtype */
- printk(KERN_INFO "%s: Summary: %lu leak(s) detected\n", __func__, detected_leaks);
-}
-
-
-
-static struct proc_dir_entry *memtrack_tree;
-
-static enum memtrack_memtype_t get_rsc_by_name(const char *name)
-{
- enum memtrack_memtype_t i;
-
- for (i = 0; i < MEMTRACK_NUM_OF_MEMTYPES; ++i) {
- if (strcmp(name, rsc_names[i]) == 0)
- return i;
- }
-
- return i;
-}
-
-
-static ssize_t memtrack_read(struct file *filp,
- char __user *buf,
- size_t size,
- loff_t *offset)
-{
- unsigned long cur, flags;
- loff_t pos = *offset;
- static char kbuf[20];
- static int file_len;
- int _read, to_ret, left;
- const char *fname;
- enum memtrack_memtype_t memtype;
-
- if (pos < 0)
- return -EINVAL;
-
- fname = filp->f_dentry->d_name.name;
-
- memtype = get_rsc_by_name(fname);
- if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
- printk(KERN_ERR "invalid file name\n");
- return -EINVAL;
- }
-
- if (pos == 0) {
- memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags);
- cur = tracked_objs_arr[memtype]->count;
- memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags);
- _read = sprintf(kbuf, "%lu\n", cur);
- if (_read < 0)
- return _read;
- else
- file_len = _read;
- }
-
- left = file_len - pos;
- to_ret = (left < size) ? left : size;
- if (copy_to_user(buf, kbuf+pos, to_ret))
- return -EFAULT;
- else {
- *offset = pos + to_ret;
- return to_ret;
- }
-}
-
-static const struct file_operations memtrack_proc_fops = {
- .read = memtrack_read,
-};
-
-static const char *memtrack_proc_entry_name = "mt_memtrack";
-
-static int create_procfs_tree(void)
-{
- struct proc_dir_entry *dir_ent;
- struct proc_dir_entry *proc_ent;
- int i, j;
- unsigned long bit_mask;
-
- dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL);
- if (!dir_ent)
- return -1;
-
- memtrack_tree = dir_ent;
-
- for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) {
- if (bit_mask & track_mask) {
- proc_ent = create_proc_entry(rsc_names[i], S_IRUGO, memtrack_tree);
- if (!proc_ent)
- goto undo_create_root;
-
- proc_ent->proc_fops = &memtrack_proc_fops;
- }
- }
-
- goto exit_ok;
-
-undo_create_root:
- for (j = 0, bit_mask = 1; j < i; ++j, bit_mask <<= 1) {
- if (bit_mask & track_mask)
- remove_proc_entry(rsc_names[j], memtrack_tree);
- }
- remove_proc_entry(memtrack_proc_entry_name, NULL);
- return -1;
-
-exit_ok:
- return 0;
-}
-
-
-static void destroy_procfs_tree(void)
-{
- int i;
- unsigned long bit_mask;
-
- for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) {
- if (bit_mask & track_mask)
- remove_proc_entry(rsc_names[i], memtrack_tree);
-
- }
- remove_proc_entry(memtrack_proc_entry_name, NULL);
-}
-
-int memtrack_inject_error(void)
-{
- int val = 0;
-
- if (inject_freq) {
- if (!(random32() % inject_freq))
- val = 1;
- }
-
- return val;
-}
-EXPORT_SYMBOL(memtrack_inject_error);
-
-int memtrack_randomize_mem(void)
-{
- return random_mem;
-}
-EXPORT_SYMBOL(memtrack_randomize_mem);
-
-/* module entry points */
-
-int init_module(void)
-{
- enum memtrack_memtype_t i;
- int j;
- unsigned long bit_mask;
-
-
-	/* create a cache for the memtrack_meminfo_t structures */
- meminfo_cache = kmem_cache_create("memtrack_meminfo_t",
- sizeof(struct memtrack_meminfo_t), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!meminfo_cache) {
- printk(KERN_ERR "memtrack::%s: failed to allocate meminfo cache\n", __func__);
- return -1;
- }
-
- /* initialize array of descriptors */
- memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr));
-
- /* create a tracking object descriptor for all required objects */
- for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) {
- if (bit_mask & track_mask) {
- tracked_objs_arr[i] = vmalloc(sizeof(struct tracked_obj_desc_t));
- if (!tracked_objs_arr[i]) {
- printk(KERN_ERR "memtrack: failed to allocate tracking object\n");
- goto undo_cache_create;
- }
-
- memset(tracked_objs_arr[i], 0, sizeof(struct tracked_obj_desc_t));
- spin_lock_init(&tracked_objs_arr[i]->hash_lock);
- INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head);
- if (bit_mask & strict_track_mask)
- tracked_objs_arr[i]->strict_track = 1;
- else
- tracked_objs_arr[i]->strict_track = 0;
- }
- }
-
-
- if (create_procfs_tree()) {
- printk(KERN_ERR "%s: create_procfs_tree() failed\n", __FILE__);
- goto undo_cache_create;
- }
-
- printk(KERN_INFO "memtrack::%s done.\n", __func__);
-
- return 0;
-
-undo_cache_create:
- for (j = 0; j < i; ++j) {
- if (tracked_objs_arr[j])
- vfree(tracked_objs_arr[j]);
- }
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19)
- if (kmem_cache_destroy(meminfo_cache) != 0)
- printk(KERN_ERR "Failed on kmem_cache_destroy!\n");
-#else
- kmem_cache_destroy(meminfo_cache);
-#endif
- return -1;
-}
-
-
-void cleanup_module(void)
-{
- enum memtrack_memtype_t memtype;
- unsigned long cur_bucket;
- struct memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p;
- struct tracked_obj_desc_t *obj_desc_p;
- unsigned long flags;
-
-
- memtrack_report();
-
-
- destroy_procfs_tree();
-
- /* clean up any hash table left-overs */
- for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) {
- /* Scan all buckets to find existing allocations */
- /* TBD: this may be optimized by holding a linked list of all hash items */
- if (tracked_objs_arr[memtype]) {
- obj_desc_p = tracked_objs_arr[memtype];
- for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ; cur_bucket++) {
- memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */
- cur_mem_info_p = obj_desc_p->mem_hash[cur_bucket];
- while (cur_mem_info_p != NULL) { /* scan bucket */
- next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */
- kmem_cache_free(meminfo_cache, cur_mem_info_p);
- cur_mem_info_p = next_mem_info_p;
- } /* while cur_mem_info_p */
- memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
- } /* for cur_bucket */
- vfree(obj_desc_p);
- }
- } /* for memtype */
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19)
- if (kmem_cache_destroy(meminfo_cache) != 0)
- printk(KERN_ERR "memtrack::cleanup_module: Failed on kmem_cache_destroy!\n");
-#else
- kmem_cache_destroy(meminfo_cache);
-#endif
- printk(KERN_INFO "memtrack::cleanup_module done.\n");
-}
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.h b/sys/ofed/drivers/infiniband/debug/memtrack.h
deleted file mode 100644
index fb52f4b32eae2..0000000000000
--- a/sys/ofed/drivers/infiniband/debug/memtrack.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- This software is available to you under a choice of one of two
- licenses. You may choose to be licensed under the terms of the GNU
- General Public License (GPL) Version 2, available at
- <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
- license, available in the LICENSE.TXT file accompanying this
- software. These details are also available at
- <http://openib.org/license.html>.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
-*/
-
-#ifndef H_MEMTRACK_H
-#define H_MEMTRACK_H
-
-enum memtrack_memtype_t {
- MEMTRACK_KMALLOC,
- MEMTRACK_VMALLOC,
- MEMTRACK_KMEM_OBJ,
- MEMTRACK_IOREMAP, /* IO-RE/UN-MAP */
- MEMTRACK_WORK_QUEUE, /* Handle work-queue create & destroy */
- MEMTRACK_PAGE_ALLOC, /* Handle page allocation and free */
- MEMTRACK_DMA_MAP_SINGLE,/* Handle ib_dma_single map and unmap */
- MEMTRACK_DMA_MAP_PAGE, /* Handle ib_dma_page map and unmap */
- MEMTRACK_DMA_MAP_SG, /* Handle ib_dma_sg map and unmap with and without attributes */
- MEMTRACK_NUM_OF_MEMTYPES
-};
-
-/* Invoke on memory allocation */
-void memtrack_alloc(enum memtrack_memtype_t memtype, unsigned long dev,
- unsigned long addr, unsigned long size, unsigned long addr2,
- int direction, const char *filename,
- const unsigned long line_num, int alloc_flags);
-
-/* Invoke on memory free */
-void memtrack_free(enum memtrack_memtype_t memtype, unsigned long dev,
- unsigned long addr, unsigned long size, int direction,
- const char *filename, const unsigned long line_num);
-
-/*
- * This function recognizes allocations which
- * may be released by the kernel (e.g. skb & vnic) and
- * are therefore not trackable by memtrack.
- * The allocations are recognized by the name
- * of their calling function.
- */
-int is_non_trackable_alloc_func(const char *func_name);
-/*
- * In some cases we need to free memory
- * that we defined as "non trackable" (see
- * is_non_trackable_alloc_func).
- * This function recognizes such releases
- * by the name of their calling function.
- */
-int is_non_trackable_free_func(const char *func_name);
-
-/* Workaround - check whether the calling function is
-   '__ib_umem_release' or 'ib_umem_get'.
-   In that case we do not track the memory there,
-   because the kernel is the one that allocated it.
-   Return value:
-   1 - if the function name matches, else 0 */
-int is_umem_put_page(const char *func_name);
-
-/* Check page order size
-   When freeing a page allocation, verify that the number
-   of pages being freed matches the number that was
-   allocated (in log2(order)).
-   If an error is found, an error msg is printed */
-int memtrack_check_size(enum memtrack_memtype_t memtype, unsigned long addr,
- unsigned long size, const char *filename,
- const unsigned long line_num);
-
-/* Search the current database for a specific addr.
-   If it is not found, an error msg is printed.
-   Return value: 0 - if addr exists, else 1 */
-int memtrack_is_new_addr(enum memtrack_memtype_t memtype, unsigned long addr, int expect_exist,
- const char *filename, const unsigned long line_num);
-
-/* Return current page reference counter */
-int memtrack_get_page_ref_count(unsigned long addr);
-
-/* Report current allocations status (for all memory types) */
-/* we do not export this function since it is used by cleanup_module only */
-/* void memtrack_report(void); */
-
-/* Allow support of error injections */
-int memtrack_inject_error(void);
-
-/* randomize allocated memory */
-int memtrack_randomize_mem(void);
-
-#endif
diff --git a/sys/ofed/drivers/infiniband/debug/mtrack.h b/sys/ofed/drivers/infiniband/debug/mtrack.h
deleted file mode 100644
index 5c0cd20110be3..0000000000000
--- a/sys/ofed/drivers/infiniband/debug/mtrack.h
+++ /dev/null
@@ -1,844 +0,0 @@
-#ifndef __mtrack_h_
-#define __mtrack_h_
-
-#include "memtrack.h"
-
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/kernel.h>
-#include <linux/io.h> /* For ioremap_nocache, ioremap, iounmap */
-#include <linux/random.h>
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 27)
-# include <linux/io-mapping.h> /* For ioremap_nocache, ioremap, iounmap */
-#endif
-#include <linux/mm.h> /* For all page handling */
-#include <linux/workqueue.h> /* For all work-queue handling */
-#include <linux/scatterlist.h> /* For using scatterlists */
-#include <linux/skbuff.h> /* For skbufs handling */
-#include <asm/uaccess.h> /* For copy from/to user */
-
-#define MEMTRACK_ERROR_INJECTION_MESSAGE(file, line, func) ({ \
- printk(KERN_ERR "%s failure injected at %s:%d\n", func, file, line); \
- dump_stack(); \
-})
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 14)
-#define RDMA_KZALLOC_H
-#define kzalloc(size, flags) ({ \
- void *__memtrack_kz_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\
- else \
- __memtrack_kz_addr = kmalloc(size, flags); \
- if (__memtrack_kz_addr && !is_non_trackable_alloc_func(__func__)) { \
- memset(__memtrack_kz_addr, 0, size); \
- } \
- __memtrack_kz_addr; \
-})
-
-#else
-#define kzalloc(size, flags) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc");\
- else \
- __memtrack_addr = kzalloc(size, flags); \
- if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \
- } \
- __memtrack_addr; \
-})
-
-#endif
-
-#define kzalloc_node(size, flags, node) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kzalloc_node"); \
- else \
- __memtrack_addr = kzalloc_node(size, flags, node); \
- if (__memtrack_addr && (size) && \
- !is_non_trackable_alloc_func(__func__)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, flags); \
- } \
- __memtrack_addr; \
-})
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19)
-#define kcalloc(n, size, flags) kzalloc((n)*(size), flags)
-#else
-#define kcalloc(n, size, flags) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kcalloc");\
- else \
- __memtrack_addr = kcalloc(n, size, flags); \
- if (__memtrack_addr && (size)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), (n)*(size), 0UL, 0, __FILE__, __LINE__, flags); \
- } \
- __memtrack_addr; \
-})
-#endif
-
-
-
-#ifdef ZERO_OR_NULL_PTR
-#define kmalloc(sz, flgs) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\
- else \
- __memtrack_addr = kmalloc(sz, flgs); \
- if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
- if (memtrack_randomize_mem()) \
- get_random_bytes(__memtrack_addr, sz); \
- } \
- __memtrack_addr; \
-})
-#else
-#define kmalloc(sz, flgs) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc");\
- else \
- __memtrack_addr = kmalloc(sz, flgs); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
- if (memtrack_randomize_mem()) \
- get_random_bytes(__memtrack_addr, sz); \
- } \
- __memtrack_addr; \
-})
-
-#endif
-
-#define kmalloc_node(sz, flgs, node) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmalloc_node"); \
- else \
- __memtrack_addr = kmalloc_node(sz, flgs, node); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
- if (memtrack_randomize_mem() && ((flgs) == GFP_KERNEL)) \
- get_random_bytes(__memtrack_addr, sz); \
- } \
- __memtrack_addr; \
-})
-
-#ifdef ZERO_OR_NULL_PTR
-#define kmemdup(src, sz, flgs) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\
- else \
- __memtrack_addr = kmemdup(src, sz, flgs); \
- if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
- } \
- __memtrack_addr; \
-})
-#else
-#define kmemdup(src, sz, flgs) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmemdup");\
- else \
- __memtrack_addr = kmemdup(src, sz, flgs); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), sz, 0UL, 0, __FILE__, __LINE__, flgs); \
- } \
- __memtrack_addr; \
-})
-#endif
-
-#ifdef ZERO_OR_NULL_PTR
-#define kfree(addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \
- !is_non_trackable_free_func(__func__)) { \
- memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- kfree(__memtrack_addr); \
-})
-#else
-#define kfree(addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \
- memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- kfree(__memtrack_addr); \
-})
-#endif
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0) || defined (CONFIG_COMPAT_RCU)
-#ifdef kfree_rcu
- #undef kfree_rcu
-#endif
-
-#ifdef ZERO_OR_NULL_PTR
-#define kfree_rcu(addr, rcu_head) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (!ZERO_OR_NULL_PTR(__memtrack_addr) && \
- !is_non_trackable_free_func(__func__)) { \
- memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \
-})
-#else
-#define kfree_rcu(addr, rcu_head) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr && !is_non_trackable_free_func(__func__)) { \
- memtrack_free(MEMTRACK_KMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- __kfree_rcu(&((addr)->rcu_head), offsetof(typeof(*(addr)), rcu_head)); \
-})
-#endif
-#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3, 0, 0) */
-
-#define vmalloc(size) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc");\
- else \
- __memtrack_addr = vmalloc(size); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- if (memtrack_randomize_mem()) \
- get_random_bytes(__memtrack_addr, size); \
- } \
- __memtrack_addr; \
-})
-
-#ifndef vzalloc
-#define vzalloc(size) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc");\
- else \
- __memtrack_addr = vzalloc(size); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- __memtrack_addr; \
-})
-#endif
-
-#ifndef vzalloc_node
-#define vzalloc_node(size, node) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vzalloc_node"); \
- else \
- __memtrack_addr = vzalloc_node(size, node); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- if (memtrack_randomize_mem()) \
- get_random_bytes(__memtrack_addr, size); \
- } \
- __memtrack_addr; \
-})
-#endif
-
-#define vmalloc_node(size, node) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "vmalloc_node"); \
- else \
- __memtrack_addr = vmalloc_node(size, node); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- if (memtrack_randomize_mem()) \
- get_random_bytes(__memtrack_addr, size); \
- } \
- __memtrack_addr; \
-})
-
-#define vfree(addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- if (__memtrack_addr) { \
- memtrack_free(MEMTRACK_VMALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- vfree(__memtrack_addr); \
-})
-
-
-#define kmem_cache_alloc(cache, flags) ({ \
- void *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "kmem_cache_alloc"); \
- else \
- __memtrack_addr = kmem_cache_alloc(cache, flags); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 1, 0UL, 0, __FILE__, __LINE__, flags); \
- } \
- __memtrack_addr; \
-})
-
-
-#define kmem_cache_free(cache, addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr) { \
- memtrack_free(MEMTRACK_KMEM_OBJ, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- kmem_cache_free(cache, __memtrack_addr); \
-})
-
-
-/* All IO-MAP handling */
-#define ioremap(phys_addr, size) ({ \
- void __iomem *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap");\
- else \
- __memtrack_addr = ioremap(phys_addr, size); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- __memtrack_addr; \
-})
-
-#define io_mapping_create_wc(base, size) ({ \
- void __iomem *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "io_mapping_create_wc"); \
- else \
- __memtrack_addr = io_mapping_create_wc(base, size); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- __memtrack_addr; \
-})
-
-#define io_mapping_free(addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr) { \
- memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- io_mapping_free(__memtrack_addr); \
-})
-
-#ifdef CONFIG_PPC
-#ifdef ioremap_nocache
- #undef ioremap_nocache
-#endif
-#define ioremap_nocache(phys_addr, size) ({ \
- void __iomem *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \
- else \
- __memtrack_addr = ioremap(phys_addr, size); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- __memtrack_addr; \
-})
-#else
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 18) /* 2.6.16 - 2.6.17 */
-#ifdef ioremap_nocache
- #undef ioremap_nocache
-#endif
-#define ioremap_nocache(phys_addr, size) ({ \
- void __iomem *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \
- else \
- __memtrack_addr = ioremap(phys_addr, size); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- __memtrack_addr; \
-})
-#else
-#define ioremap_nocache(phys_addr, size) ({ \
- void __iomem *__memtrack_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "ioremap_nocache"); \
- else \
- __memtrack_addr = ioremap_nocache(phys_addr, size); \
- if (__memtrack_addr) { \
- memtrack_alloc(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), size, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- __memtrack_addr; \
-})
-#endif /* Kernel version is under 2.6.18 */
-#endif /* PPC */
-
-#define iounmap(addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr) { \
- memtrack_free(MEMTRACK_IOREMAP, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- iounmap(__memtrack_addr); \
-})
-
-
-/* All Page handlers */
-/* TODO: Catch netif_rx for page dereference */
-#define alloc_pages_node(nid, gfp_mask, order) ({ \
- struct page *page_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages_node"); \
- else \
- page_addr = (struct page *)alloc_pages_node(nid, gfp_mask, order); \
- if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
- memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- page_addr; \
-})
-
-#ifdef CONFIG_NUMA
-#define alloc_pages(gfp_mask, order) ({ \
- struct page *page_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_pages"); \
- else \
- page_addr = (struct page *)alloc_pages(gfp_mask, order); \
- if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
- memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- page_addr; \
-})
-#else
-#ifdef alloc_pages
- #undef alloc_pages
-#endif
-#define alloc_pages(gfp_mask, order) ({ \
- struct page *page_addr; \
- \
- page_addr = (struct page *)alloc_pages_node(numa_node_id(), gfp_mask, order); \
- page_addr; \
-})
-#endif
-
-#define __get_free_pages(gfp_mask, order) ({ \
- struct page *page_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "__get_free_pages"); \
- else \
- page_addr = (struct page *)__get_free_pages(gfp_mask, order); \
- if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
- memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), (unsigned long)(order), 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- page_addr; \
-})
-
-#define get_zeroed_page(gfp_mask) ({ \
- struct page *page_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_zeroed_page"); \
- else \
- page_addr = (struct page *)get_zeroed_page(gfp_mask); \
- if (page_addr && !is_non_trackable_alloc_func(__func__)) { \
- memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(page_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- (unsigned long)page_addr; \
-})
-
-#define __free_pages(addr, order) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
- if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \
- memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- __free_pages(addr, order); \
-})
-
-
-#define free_pages(addr, order) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
- if (!memtrack_check_size(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), (unsigned long)(order), __FILE__, __LINE__)) \
- memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- free_pages(addr, order); \
-})
-
-
-#define get_page(addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
- if (memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 0, __FILE__, __LINE__)) { \
- memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- } \
- get_page(addr); \
-})
-
-#define get_user_pages_fast(start, nr_pages, write, pages) ({ \
- int __memtrack_rc = -1; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "get_user_pages_fast"); \
- else \
- __memtrack_rc = get_user_pages_fast(start, nr_pages, write, pages); \
- if (__memtrack_rc > 0 && !is_non_trackable_alloc_func(__func__)) { \
- int __memtrack_i; \
- \
- for (__memtrack_i = 0; __memtrack_i < __memtrack_rc; __memtrack_i++) \
- memtrack_alloc(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(pages[__memtrack_i]), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- __memtrack_rc; \
-})
-
-#define put_page(addr) ({ \
- void *__memtrack_addr = (void *)addr; \
- \
- if (__memtrack_addr && !is_non_trackable_alloc_func(__func__)) { \
-		/* If this is not part of a umem put_page, is not a new */\
-		/* addr, and the ref-count is 1, then free this addr. */\
-		/* Don't change the order of these conditions */ \
- if (!is_umem_put_page(__func__) && \
- !memtrack_is_new_addr(MEMTRACK_PAGE_ALLOC, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__) && \
- (memtrack_get_page_ref_count((unsigned long)(__memtrack_addr)) == 1)) { \
- memtrack_free(MEMTRACK_PAGE_ALLOC, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- } \
- put_page(addr); \
-})
-
-
-/* Work-Queue handlers */
-#ifdef create_workqueue
- #undef create_workqueue
-#endif
-#ifdef create_rt_workqueue
- #undef create_rt_workqueue
-#endif
-#ifdef create_freezeable_workqueue
- #undef create_freezeable_workqueue
-#endif
-#ifdef create_singlethread_workqueue
- #undef create_singlethread_workqueue
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 20) /* 2.6.18 - 2.6.19 */
-#define create_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 0); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#define create_singlethread_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 1); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) /* 2.6.20 - 2.6.27 */
-#define create_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 0, 0); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 22) /* 2.6.20 - 2.6.21 */
-#define create_freezeable_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 0, 1); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-#else /* 2.6.22 - 2.6.27 */
-#define create_freezeable_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 1, 1); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-#endif /* 2.6.20 - 2.6.27 */
-
-#define create_singlethread_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 1, 0); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36) /* 2.6.28 - 2.6.35 */
-
-#ifdef alloc_workqueue
- #undef alloc_workqueue
-#endif
-
-#define alloc_workqueue(name, flags, max_active) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), (flags), (max_active), 0); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#define create_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 0, 0, 0); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#define create_rt_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_rt_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 0, 0, 1); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#define create_freezeable_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_freezeable_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 1, 1, 0); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-
-#define create_singlethread_workqueue(name) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "create_singlethread_workqueue"); \
- else \
- wq_addr = __create_workqueue((name), 1, 0, 0); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-#else /* 2.6.36 */
-#ifdef alloc_workqueue
- #undef alloc_workqueue
-#endif
-#ifdef CONFIG_LOCKDEP
-#define alloc_workqueue(name, flags, max_active) \
-({ \
- static struct lock_class_key __key; \
- const char *__lock_name; \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (__builtin_constant_p(name)) \
- __lock_name = (name); \
- else \
- __lock_name = #name; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \
- else \
- wq_addr = __alloc_workqueue_key((name), (flags), (max_active), \
- &__key, __lock_name); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-#else
-#define alloc_workqueue(name, flags, max_active) ({ \
- struct workqueue_struct *wq_addr = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_workqueue"); \
- else \
- wq_addr = __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL); \
- if (wq_addr) { \
- memtrack_alloc(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(wq_addr), 0, 0UL, 0, __FILE__, __LINE__, GFP_ATOMIC); \
- } \
- wq_addr; \
-})
-#endif
-
-#define create_workqueue(name) \
-	alloc_workqueue((name), WQ_RESCUER, 1)
-
-#define create_freezeable_workqueue(name) \
-	alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_RESCUER, 1)
-
-#define create_singlethread_workqueue(name) \
-	alloc_workqueue((name), WQ_UNBOUND | WQ_RESCUER, 1)
-
-#endif /* Work-Queue Kernel Versions */
-
-#define destroy_workqueue(wq_addr) ({ \
- void *__memtrack_addr = (void *)wq_addr; \
- \
- if (__memtrack_addr) { \
- memtrack_free(MEMTRACK_WORK_QUEUE, 0UL, (unsigned long)(__memtrack_addr), 0UL, 0, __FILE__, __LINE__); \
- } \
- destroy_workqueue(wq_addr); \
-})
-
-/* ONLY error injection to functions that we don't monitor */
-#define alloc_skb(size, prio) ({ \
- struct sk_buff *__memtrack_skb = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb"); \
- else \
- __memtrack_skb = alloc_skb(size, prio); \
- __memtrack_skb; \
-})
-
-#define dev_alloc_skb(size) ({ \
- struct sk_buff *__memtrack_skb = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "dev_alloc_skb"); \
- else \
- __memtrack_skb = dev_alloc_skb(size); \
- __memtrack_skb; \
-})
-
-#define alloc_skb_fclone(size, prio) ({ \
- struct sk_buff *__memtrack_skb = NULL; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "alloc_skb_fclone"); \
- else \
- __memtrack_skb = alloc_skb_fclone(size, prio); \
- __memtrack_skb; \
-})
-
-#define copy_from_user(to, from, n) ({ \
- int ret = n; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_from_user"); \
- else \
- ret = copy_from_user(to, from, n); \
- ret; \
-})
-
-#define copy_to_user(to, from, n) ({ \
- int ret = n; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "copy_to_user"); \
- else \
- ret = copy_to_user(to, from, n); \
- ret; \
-})
-
-#define sysfs_create_file(kobj, attr) ({ \
- int ret = -ENOSYS; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_file"); \
- else \
- ret = sysfs_create_file(kobj, attr); \
- ret; \
-})
-
-#define sysfs_create_link(kobj, target, name) ({ \
- int ret = -ENOSYS; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_link"); \
- else \
- ret = sysfs_create_link(kobj, target, name); \
- ret; \
-})
-
-#define sysfs_create_group(kobj, grp) ({ \
- int ret = -ENOSYS; \
- \
- if (memtrack_inject_error()) \
- MEMTRACK_ERROR_INJECTION_MESSAGE(__FILE__, __LINE__, "sysfs_create_group"); \
- else \
- ret = sysfs_create_group(kobj, grp); \
- ret; \
-})
-
-#endif /* __mtrack_h_ */
-
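The deleted wrappers above all share one pattern: a GCC statement expression that optionally injects an allocation failure, performs the real call, and records successful allocations for leak tracking. A minimal, self-contained sketch of that pattern, using hypothetical helper names (inject_error, record_alloc) rather than the memtrack API:

#include <stdio.h>
#include <stdlib.h>

static int inject_error(void) { return 0; }              /* stand-in test hook */
static void record_alloc(void *p, size_t n) { (void)p; (void)n; }

#define tracked_malloc(n) ({                                            \
	void *__p = NULL;                                               \
	if (inject_error())                                             \
		fprintf(stderr, "%s:%d: injected failure\n",            \
		    __FILE__, __LINE__);                                \
	else                                                            \
		__p = malloc(n);                                        \
	if (__p != NULL)                                                \
		record_alloc(__p, (n));                                 \
	__p;	/* value of the whole statement expression */           \
})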
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig b/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig
deleted file mode 100644
index 9d9a9dc51f181..0000000000000
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig
+++ /dev/null
@@ -1,50 +0,0 @@
-config INFINIBAND_IPOIB
- tristate "IP-over-InfiniBand"
- depends on NETDEVICES && INET && (IPV6 || IPV6=n)
- select INET_LRO
- ---help---
- Support for the IP-over-InfiniBand protocol (IPoIB). This
- transports IP packets over InfiniBand so you can use your IB
- device as a fancy NIC.
-
- See Documentation/infiniband/ipoib.txt for more information
-
-config INFINIBAND_IPOIB_CM
- bool "IP-over-InfiniBand Connected Mode support"
- depends on INFINIBAND_IPOIB
- default n
- ---help---
- This option enables support for IPoIB connected mode. After
- enabling this option, you need to switch to connected mode
- through /sys/class/net/ibXXX/mode to actually create
- connections, and then increase the interface MTU with
- e.g. ifconfig ib0 mtu 65520.
-
- WARNING: Enabling connected mode will trigger some packet
- drops for multicast and UD mode traffic from this interface,
- unless you limit mtu for these destinations to 2044.
-
-config INFINIBAND_IPOIB_DEBUG
- bool "IP-over-InfiniBand debugging" if EMBEDDED
- depends on INFINIBAND_IPOIB
- default y
- ---help---
- This option causes debugging code to be compiled into the
- IPoIB driver. The output can be turned on via the
- debug_level and mcast_debug_level module parameters (which
- can also be set after the driver is loaded through sysfs).
-
- This option also creates a directory tree under ipoib/ in
- debugfs, which contains files that expose debugging
- information about IB multicast groups used by the IPoIB
- driver.
-
-config INFINIBAND_IPOIB_DEBUG_DATA
- bool "IP-over-InfiniBand data path debugging"
- depends on INFINIBAND_IPOIB_DEBUG
- ---help---
- This option compiles debugging code into the data path
- of the IPoIB driver. The output can be turned on via the
- data_debug_level module parameter; however, even with output
- turned off, this debugging code will have some performance
- impact.
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
index ed2b91c23e43e..77c692af186f0 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -325,6 +325,7 @@ struct ipoib_dev_priv {
unsigned long flags;
int gone;
+ int unit;
struct mutex vlan_mutex;
@@ -349,7 +350,6 @@ struct ipoib_dev_priv {
u16 pkey;
u16 pkey_index;
struct ib_pd *pd;
- struct ib_mr *mr;
struct ib_cq *recv_cq;
struct ib_cq *send_cq;
struct ib_qp *qp;
@@ -368,7 +368,7 @@ struct ipoib_dev_priv {
unsigned tx_head;
unsigned tx_tail;
struct ib_sge tx_sge[IPOIB_MAX_TX_SG];
- struct ib_send_wr tx_wr;
+ struct ib_ud_wr tx_wr;
unsigned tx_outstanding;
struct ib_wc send_wc[MAX_SEND_CQE];
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 34817ead2b396..277dff53029e6 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -256,7 +256,7 @@ ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
eh = mtod(mb, struct ipoib_header *);
bzero(eh->hwaddr, 4); /* Zero the queue pair, only dgid is in grh */
- if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
+ if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
dev->if_input(dev, mb);
@@ -451,21 +451,20 @@ post_send(struct ipoib_dev_priv *priv, unsigned int wr_id,
priv->tx_sge[i].addr = mapping[i];
priv->tx_sge[i].length = m->m_len;
}
- priv->tx_wr.num_sge = i;
- priv->tx_wr.wr_id = wr_id;
- priv->tx_wr.wr.ud.remote_qpn = qpn;
- priv->tx_wr.wr.ud.ah = address;
-
+ priv->tx_wr.wr.num_sge = i;
+ priv->tx_wr.wr.wr_id = wr_id;
+ priv->tx_wr.remote_qpn = qpn;
+ priv->tx_wr.ah = address;
if (head) {
- priv->tx_wr.wr.ud.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */
- priv->tx_wr.wr.ud.header = head;
- priv->tx_wr.wr.ud.hlen = hlen;
- priv->tx_wr.opcode = IB_WR_LSO;
+ priv->tx_wr.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */
+ priv->tx_wr.header = head;
+ priv->tx_wr.hlen = hlen;
+ priv->tx_wr.wr.opcode = IB_WR_LSO;
} else
- priv->tx_wr.opcode = IB_WR_SEND;
+ priv->tx_wr.wr.opcode = IB_WR_SEND;
- return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+ return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
}
void
@@ -524,9 +523,9 @@ ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
}
if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
- priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+ priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
else
- priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+ priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
if (++priv->tx_outstanding == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
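The hunks above follow the ib_send_wr split in the updated verbs API: UD-specific fields now live in struct ib_ud_wr, which embeds the generic struct ib_send_wr as .wr, and ib_post_send() is handed the embedded member. A minimal sketch of that usage, assuming only the fields visible in this diff:

#include <rdma/ib_verbs.h>

static int post_one_ud_send(struct ib_qp *qp, struct ib_ah *ah,
    u32 remote_qpn, u32 remote_qkey, struct ib_sge *sge)
{
	struct ib_ud_wr wr = { };
	struct ib_send_wr *bad_wr;

	wr.wr.opcode     = IB_WR_SEND;
	wr.wr.send_flags = IB_SEND_SIGNALED;
	wr.wr.sg_list    = sge;
	wr.wr.num_sge    = 1;
	wr.remote_qpn    = remote_qpn;
	wr.remote_qkey   = remote_qkey;
	wr.ah            = ah;

	/* the generic post routine still takes the embedded ib_send_wr */
	return (ib_post_send(qp, &wr.wr, &bad_wr));
}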
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
index bca6b4fa1c5e0..fa7076955f6d2 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -84,7 +84,7 @@ struct workqueue_struct *ipoib_workqueue;
struct ib_sa_client ipoib_sa_client;
static void ipoib_add_one(struct ib_device *device);
-static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device, void *client_data);
static void ipoib_start(struct ifnet *dev);
static int ipoib_output(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, struct route *ro);
@@ -99,6 +99,31 @@ do { \
} \
} while (0)
+static struct unrhdr *ipoib_unrhdr;
+
+static void
+ipoib_unrhdr_init(void *arg)
+{
+
+ ipoib_unrhdr = new_unrhdr(0, 65535, NULL);
+}
+SYSINIT(ipoib_unrhdr_init, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_init, NULL);
+
+static void
+ipoib_unrhdr_uninit(void *arg)
+{
+
+ if (ipoib_unrhdr != NULL) {
+ struct unrhdr *hdr;
+
+ hdr = ipoib_unrhdr;
+ ipoib_unrhdr = NULL;
+
+ delete_unrhdr(hdr);
+ }
+}
+SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL);
+
/*
* This is for clients that have an ipoib_header in the mbuf.
*/
@@ -808,6 +833,7 @@ ipoib_detach(struct ipoib_dev_priv *priv)
bpfdetach(dev);
if_detach(dev);
if_free(dev);
+ free_unr(ipoib_unrhdr, priv->unit);
} else
VLAN_SETCOOKIE(priv->dev, NULL);
@@ -834,8 +860,6 @@ ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
priv->tx_ring = NULL;
}
-static volatile int ipoib_unit;
-
static struct ipoib_dev_priv *
ipoib_priv_alloc(void)
{
@@ -876,7 +900,13 @@ ipoib_intf_alloc(const char *name)
return NULL;
}
dev->if_softc = priv;
- if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1));
+ priv->unit = alloc_unr(ipoib_unrhdr);
+ if (priv->unit == -1) {
+ if_free(dev);
+ free(priv, M_TEMP);
+ return NULL;
+ }
+ if_initname(dev, name, priv->unit);
dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
dev->if_addrlen = INFINIBAND_ALEN;
dev->if_hdrlen = IPOIB_HEADER_LEN;
@@ -903,27 +933,10 @@ ipoib_intf_alloc(const char *name)
int
ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
- struct ib_device_attr *device_attr;
- int result = -ENOMEM;
-
- device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
- if (!device_attr) {
- printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
- hca->name, sizeof *device_attr);
- return result;
- }
+ struct ib_device_attr *device_attr = &hca->attrs;
- result = ib_query_device(hca, device_attr);
- if (result) {
- printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
- hca->name, result);
- kfree(device_attr);
- return result;
- }
priv->hca_caps = device_attr->device_cap_flags;
- kfree(device_attr);
-
priv->dev->if_hwassist = 0;
priv->dev->if_capabilities = 0;
@@ -991,7 +1004,7 @@ ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
priv->broadcastaddr[8] = priv->pkey >> 8;
priv->broadcastaddr[9] = priv->pkey & 0xff;
- result = ib_query_gid(hca, port, 0, &priv->local_gid);
+ result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
if (result) {
printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
hca->name, port, result);
@@ -1070,15 +1083,16 @@ ipoib_add_one(struct ib_device *device)
}
static void
-ipoib_remove_one(struct ib_device *device)
+ipoib_remove_one(struct ib_device *device, void *client_data)
{
struct ipoib_dev_priv *priv, *tmp;
- struct list_head *dev_list;
+ struct list_head *dev_list = client_data;
- if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ if (!dev_list)
return;
- dev_list = ib_get_client_data(device, &ipoib_client);
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
list_for_each_entry_safe(priv, tmp, dev_list, list) {
if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
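The interface unit numbers above now come from FreeBSD's unr(9) allocator instead of a monotonically increasing counter, so units are reused after ipoib_detach(). A small sketch of that lifecycle, with hypothetical names outside the ipoib code:

#include <sys/param.h>
#include <sys/systm.h>

static struct unrhdr *example_units;

static void
example_units_init(void)
{
	/* units 0..65535; a NULL mutex selects the allocator's internal lock */
	example_units = new_unrhdr(0, 65535, NULL);
}

static int
example_unit_alloc(void)
{
	return (alloc_unr(example_units));	/* -1 once the range is exhausted */
}

static void
example_unit_free(int unit)
{
	free_unr(example_units, unit);
}

static void
example_units_uninit(void)
{
	delete_unrhdr(example_units);
}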
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 4c04da1533d00..4af5c91e7818d 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -167,7 +167,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
}
priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
spin_unlock_irq(&priv->lock);
- priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+ priv->tx_wr.remote_qkey = priv->qkey;
set_qkey = 1;
}
@@ -480,7 +480,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
return;
}
- if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+ if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
ipoib_warn(priv, "ib_query_gid() failed\n");
else
memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 510a96317eee1..e49da8ece1bb9 100644
--- a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -137,23 +137,18 @@ int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca)
.sq_sig_type = IB_SIGNAL_ALL_WR,
.qp_type = IB_QPT_UD
};
+ struct ib_cq_init_attr cq_attr = {};
int ret, size;
int i;
/* XXX struct ethtool_coalesce *coal; */
- priv->pd = ib_alloc_pd(priv->ca);
+ priv->pd = ib_alloc_pd(priv->ca, 0);
if (IS_ERR(priv->pd)) {
printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
return -ENODEV;
}
- priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(priv->mr)) {
- printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
- goto out_free_pd;
- }
-
size = ipoib_recvq_size + 1;
ret = ipoib_cm_dev_init(priv);
if (!ret) {
@@ -164,14 +159,16 @@ int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca)
size += ipoib_recvq_size * ipoib_max_conn_qp;
}
- priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, size, 0);
+ cq_attr.cqe = size;
+ priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, &cq_attr);
if (IS_ERR(priv->recv_cq)) {
printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
goto out_free_mr;
}
+ cq_attr.cqe = ipoib_sendq_size;
priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
- priv, ipoib_sendq_size, 0);
+ priv, &cq_attr);
if (IS_ERR(priv->send_cq)) {
printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
goto out_free_recv_cq;
@@ -215,14 +212,14 @@ int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca)
IF_LLADDR(priv->dev)[3] = (priv->qp->qp_num ) & 0xff;
for (i = 0; i < IPOIB_MAX_TX_SG; ++i)
- priv->tx_sge[i].lkey = priv->mr->lkey;
+ priv->tx_sge[i].lkey = priv->pd->local_dma_lkey;
- priv->tx_wr.opcode = IB_WR_SEND;
- priv->tx_wr.sg_list = priv->tx_sge;
- priv->tx_wr.send_flags = IB_SEND_SIGNALED;
+ priv->tx_wr.wr.opcode = IB_WR_SEND;
+ priv->tx_wr.wr.sg_list = priv->tx_sge;
+ priv->tx_wr.wr.send_flags = IB_SEND_SIGNALED;
for (i = 0; i < IPOIB_UD_RX_SG; ++i)
- priv->rx_sge[i].lkey = priv->mr->lkey;
+ priv->rx_sge[i].lkey = priv->pd->local_dma_lkey;
priv->rx_wr.next = NULL;
priv->rx_wr.sg_list = priv->rx_sge;
@@ -235,10 +232,8 @@ out_free_recv_cq:
ib_destroy_cq(priv->recv_cq);
out_free_mr:
- ib_dereg_mr(priv->mr);
ipoib_cm_dev_cleanup(priv);
-out_free_pd:
ib_dealloc_pd(priv->pd);
return -ENODEV;
}
@@ -262,11 +257,7 @@ void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv)
ipoib_cm_dev_cleanup(priv);
- if (ib_dereg_mr(priv->mr))
- ipoib_warn(priv, "ib_dereg_mr failed\n");
-
- if (ib_dealloc_pd(priv->pd))
- ipoib_warn(priv, "ib_dealloc_pd failed\n");
+ ib_dealloc_pd(priv->pd);
}
void ipoib_event(struct ib_event_handler *handler,
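The transport init above reflects three verbs API changes: ib_alloc_pd() takes a flags argument, CQ sizing moves into struct ib_cq_init_attr, and the per-PD local_dma_lkey replaces a dedicated DMA MR. A condensed sketch of the same sequence, assuming kernel context and the error codes used in the patch:

#include <linux/err.h>
#include <rdma/ib_verbs.h>

static int setup_pd_and_cq(struct ib_device *ca, ib_comp_handler comp,
    void *ctx, int cqe, struct ib_pd **pd, struct ib_cq **cq, u32 *lkey)
{
	struct ib_cq_init_attr cq_attr = { };

	*pd = ib_alloc_pd(ca, 0);
	if (IS_ERR(*pd))
		return (-ENODEV);

	cq_attr.cqe = cqe;
	*cq = ib_create_cq(ca, comp, NULL, ctx, &cq_attr);
	if (IS_ERR(*cq)) {
		ib_dealloc_pd(*pd);
		return (-ENODEV);
	}

	*lkey = (*pd)->local_dma_lkey;	/* no ib_get_dma_mr() needed */
	return (0);
}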
diff --git a/sys/ofed/include/rdma/ib.h b/sys/ofed/include/rdma/ib.h
new file mode 100644
index 0000000000000..8d4aefcc91277
--- /dev/null
+++ b/sys/ofed/include/rdma/ib.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(_RDMA_IB_H)
+#define _RDMA_IB_H
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+
+/*
+ * Define a native infiniband address as in Linux upstream
+ * 8d36eb01da5d371feffa280e501377b5c450f5a5
+ */
+#define AF_IB 41
+
+struct ib_addr {
+ union {
+ __u8 uib_addr8[16];
+ __be16 uib_addr16[8];
+ __be32 uib_addr32[4];
+ __be64 uib_addr64[2];
+ } ib_u;
+#define sib_addr8 ib_u.uib_addr8
+#define sib_addr16 ib_u.uib_addr16
+#define sib_addr32 ib_u.uib_addr32
+#define sib_addr64 ib_u.uib_addr64
+#define sib_raw ib_u.uib_addr8
+#define sib_subnet_prefix ib_u.uib_addr64[0]
+#define sib_interface_id ib_u.uib_addr64[1]
+};
+
+static inline int ib_addr_any(const struct ib_addr *a)
+{
+ return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0);
+}
+
+static inline int ib_addr_loopback(const struct ib_addr *a)
+{
+ return ((a->sib_addr32[0] | a->sib_addr32[1] |
+ a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0);
+}
+
+static inline void ib_addr_set(struct ib_addr *addr,
+ __be32 w1, __be32 w2, __be32 w3, __be32 w4)
+{
+ addr->sib_addr32[0] = w1;
+ addr->sib_addr32[1] = w2;
+ addr->sib_addr32[2] = w3;
+ addr->sib_addr32[3] = w4;
+}
+
+static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2)
+{
+ return memcmp(a1, a2, sizeof(struct ib_addr));
+}
+
+struct sockaddr_ib {
+ unsigned short int sib_family; /* AF_IB */
+ __be16 sib_pkey;
+ __be32 sib_flowinfo;
+ struct ib_addr sib_addr;
+ __be64 sib_sid;
+ __be64 sib_sid_mask;
+ __u64 sib_scope_id;
+};
+
+/*
+ * The IB interfaces that use write() as bi-directional ioctl() are
+ * fundamentally unsafe, since there are lots of ways to trigger "write()"
+ * calls from various contexts with elevated privileges. That includes the
+ * traditional suid executable error message writes, but also various kernel
+ * interfaces that can write to file descriptors.
+ *
+ * This function provides protection for the legacy API by restricting the
+ * calling context.
+ */
+static inline bool ib_safe_file_access(struct file *filp)
+{
+ struct thread *td = curthread;
+
+ /*
+ * Check if called from userspace through a devfs related
+ * system call belonging to the given file:
+ */
+ return (filp->_file != NULL &&
+ filp->_file == td->td_fpop &&
+ filp->_file->f_cred == td->td_ucred);
+}
+
+#endif /* _RDMA_IB_H */
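A short, illustrative use of the new AF_IB definitions above: filling a struct sockaddr_ib from already-byte-swapped components. The helper name is hypothetical; only the structure layout comes from this header.

#include <linux/string.h>
#include <rdma/ib.h>

static void fill_sockaddr_ib(struct sockaddr_ib *sib, __be64 subnet_prefix,
    __be64 interface_id, __be64 service_id, __be64 service_id_mask)
{
	memset(sib, 0, sizeof(*sib));
	sib->sib_family = AF_IB;
	sib->sib_addr.sib_subnet_prefix = subnet_prefix;
	sib->sib_addr.sib_interface_id = interface_id;
	sib->sib_sid = service_id;
	sib->sib_sid_mask = service_id_mask;
}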
diff --git a/sys/ofed/include/rdma/ib_addr.h b/sys/ofed/include/rdma/ib_addr.h
index c62f3ebba284d..1dbe844cc8d0e 100644
--- a/sys/ofed/include/rdma/ib_addr.h
+++ b/sys/ofed/include/rdma/ib_addr.h
@@ -31,7 +31,7 @@
* SOFTWARE.
*/
-#ifndef IB_ADDR_H
+#if !defined(IB_ADDR_H)
#define IB_ADDR_H
#include <linux/in.h>
@@ -41,9 +41,11 @@
#include <linux/inetdevice.h>
#include <linux/socket.h>
#include <linux/if_vlan.h>
+#include <net/ipv6.h>
+#include <net/if_inet6.h>
+#include <net/ip.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_pack.h>
-#include <net/if_inet6.h>
#include <net/ipv6.h>
struct rdma_addr_client {
@@ -62,6 +64,17 @@ void rdma_addr_register_client(struct rdma_addr_client *client);
*/
void rdma_addr_unregister_client(struct rdma_addr_client *client);
+/**
+ * struct rdma_dev_addr - Contains resolved RDMA hardware addresses
+ * @src_dev_addr: Source MAC address.
+ * @dst_dev_addr: Destination MAC address.
+ * @broadcast: Broadcast address of the device.
+ * @dev_type: The interface hardware type of the device.
+ * @bound_dev_if: An optional device interface index.
+ * @transport: The transport type used.
+ * @net: Network namespace containing the bound_dev_if net_dev.
+ */
+struct vnet;
struct rdma_dev_addr {
unsigned char src_dev_addr[MAX_ADDR_LEN];
unsigned char dst_dev_addr[MAX_ADDR_LEN];
@@ -69,14 +82,19 @@ struct rdma_dev_addr {
unsigned short dev_type;
int bound_dev_if;
enum rdma_transport_type transport;
+ struct vnet *net;
+ enum rdma_network_type network;
+ int hoplimit;
};
/**
* rdma_translate_ip - Translate a local IP address to an RDMA hardware
* address.
+ *
+ * The dev_addr->net field must be initialized.
*/
-int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
- u16 *vlan_id);
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr, u16 *vlan_id);
/**
* rdma_resolve_ip - Resolve source and destination IP addresses to
@@ -88,7 +106,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
* @dst_addr: The destination address to resolve.
* @addr: A reference to a data location that will receive the resolved
* addresses. The data location must remain valid until the callback has
- * been invoked.
+ * been invoked. The net field of the addr struct must be valid.
* @timeout_ms: Amount of time to wait for the address resolution to complete.
* @callback: Call invoked once address resolution has completed, timed out,
* or been canceled. A status of 0 indicates success.
@@ -101,20 +119,22 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
struct rdma_dev_addr *addr, void *context),
void *context);
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr);
+
void rdma_addr_cancel(struct rdma_dev_addr *addr);
int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
const unsigned char *dst_dev_addr);
-int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id,
- u32 scope_id);
-int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac,
- u16 *vlan_id, u32 scope_id);
-static inline int ip_addr_size(struct sockaddr *addr)
-{
- return addr->sa_family == AF_INET6 ?
- sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in);
-}
+int rdma_addr_size(struct sockaddr *addr);
+
+int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id);
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *smac, u16 *vlan_id, int *if_index,
+ int *hoplimit);
static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
{
@@ -147,16 +167,16 @@ static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev)
return tag;
}
-static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid)
+static inline int rdma_ip2gid(const struct sockaddr *addr, union ib_gid *gid)
{
switch (addr->sa_family) {
case AF_INET:
- ipv6_addr_set_v4mapped(((struct sockaddr_in *)addr)->sin_addr.s_addr,
+ ipv6_addr_set_v4mapped(((const struct sockaddr_in *)
+ addr)->sin_addr.s_addr,
(struct in6_addr *)gid);
break;
case AF_INET6:
- memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr,
- 16);
+ memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16);
break;
default:
return -EINVAL;
@@ -165,10 +185,9 @@ static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid)
}
/* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */
-static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid,
- uint32_t scope_id)
+static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid)
{
- if (ipv6_addr_v4mapped((struct in6_addr *)gid)) {
+ if (ipv6_addr_v4mapped((const struct in6_addr *)gid)) {
struct sockaddr_in *out_in = (struct sockaddr_in *)out;
memset(out_in, 0, sizeof(*out_in));
out_in->sin_len = sizeof(*out_in);
@@ -180,21 +199,28 @@ static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid,
out_in->sin6_len = sizeof(*out_in);
out_in->sin6_family = AF_INET6;
memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16);
- if (scope_id < 256 &&
- IN6_IS_SCOPE_LINKLOCAL(&out_in->sin6_addr))
- out_in->sin6_scope_id = scope_id;
}
- return 0;
}
-u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num);
-
-/* This func is called only in loopback ip address (127.0.0.1)
- * case in which sgid is not relevant
- */
static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
union ib_gid *gid)
{
+ struct net_device *dev;
+ struct ifaddr *ifa;
+
+ dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (dev) {
+ TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr == NULL ||
+ ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ipv6_addr_set_v4mapped(((struct sockaddr_in *)
+ ifa->ifa_addr)->sin_addr.s_addr,
+ (struct in6_addr *)gid);
+ break;
+ }
+ dev_put(dev);
+ }
}
static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
@@ -246,13 +272,19 @@ static inline enum ib_mtu iboe_get_mtu(int mtu)
static inline int iboe_get_rate(struct net_device *dev)
{
- if (dev->if_baudrate >= IF_Gbps(40))
+ uint64_t baudrate = dev->if_baudrate;
+#ifdef if_baudrate_pf
+ int exp;
+ for (exp = dev->if_baudrate_pf; exp > 0; exp--)
+ baudrate *= 10;
+#endif
+ if (baudrate >= IF_Gbps(40))
return IB_RATE_40_GBPS;
- else if (dev->if_baudrate >= IF_Gbps(30))
+ else if (baudrate >= IF_Gbps(30))
return IB_RATE_30_GBPS;
- else if (dev->if_baudrate >= IF_Gbps(20))
+ else if (baudrate >= IF_Gbps(20))
return IB_RATE_20_GBPS;
- else if (dev->if_baudrate >= IF_Gbps(10))
+ else if (baudrate >= IF_Gbps(10))
return IB_RATE_10_GBPS;
else
return IB_RATE_PORT_CURRENT;
@@ -279,20 +311,6 @@ static inline int rdma_is_multicast_addr(struct in6_addr *addr)
return addr->s6_addr[0] == 0xff;
}
-static inline void resolve_mcast_mac(struct in6_addr *addr, u8 *mac)
-{
- if (addr->s6_addr[0] != 0xff)
- return;
-
-#ifdef DUAL_MODE_MCAST_MAC
- if (addr->s6_addr[1] == 0x0e) /* IPv4 */
- ip_eth_mc_map(addr->s6_addr32[3], mac);
- else
-#endif
- ipv6_eth_mc_map(addr, mac);
-}
-
-
static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
{
int i;
@@ -308,7 +326,7 @@ static inline u16 rdma_get_vlan_id(union ib_gid *dgid)
u16 vid;
vid = dgid->raw[11] << 8 | dgid->raw[12];
- return vid < 0x1000 ? vid : 0xffff;
+ return vid < 0x1000 ? vid : 0xffff;
}
static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev)
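The rdma_ip2gid()/rdma_gid2ip() pair above now takes const sockaddrs and drops the scope-id handling. A minimal round-trip sketch, assuming kernel context:

#include <rdma/ib_addr.h>

static int ipv4_gid_round_trip(struct in_addr ip4, union ib_gid *gid,
    struct sockaddr_in *out)
{
	struct sockaddr_in sin = { };

	sin.sin_family = AF_INET;
	sin.sin_addr = ip4;
	if (rdma_ip2gid((struct sockaddr *)&sin, gid) != 0)
		return (-EINVAL);

	/* rdma_gid2ip() now returns void; a v4-mapped GID yields AF_INET */
	rdma_gid2ip((struct sockaddr *)out, gid);
	return (0);
}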
diff --git a/sys/ofed/include/rdma/ib_cache.h b/sys/ofed/include/rdma/ib_cache.h
index ad9a3c280944f..e30f19bd4a41e 100644
--- a/sys/ofed/include/rdma/ib_cache.h
+++ b/sys/ofed/include/rdma/ib_cache.h
@@ -43,6 +43,8 @@
* @port_num: The port number of the device to query.
* @index: The index into the cached GID table to query.
* @gid: The GID value found at the specified index.
+ * @attr: The GID attribute found at the specified index (only in RoCE).
+ * NULL means ignore (output parameter).
*
* ib_get_cached_gid() fetches the specified GID table entry stored in
* the local software cache.
@@ -50,13 +52,16 @@
int ib_get_cached_gid(struct ib_device *device,
u8 port_num,
int index,
- union ib_gid *gid);
+ union ib_gid *gid,
+ struct ib_gid_attr *attr);
/**
* ib_find_cached_gid - Returns the port number and GID table index where
* a specified GID value occurs.
* @device: The device to query.
* @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
* @port_num: The port number of the device where the GID value was found.
* @index: The index into the cached GID table where the GID was found. This
* parameter may be NULL.
@@ -65,11 +70,42 @@ int ib_get_cached_gid(struct ib_device *device,
* the local software cache.
*/
int ib_find_cached_gid(struct ib_device *device,
- union ib_gid *gid,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ struct net_device *ndev,
u8 *port_num,
u16 *index);
/**
+ * ib_find_cached_gid_by_port - Returns the GID table index where a specified
+ * GID value occurs
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @port_num: The port number of the device where the GID value should be
+ * searched.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
+ * @index: The index into the cached GID table where the GID was found. This
+ * parameter may be NULL.
+ *
+ * ib_find_cached_gid_by_port() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid_by_port(struct ib_device *device,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ u8 port_num,
+ struct net_device *ndev,
+ u16 *index);
+
+int ib_find_gid_by_filter(struct ib_device *device,
+ const union ib_gid *gid,
+ u8 port_num,
+ bool (*filter)(const union ib_gid *gid,
+ const struct ib_gid_attr *,
+ void *),
+ void *context, u16 *index);
+/**
* ib_get_cached_pkey - Returns a cached PKey table entry
* @device: The device to query.
* @port_num: The port number of the device to query.
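The cached-GID lookups above gain RoCE-aware parameters; passing a NULL attribute and net device keeps the plain IB behaviour. A sketch under that assumption (IB_GID_TYPE_IB is taken from the ib_verbs.h side of this update):

#include <rdma/ib_cache.h>

static int find_port_of_gid0(struct ib_device *dev, u8 port,
    union ib_gid *gid, u8 *found_port, u16 *found_index)
{
	int rc;

	rc = ib_get_cached_gid(dev, port, 0, gid, NULL);
	if (rc != 0)
		return (rc);

	return (ib_find_cached_gid(dev, gid, IB_GID_TYPE_IB, NULL,
	    found_port, found_index));
}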
diff --git a/sys/ofed/include/rdma/ib_cm.h b/sys/ofed/include/rdma/ib_cm.h
index a7ffaf9503916..92a7d85917b4d 100644
--- a/sys/ofed/include/rdma/ib_cm.h
+++ b/sys/ofed/include/rdma/ib_cm.h
@@ -105,13 +105,16 @@ enum ib_cm_data_size {
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
IB_CM_SIDR_REP_INFO_LENGTH = 72,
- IB_CM_COMPARE_SIZE = 64
};
struct ib_cm_id;
struct ib_cm_req_event_param {
struct ib_cm_id *listen_id;
+
+ /* P_Key that was used by the GMP's BTH header */
+ u16 bth_pkey;
+
u8 port;
struct ib_sa_path_rec *primary_path;
@@ -222,6 +225,9 @@ struct ib_cm_apr_event_param {
struct ib_cm_sidr_req_event_param {
struct ib_cm_id *listen_id;
+ __be64 service_id;
+ /* P_Key that was used by the GMP's BTH header */
+ u16 bth_pkey;
u8 port;
u16 pkey;
};
@@ -336,11 +342,6 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id);
#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL)
#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
-struct ib_cm_compare_data {
- u8 data[IB_CM_COMPARE_SIZE];
- u8 mask[IB_CM_COMPARE_SIZE];
-};
-
/**
* ib_cm_listen - Initiates listening on the specified service ID for
* connection and service ID resolution requests.
@@ -353,12 +354,13 @@ struct ib_cm_compare_data {
* range of service IDs. If set to 0, the service ID is matched
* exactly. This parameter is ignored if %service_id is set to
* IB_CM_ASSIGN_SERVICE_ID.
- * @compare_data: This parameter is optional. It specifies data that must
- * appear in the private data of a connection request for the specified
- * listen request.
*/
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
- struct ib_cm_compare_data *compare_data);
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+ __be64 service_mask);
+
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id);
struct ib_cm_req_param {
struct ib_sa_path_rec *primary_path;
@@ -601,6 +603,4 @@ struct ib_cm_sidr_rep_param {
int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
struct ib_cm_sidr_rep_param *param);
-int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac);
-
#endif /* IB_CM_H */
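With the compare_data parameter gone, a plain listen reduces to the sketch below; listeners that previously multiplexed on private data are expected to use ib_cm_insert_listen() instead. ib_create_cm_id() is assumed from the unchanged part of this header.

#include <linux/err.h>
#include <rdma/ib_cm.h>

static struct ib_cm_id *start_listen(struct ib_device *dev,
    ib_cm_handler handler, __be64 service_id)
{
	struct ib_cm_id *id;

	id = ib_create_cm_id(dev, handler, NULL);
	if (IS_ERR(id))
		return (id);

	/* service_mask 0: match the service ID exactly */
	if (ib_cm_listen(id, service_id, 0) != 0) {
		ib_destroy_cm_id(id);
		return (ERR_PTR(-EINVAL));
	}
	return (id);
}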
diff --git a/sys/ofed/include/rdma/ib_hdrs.h b/sys/ofed/include/rdma/ib_hdrs.h
new file mode 100644
index 0000000000000..408439fe911e4
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_hdrs.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef IB_HDRS_H
+#define IB_HDRS_H
+
+#include <linux/types.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+
+#define IB_SEQ_NAK (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+#define IB_BTH_REQ_ACK BIT(31)
+#define IB_BTH_SOLICITED BIT(23)
+#define IB_BTH_MIG_REQ BIT(22)
+
+#define IB_GRH_VERSION 6
+#define IB_GRH_VERSION_MASK 0xF
+#define IB_GRH_VERSION_SHIFT 28
+#define IB_GRH_TCLASS_MASK 0xFF
+#define IB_GRH_TCLASS_SHIFT 20
+#define IB_GRH_FLOW_MASK 0xFFFFF
+#define IB_GRH_FLOW_SHIFT 0
+#define IB_GRH_NEXT_HDR 0x1B
+
+struct ib_reth {
+ __be64 vaddr; /* potentially unaligned */
+ __be32 rkey;
+ __be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+ __be64 vaddr; /* potentially unaligned */
+ __be32 rkey;
+ __be64 swap_data; /* potentially unaligned */
+ __be64 compare_data; /* potentially unaligned */
+} __packed;
+
+union ib_ehdrs {
+ struct {
+ __be32 deth[2];
+ __be32 imm_data;
+ } ud;
+ struct {
+ struct ib_reth reth;
+ __be32 imm_data;
+ } rc;
+ struct {
+ __be32 aeth;
+ __be64 atomic_ack_eth; /* potentially unaligned */
+ } __packed at;
+ __be32 imm_data;
+ __be32 aeth;
+ __be32 ieth;
+ struct ib_atomic_eth atomic_eth;
+} __packed;
+
+struct ib_other_headers {
+ __be32 bth[3];
+ union ib_ehdrs u;
+} __packed;
+
+struct ib_header {
+ __be16 lrh[4];
+ union {
+ struct {
+ struct ib_grh grh;
+ struct ib_other_headers oth;
+ } l;
+ struct ib_other_headers oth;
+ } u;
+} __packed;
+
+/* accessors for unaligned __be64 items */
+
+static inline u64 ib_u64_get(__be64 *p)
+{
+ return get_unaligned_be64(p);
+}
+
+static inline void ib_u64_put(u64 val, __be64 *p)
+{
+ put_unaligned_be64(val, p);
+}
+
+static inline u64 get_ib_reth_vaddr(struct ib_reth *reth)
+{
+ return ib_u64_get(&reth->vaddr);
+}
+
+static inline void put_ib_reth_vaddr(u64 val, struct ib_reth *reth)
+{
+ ib_u64_put(val, &reth->vaddr);
+}
+
+static inline u64 get_ib_ateth_vaddr(struct ib_atomic_eth *ateth)
+{
+ return ib_u64_get(&ateth->vaddr);
+}
+
+static inline void put_ib_ateth_vaddr(u64 val, struct ib_atomic_eth *ateth)
+{
+ ib_u64_put(val, &ateth->vaddr);
+}
+
+static inline u64 get_ib_ateth_swap(struct ib_atomic_eth *ateth)
+{
+ return ib_u64_get(&ateth->swap_data);
+}
+
+static inline void put_ib_ateth_swap(u64 val, struct ib_atomic_eth *ateth)
+{
+ ib_u64_put(val, &ateth->swap_data);
+}
+
+static inline u64 get_ib_ateth_compare(struct ib_atomic_eth *ateth)
+{
+ return ib_u64_get(&ateth->compare_data);
+}
+
+static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth)
+{
+ ib_u64_put(val, &ateth->compare_data);
+}
+
+#endif /* IB_HDRS_H */
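As an example of the GRH constants defined above, the first word of a GRH (version, traffic class, flow label) can be composed as follows; a minimal sketch assuming the usual kernel byte-order helpers:

#include <rdma/ib_hdrs.h>

static __be32 make_grh_word0(u32 tclass, u32 flow_label)
{
	return cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
	    ((tclass & IB_GRH_TCLASS_MASK) << IB_GRH_TCLASS_SHIFT) |
	    ((flow_label & IB_GRH_FLOW_MASK) << IB_GRH_FLOW_SHIFT));
}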
diff --git a/sys/ofed/include/rdma/ib_mad.h b/sys/ofed/include/rdma/ib_mad.h
index 3d81b90cc3158..9b02d11364043 100644
--- a/sys/ofed/include/rdma/ib_mad.h
+++ b/sys/ofed/include/rdma/ib_mad.h
@@ -40,9 +40,13 @@
#include <linux/list.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_mad.h>
-/* Management base version */
+/* Management base versions */
#define IB_MGMT_BASE_VERSION 1
+#define OPA_MGMT_BASE_VERSION 0x80
+
+#define OPA_SMP_CLASS_VERSION 0x80
/* Management classes */
#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01
@@ -123,6 +127,23 @@
#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
#define IB_DEFAULT_PKEY_FULL 0xFFFF
+/*
+ * Generic trap/notice types
+ */
+#define IB_NOTICE_TYPE_FATAL 0x80
+#define IB_NOTICE_TYPE_URGENT 0x81
+#define IB_NOTICE_TYPE_SECURITY 0x82
+#define IB_NOTICE_TYPE_SM 0x83
+#define IB_NOTICE_TYPE_INFO 0x84
+
+/*
+ * Generic trap/notice producers
+ */
+#define IB_NOTICE_PROD_CA cpu_to_be16(1)
+#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2)
+#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3)
+#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4)
+
enum {
IB_MGMT_MAD_HDR = 24,
IB_MGMT_MAD_DATA = 232,
@@ -134,6 +155,10 @@ enum {
IB_MGMT_SA_DATA = 200,
IB_MGMT_DEVICE_HDR = 64,
IB_MGMT_DEVICE_DATA = 192,
+ IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA,
+ OPA_MGMT_MAD_DATA = 2024,
+ OPA_MGMT_RMPP_DATA = 2012,
+ OPA_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + OPA_MGMT_MAD_DATA,
};
struct ib_mad_hdr {
@@ -180,12 +205,23 @@ struct ib_mad {
u8 data[IB_MGMT_MAD_DATA];
};
+struct opa_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[OPA_MGMT_MAD_DATA];
+};
+
struct ib_rmpp_mad {
struct ib_mad_hdr mad_hdr;
struct ib_rmpp_hdr rmpp_hdr;
u8 data[IB_MGMT_RMPP_DATA];
};
+struct opa_rmpp_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 data[OPA_MGMT_RMPP_DATA];
+};
+
struct ib_sa_mad {
struct ib_mad_hdr mad_hdr;
struct ib_rmpp_hdr rmpp_hdr;
@@ -201,12 +237,17 @@ struct ib_vendor_mad {
u8 data[IB_MGMT_VENDOR_DATA];
};
+#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001)
+
+#define IB_CLASS_PORT_INFO_RESP_TIME_MASK 0x1F
+#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5
+
struct ib_class_port_info {
u8 base_version;
u8 class_version;
__be16 capability_mask;
- u8 reserved[3];
- u8 resp_time_value;
+ /* 27 bits for cap_mask2, 5 bits for resp_time */
+ __be32 cap_mask2_resp_time;
u8 redirect_gid[16];
__be32 redirect_tcslfl;
__be16 redirect_lid;
@@ -222,6 +263,123 @@ struct ib_class_port_info {
};
/**
+ * ib_get_cpi_resp_time - Returns the resp_time value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi)
+{
+ return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) &
+ IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_set_cpi_resp_time - Sets the response time in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi,
+ u8 rtime)
+{
+ cpi->cap_mask2_resp_time =
+ (cpi->cap_mask2_resp_time &
+ cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+ cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_get_cpi_capmask2 - Returns the capmask2 value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi)
+{
+ return (be32_to_cpu(cpi->cap_mask2_resp_time) >>
+ IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
+/**
+ * ib_set_cpi_capmask2 - Sets the capmask2 in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @capmask2: The capmask2 to set.
+ */
+static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi,
+ u32 capmask2)
+{
+ cpi->cap_mask2_resp_time =
+ (cpi->cap_mask2_resp_time &
+ cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+ cpu_to_be32(capmask2 <<
+ IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
+struct ib_mad_notice_attr {
+ u8 generic_type;
+ u8 prod_type_msb;
+ __be16 prod_type_lsb;
+ __be16 trap_num;
+ __be16 issuer_lid;
+ __be16 toggle_count;
+
+ union {
+ struct {
+ u8 details[54];
+ } raw_data;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* where violation happened */
+ u8 port_num; /* where violation happened */
+ } __packed ntc_129_131;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* LID where change occurred */
+ u8 reserved2;
+ u8 local_changes; /* low bit - local changes */
+ __be32 new_cap_mask; /* new capability mask */
+ u8 reserved3;
+ u8 change_flags; /* low 3 bits only */
+ } __packed ntc_144;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* lid where sys guid changed */
+ __be16 reserved2;
+ __be64 new_sys_guid;
+ } __packed ntc_145;
+
+ struct {
+ __be16 reserved;
+ __be16 lid;
+ __be16 dr_slid;
+ u8 method;
+ u8 reserved2;
+ __be16 attr_id;
+ __be32 attr_mod;
+ __be64 mkey;
+ u8 reserved3;
+ u8 dr_trunc_hop;
+ u8 dr_rtn_path[30];
+ } __packed ntc_256;
+
+ struct {
+ __be16 reserved;
+ __be16 lid1;
+ __be16 lid2;
+ __be32 key;
+ __be32 sl_qp1; /* SL: high 4 bits */
+ __be32 qp2; /* high 8 bits reserved */
+ union ib_gid gid1;
+ union ib_gid gid2;
+ } __packed ntc_257_258;
+
+ } details;
+};
+
+/**
* ib_mad_send_buf - MAD data buffer and work request for sends.
* @next: A pointer used to chain together MADs for posting.
* @mad: References an allocated MAD data buffer for MADs that do not have
@@ -234,7 +392,10 @@ struct ib_class_port_info {
* includes the common MAD, RMPP, and class specific headers.
* @data_len: Indicates the total size of user-transferred data.
* @seg_count: The number of RMPP segments allocated for this send.
- * @seg_size: Size of each RMPP segment.
+ * @seg_size: Size of the data in each RMPP segment. This does not include
+ * class specific headers.
+ * @seg_rmpp_size: Size of each RMPP segment including the class specific
+ * headers.
* @timeout_ms: Time to wait for a response.
* @retries: Number of times to retry a request for a response. For MADs
* using RMPP, this applies per window. On completion, returns the number
@@ -254,6 +415,7 @@ struct ib_mad_send_buf {
int data_len;
int seg_count;
int seg_size;
+ int seg_rmpp_size;
int timeout_ms;
int retries;
};
@@ -262,7 +424,7 @@ struct ib_mad_send_buf {
* ib_response_mad - Returns if the specified MAD has been generated in
* response to a sent request or trap.
*/
-int ib_response_mad(struct ib_mad *mad);
+int ib_response_mad(const struct ib_mad_hdr *hdr);
/**
* ib_get_rmpp_resptime - Returns the RMPP response time.
@@ -277,7 +439,7 @@ static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr)
* ib_get_rmpp_flags - Returns the RMPP flags.
* @rmpp_hdr: An RMPP header.
*/
-static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr)
+static inline u8 ib_get_rmpp_flags(const struct ib_rmpp_hdr *rmpp_hdr)
{
return rmpp_hdr->rmpp_rtime_flags & 0x7;
}
@@ -318,11 +480,11 @@ typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
/**
* ib_mad_snoop_handler - Callback handler for snooping sent MADs.
* @mad_agent: MAD agent that snooped the MAD.
- * @send_wr: Work request information on the sent MAD.
+ * @send_buf: send MAD data buffer.
* @mad_send_wc: Work completion information on the sent MAD. Valid
* only for snooping that occurs on a send completion.
*
- * Clients snooping MADs should not modify data referenced by the @send_wr
+ * Clients snooping MADs should not modify data referenced by the @send_buf
* or @mad_send_wc.
*/
typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
@@ -332,6 +494,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
/**
* ib_mad_recv_handler - callback handler for a received MAD.
* @mad_agent: MAD agent requesting the received MAD.
+ * @send_buf: Send buffer if found, else NULL
* @mad_recv_wc: Received work completion information on the received MAD.
*
* MADs received in response to a send request operation will be handed to
@@ -341,6 +504,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
* modify the data referenced by @mad_recv_wc.
*/
typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
struct ib_mad_recv_wc *mad_recv_wc);
/**
@@ -355,18 +519,22 @@ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
* @hi_tid: Access layer assigned transaction ID for this client.
* Unsolicited MADs sent by this client will have the upper 32-bits
* of their TID set to this value.
+ * @flags: registration flags
* @port_num: Port number on which QP is registered
* @rmpp_version: If set, indicates the RMPP version used by this agent.
*/
+enum {
+ IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP,
+};
struct ib_mad_agent {
struct ib_device *device;
struct ib_qp *qp;
- struct ib_mr *mr;
ib_mad_recv_handler recv_handler;
ib_mad_send_handler send_handler;
ib_mad_snoop_handler snoop_handler;
void *context;
u32 hi_tid;
+ u32 flags;
u8 port_num;
u8 rmpp_version;
};
@@ -395,7 +563,10 @@ struct ib_mad_send_wc {
struct ib_mad_recv_buf {
struct list_head list;
struct ib_grh *grh;
- struct ib_mad *mad;
+ union {
+ struct ib_mad *mad;
+ struct opa_mad *opa_mad;
+ };
};
/**
@@ -404,6 +575,7 @@ struct ib_mad_recv_buf {
* @recv_buf: Specifies the location of the received data buffer(s).
* @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers.
* @mad_len: The length of the received MAD, without duplicated headers.
+ * @mad_seg_size: The size of individual MAD segments
*
* For received response, the wr_id contains a pointer to the ib_mad_send_buf
* for the corresponding send request.
@@ -413,6 +585,7 @@ struct ib_mad_recv_wc {
struct ib_mad_recv_buf recv_buf;
struct list_head rmpp_list;
int mad_len;
+ size_t mad_seg_size;
};
/**
@@ -426,6 +599,7 @@ struct ib_mad_recv_wc {
* in the range from 0x30 to 0x4f. Otherwise not used.
* @method_mask: The caller will receive unsolicited MADs for any method
* where @method_mask = 1.
+ *
*/
struct ib_mad_reg_req {
u8 mgmt_class;
@@ -451,6 +625,7 @@ struct ib_mad_reg_req {
* @recv_handler: The completion callback routine invoked for a received
* MAD.
* @context: User specified context associated with the registration.
+ * @registration_flags: Registration flags to set for this agent
*/
struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
u8 port_num,
@@ -459,7 +634,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
u8 rmpp_version,
ib_mad_send_handler send_handler,
ib_mad_recv_handler recv_handler,
- void *context);
+ void *context,
+ u32 registration_flags);
enum ib_mad_snoop_flags {
/*IB_MAD_SNOOP_POSTED_SENDS = 1,*/
@@ -609,6 +785,7 @@ int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
* automatically adjust the allocated buffer size to account for any
* additional padding that may be necessary.
* @gfp_mask: GFP mask used for the memory allocation.
+ * @base_version: Base Version of this MAD
*
* This routine allocates a MAD for sending. The returned MAD send buffer
* will reference a data buffer usable for sending a MAD, along
@@ -624,7 +801,8 @@ struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
u32 remote_qpn, u16 pkey_index,
int rmpp_active,
int hdr_len, int data_len,
- gfp_t gfp_mask);
+ gfp_t gfp_mask,
+ u8 base_version);
/**
* ib_is_mad_class_rmpp - returns whether given management class
@@ -661,4 +839,11 @@ void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num);
*/
void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
+/**
+ * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP.
+ * @agent: the agent in question
+ * @return: true if agent is performing rmpp, false otherwise.
+ */
+int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent);
+
#endif /* IB_MAD_H */
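The MAD send-buffer allocator above now takes the base version explicitly, so an ordinary (non-OPA, non-RMPP) request is allocated as in this sketch:

#include <rdma/ib_mad.h>

static struct ib_mad_send_buf *alloc_plain_mad(struct ib_mad_agent *agent,
    u32 remote_qpn, u16 pkey_index)
{
	return (ib_create_send_mad(agent, remote_qpn, pkey_index,
	    0 /* rmpp_active */, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
	    GFP_KERNEL, IB_MGMT_BASE_VERSION));
}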
diff --git a/sys/ofed/include/rdma/ib_pack.h b/sys/ofed/include/rdma/ib_pack.h
index 1678be7f047dc..950aeebedc8a2 100644
--- a/sys/ofed/include/rdma/ib_pack.h
+++ b/sys/ofed/include/rdma/ib_pack.h
@@ -40,6 +40,8 @@ enum {
IB_ETH_BYTES = 14,
IB_VLAN_BYTES = 4,
IB_GRH_BYTES = 40,
+ IB_IP4_BYTES = 20,
+ IB_UDP_BYTES = 8,
IB_BTH_BYTES = 12,
IB_DETH_BYTES = 8
};
@@ -75,6 +77,8 @@ enum {
IB_OPCODE_UC = 0x20,
IB_OPCODE_RD = 0x40,
IB_OPCODE_UD = 0x60,
+ /* per IBTA 1.3 vol 1 Table 38, A10.3.2 */
+ IB_OPCODE_CNP = 0x80,
/* operations -- just used to define real constants */
IB_OPCODE_SEND_FIRST = 0x00,
@@ -98,6 +102,9 @@ enum {
IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
IB_OPCODE_COMPARE_SWAP = 0x13,
IB_OPCODE_FETCH_ADD = 0x14,
+ /* opcode 0x15 is reserved */
+ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
+ IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
/* real constants follow -- see comment about above IB_OPCODE()
macro for more details */
@@ -124,6 +131,8 @@ enum {
IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
IB_OPCODE(RC, COMPARE_SWAP),
IB_OPCODE(RC, FETCH_ADD),
+ IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+ IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
/* UC */
IB_OPCODE(UC, SEND_FIRST),
@@ -220,6 +229,27 @@ struct ib_unpacked_eth {
__be16 type;
};
+struct ib_unpacked_ip4 {
+ u8 ver;
+ u8 hdr_len;
+ u8 tos;
+ __be16 tot_len;
+ __be16 id;
+ __be16 frag_off;
+ u8 ttl;
+ u8 protocol;
+ __sum16 check;
+ __be32 saddr;
+ __be32 daddr;
+};
+
+struct ib_unpacked_udp {
+ __be16 sport;
+ __be16 dport;
+ __be16 length;
+ __be16 csum;
+};
+
struct ib_unpacked_vlan {
__be16 tag;
__be16 type;
@@ -228,16 +258,20 @@ struct ib_unpacked_vlan {
struct ib_ud_header {
int lrh_present;
struct ib_unpacked_lrh lrh;
- int eth_present;
- struct ib_unpacked_eth eth;
+ int eth_present;
+ struct ib_unpacked_eth eth;
int vlan_present;
struct ib_unpacked_vlan vlan;
- int grh_present;
- struct ib_unpacked_grh grh;
- struct ib_unpacked_bth bth;
+ int grh_present;
+ struct ib_unpacked_grh grh;
+ int ipv4_present;
+ struct ib_unpacked_ip4 ip4;
+ int udp_present;
+ struct ib_unpacked_udp udp;
+ struct ib_unpacked_bth bth;
struct ib_unpacked_deth deth;
- int immediate_present;
- __be32 immediate_data;
+ int immediate_present;
+ __be32 immediate_data;
};
void ib_pack(const struct ib_field *desc,
@@ -250,13 +284,17 @@ void ib_unpack(const struct ib_field *desc,
void *buf,
void *structure);
-void ib_ud_header_init(int payload_bytes,
- int lrh_present,
- int eth_present,
- int vlan_present,
- int grh_present,
- int immediate_present,
- struct ib_ud_header *header);
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header);
+
+int ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int ip_version,
+ int udp_present,
+ int immediate_present,
+ struct ib_ud_header *header);
int ib_ud_header_pack(struct ib_ud_header *header,
void *buf);
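ib_ud_header_init() above now selects between GRH, IPv4, and UDP headers and reports errors. A hedged sketch of the RoCE v2 over IPv4 case; the meaning of ip_version (0 = no IP header, 4 = IPv4) is inferred from the new struct ib_ud_header fields, not stated in this header:

#include <rdma/ib_pack.h>

static int init_rocev2_ipv4_ud_header(int payload_bytes,
    struct ib_ud_header *hdr)
{
	return (ib_ud_header_init(payload_bytes,
	    0,	/* lrh_present */
	    1,	/* eth_present */
	    0,	/* vlan_present */
	    0,	/* grh_present */
	    4,	/* ip_version (assumed: 4 selects the IPv4 header) */
	    1,	/* udp_present */
	    0,	/* immediate_present */
	    hdr));
}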
diff --git a/sys/ofed/include/rdma/ib_peer_mem.h b/sys/ofed/include/rdma/ib_peer_mem.h
deleted file mode 100644
index b2a8a4a3f246e..0000000000000
--- a/sys/ofed/include/rdma/ib_peer_mem.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#if !defined(IB_PEER_MEM_H)
-#define IB_PEER_MEM_H
-
-#include <rdma/peer_mem.h>
-
-
-struct invalidation_ctx;
-struct ib_ucontext;
-
-struct ib_peer_memory_statistics {
- unsigned long num_alloc_mrs;
- unsigned long num_dealloc_mrs;
- unsigned long num_reg_pages;
- unsigned long num_dereg_pages;
- unsigned long num_free_callbacks;
-};
-
-struct ib_peer_memory_client {
- const struct peer_memory_client *peer_mem;
-
- struct list_head core_peer_list;
- struct list_head core_ticket_list;
- unsigned long last_ticket;
-#ifdef __FreeBSD__
- int holdcount;
- int needwakeup;
- struct cv peer_cv;
-#else
- struct srcu_struct peer_srcu;
-#endif
- struct mutex lock;
- struct kobject *kobj;
- struct attribute_group peer_mem_attr_group;
- struct ib_peer_memory_statistics stats;
-};
-
-struct core_ticket {
- unsigned long key;
- void *context;
- struct list_head ticket_list;
-};
-
-struct ib_peer_memory_client *ib_get_peer_client(struct ib_ucontext *context, unsigned long addr,
- size_t size, void **peer_client_context,
- int *srcu_key);
-
-void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
- void *peer_client_context,
- int srcu_key);
-
-unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client,
- void *context);
-int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client,
- unsigned long key);
-struct core_ticket *ib_peer_search_context(struct ib_peer_memory_client *ib_peer_client,
- unsigned long key);
-#endif
-
-
diff --git a/sys/ofed/include/rdma/ib_pma.h b/sys/ofed/include/rdma/ib_pma.h
index a2300cd276cdf..2f8a65c1fca7b 100644
--- a/sys/ofed/include/rdma/ib_pma.h
+++ b/sys/ofed/include/rdma/ib_pma.h
@@ -37,30 +37,12 @@
#include <rdma/ib_mad.h>
-#define MAX_U32 0xffffffffULL
-#define MAX_U16 0xffffUL
-
-/* Counters should be saturate once they reach their maximum value */
-#define ASSIGN_32BIT_COUNTER(counter, value) do { \
- if ((value) > MAX_U32) \
- counter = cpu_to_be32(MAX_U32); \
- else \
- counter = cpu_to_be32(value); \
-} while (0)
-
-/* Counters should be saturate once they reach their maximum value */
-#define ASSIGN_16BIT_COUNTER(counter, value) do { \
- if ((value) > MAX_U16) \
- counter = cpu_to_be16(MAX_U16); \
- else \
- counter = cpu_to_be16(value); \
-} while (0)
-
/*
* PMA class portinfo capability mask bits
*/
#define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8)
#define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9)
+#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10)
#define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12)
#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001)
diff --git a/sys/ofed/include/rdma/ib_sa.h b/sys/ofed/include/rdma/ib_sa.h
index 65f1a006ee529..4c3e2924e21db 100644
--- a/sys/ofed/include/rdma/ib_sa.h
+++ b/sys/ofed/include/rdma/ib_sa.h
@@ -37,8 +37,7 @@
#include <linux/completion.h>
#include <linux/compiler.h>
-
-#include <asm/atomic.h>
+#include <linux/netdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_mad.h>
@@ -94,6 +93,21 @@ enum ib_sa_selector {
};
/*
+ * There are 4 types of join states:
+ * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
+ * The order corresponds to JoinState bits in MCMemberRecord.
+ */
+enum ib_sa_mc_join_states {
+ FULLMEMBER_JOIN,
+ NONMEMBER_JOIN,
+ SENDONLY_NONMEBER_JOIN,
+ SENDONLY_FULLMEMBER_JOIN,
+ NUM_JOIN_MEMBERSHIP_TYPES,
+};
+
+#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12)
+
+/*
* Structures for SA records are named "struct ib_sa_xxx_rec." No
* attempt is made to pack structures to match the physical layout of
* SA records in SA MADs; all packing and unpacking is handled by the
@@ -154,11 +168,19 @@ struct ib_sa_path_rec {
u8 packet_life_time_selector;
u8 packet_life_time;
u8 preference;
- u8 smac[ETH_ALEN];
- u8 dmac[6];
- __be16 vlan_id;
+ u8 dmac[ETH_ALEN];
+ /* ignored in IB */
+ int ifindex;
+ /* ignored in IB */
+ struct vnet *net;
+ enum ib_gid_type gid_type;
};
+static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec)
+{
+ return rec->net ? dev_get_by_index(rec->net, rec->ifindex) : NULL;
+}
+
#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0)
#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1)
#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2)
@@ -394,6 +416,8 @@ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
*/
int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
struct ib_sa_mcmember_rec *rec,
+ struct net_device *ndev,
+ enum ib_gid_type gid_type,
struct ib_ah_attr *ah_attr);
/**
@@ -405,6 +429,12 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
struct ib_ah_attr *ah_attr);
/**
+ * ib_sa_pack_path - Convert a path record from struct ib_sa_path_rec
+ * to IB MAD wire format.
+ */
+void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute);
+
+/**
* ib_sa_unpack_path - Convert a path record from MAD format to struct
* ib_sa_path_rec.
*/
@@ -412,13 +442,24 @@ void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec);
/* Support GuidInfoRecord */
int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
- struct ib_device *device, u8 port_num,
- struct ib_sa_guidinfo_rec *rec,
- ib_sa_comp_mask comp_mask, u8 method,
- int timeout_ms, gfp_t gfp_mask,
- void (*callback)(int status,
- struct ib_sa_guidinfo_rec *resp,
- void *context),
- void *context,
- struct ib_sa_query **sa_query);
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_guidinfo_rec *rec,
+ ib_sa_comp_mask comp_mask, u8 method,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_guidinfo_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+/* Support get SA ClassPortInfo */
+int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_class_port_info *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
#endif /* IB_SA_H */
diff --git a/sys/ofed/include/rdma/ib_smi.h b/sys/ofed/include/rdma/ib_smi.h
index 973c9a4efec85..15ac3d71c5eea 100644
--- a/sys/ofed/include/rdma/ib_smi.h
+++ b/sys/ofed/include/rdma/ib_smi.h
@@ -38,7 +38,6 @@
#define IB_SMI_H
#include <rdma/ib_mad.h>
-#include <asm/byteorder.h>
#define IB_SMP_DATA_SIZE 64
#define IB_SMP_MAX_PATH_HOPS 64
@@ -120,10 +119,57 @@ struct ib_port_info {
u8 link_roundtrip_latency[3];
};
+struct ib_node_info {
+ u8 base_version;
+ u8 class_version;
+ u8 node_type;
+ u8 num_ports;
+ __be64 sys_guid;
+ __be64 node_guid;
+ __be64 port_guid;
+ __be16 partition_cap;
+ __be16 device_id;
+ __be32 revision;
+ u8 local_port_num;
+ u8 vendor_id[3];
+} __packed;
+
+struct ib_vl_weight_elem {
+ u8 vl; /* IB: VL is low 4 bits, upper 4 bits reserved */
+ /* OPA: VL is low 5 bits, upper 3 bits reserved */
+ u8 weight;
+};
+
static inline u8
-ib_get_smp_direction(struct ib_smp *smp)
+ib_get_smp_direction(const struct ib_smp *smp)
{
return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
}
+/*
+ * SM Trap/Notice numbers
+ */
+#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129)
+#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130)
+#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131)
+#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144)
+#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145)
+#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256)
+#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257)
+#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258)
+
+/*
+ * Other local changes flags (trap 144).
+ */
+#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */
+#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */
+#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01
+
+/*
+ * M_Key violation flags in dr_trunc_hop (trap 256).
+ */
+#define IB_NOTICE_TRAP_DR_NOTICE 0x80
+#define IB_NOTICE_TRAP_DR_TRUNC 0x40
+
+
#endif /* IB_SMI_H */
diff --git a/sys/ofed/include/rdma/ib_umem.h b/sys/ofed/include/rdma/ib_umem.h
index 82f6cfaa5b7aa..6386d45c9cf23 100644
--- a/sys/ofed/include/rdma/ib_umem.h
+++ b/sys/ofed/include/rdma/ib_umem.h
@@ -36,58 +36,73 @@
#include <linux/list.h>
#include <linux/scatterlist.h>
#include <linux/workqueue.h>
-#include <linux/dma-attrs.h>
-#include <linux/completion.h>
-#include <rdma/ib_peer_mem.h>
struct ib_ucontext;
-struct ib_umem;
-
-typedef void (*umem_invalidate_func_t)(void *invalidation_cookie,
- struct ib_umem *umem,
- unsigned long addr, size_t size);
-
-struct invalidation_ctx {
- struct ib_umem *umem;
- umem_invalidate_func_t func;
- void *cookie;
- unsigned long context_ticket;
- int peer_callback;
- int inflight_invalidation;
- int peer_invalidated;
- struct completion comp;
-};
+struct ib_umem_odp;
struct ib_umem {
struct ib_ucontext *context;
size_t length;
- int offset;
+ unsigned long address;
int page_size;
int writable;
- int hugetlb;
struct work_struct work;
+ pid_t pid;
+ struct mm_struct *mm;
unsigned long diff;
- unsigned long start;
+ struct ib_umem_odp *odp_data;
struct sg_table sg_head;
- int nmap;
+ int nmap;
int npages;
- /* peer memory that manages this umem*/
- struct ib_peer_memory_client *ib_peer_mem;
- struct invalidation_ctx *invalidation_ctx;
- int peer_mem_srcu_key;
- /* peer memory private context */
- void *peer_mem_client_context;
};
+/* Returns the offset of the umem start relative to the first page. */
+static inline int ib_umem_offset(struct ib_umem *umem)
+{
+ return umem->address & ((unsigned long)umem->page_size - 1);
+}
+
+/* Returns the first page of an ODP umem. */
+static inline unsigned long ib_umem_start(struct ib_umem *umem)
+{
+ return umem->address - ib_umem_offset(umem);
+}
+
+/* Returns the address of the page after the last one of an ODP umem. */
+static inline unsigned long ib_umem_end(struct ib_umem *umem)
+{
+ return PAGE_ALIGN(umem->address + umem->length);
+}
+
+static inline size_t ib_umem_num_pages(struct ib_umem *umem)
+{
+ return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT;
+}
+
+#ifdef CONFIG_INFINIBAND_USER_MEM
+
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
size_t size, int access, int dmasync);
-struct ib_umem *ib_umem_get_ex(struct ib_ucontext *context, unsigned long addr,
- size_t size, int access, int dmasync,
- int invalidation_supported);
-void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
- umem_invalidate_func_t func,
- void *cookie);
void ib_umem_release(struct ib_umem *umem);
int ib_umem_page_count(struct ib_umem *umem);
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length);
+
+#else /* CONFIG_INFINIBAND_USER_MEM */
+
+#include <linux/err.h>
+
+static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
+ unsigned long addr, size_t size,
+ int access, int dmasync) {
+ return ERR_PTR(-EINVAL);
+}
+static inline void ib_umem_release(struct ib_umem *umem) { }
+static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; }
+static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length) {
+ return -EINVAL;
+}
+#endif /* CONFIG_INFINIBAND_USER_MEM */
#endif /* IB_UMEM_H */
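As a rough driver-side sketch of the interface declared above (ucontext, start, length and the translation-table update are assumptions; IB_ACCESS_LOCAL_WRITE comes from ib_verbs.h):

	struct ib_umem *umem;
	int npages;

	umem = ib_umem_get(ucontext, start, length, IB_ACCESS_LOCAL_WRITE, 0);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	npages = ib_umem_num_pages(umem);	/* pages spanned, including the page offset */
	/* ... program the HCA translation entries from umem->sg_head ... */

	ib_umem_release(umem);			/* on teardown */
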
diff --git a/sys/ofed/include/rdma/ib_umem_odp.h b/sys/ofed/include/rdma/ib_umem_odp.h
new file mode 100644
index 0000000000000..d2f0e6b1a8a51
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_umem_odp.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_UMEM_ODP_H
+#define IB_UMEM_ODP_H
+
+#include <linux/rbtree.h>
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+
+struct umem_odp_node {
+ u64 __subtree_last;
+ struct rb_node rb;
+};
+
+struct ib_umem_odp {
+ /*
+ * An array of the pages included in the on-demand paging umem.
+ * Indices of pages that are currently not mapped into the device will
+ * contain NULL.
+ */
+ struct page **page_list;
+ /*
+ * An array of the same size as page_list, with DMA addresses mapped
+ * for the pages in page_list. The lower two bits designate
+ * access permissions. See ODP_READ_ALLOWED_BIT and
+ * ODP_WRITE_ALLOWED_BIT.
+ */
+ dma_addr_t *dma_list;
+ /*
+ * The umem_mutex protects the page_list and dma_list fields of an ODP
+ * umem, allowing only a single thread to map/unmap pages. The mutex
+ * also protects access to the mmu notifier counters.
+ */
+ struct mutex umem_mutex;
+ void *private; /* for the HW driver to use. */
+
+ /* When false, use the notifier counter in the ucontext struct. */
+ bool mn_counters_active;
+ int notifiers_seq;
+ int notifiers_count;
+
+ /* A linked list of umems that don't have private mmu notifier
+ * counters yet. */
+ struct list_head no_private_counters;
+ struct ib_umem *umem;
+
+ /* Tree tracking */
+ struct umem_odp_node interval_tree;
+
+ struct completion notifier_completion;
+ int dying;
+};
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem);
+
+void ib_umem_odp_release(struct ib_umem *umem);
+
+/*
+ * The lower 2 bits of the DMA address signal the R/W permissions for
+ * the entry. To upgrade the permissions, provide the appropriate
+ * bitmask to the map_dma_pages function.
+ *
+ * Be aware that upgrading a mapped address might result in change of
+ * the DMA address for the page.
+ */
+#define ODP_READ_ALLOWED_BIT (1<<0ULL)
+#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
+
+#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
+
+int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt,
+ u64 access_mask, unsigned long current_seq);
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset,
+ u64 bound);
+
+void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root);
+void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root);
+typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
+ void *cookie);
+/*
+ * Call the callback on each ib_umem in the range. Returns the logical or of
+ * the return values of the functions called.
+ */
+int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end,
+ umem_call_back cb, void *cookie);
+
+struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root,
+ u64 start, u64 last);
+struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
+ u64 start, u64 last);
+
+static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
+ unsigned long mmu_seq)
+{
+ /*
+ * This code is strongly based on the KVM code from
+ * mmu_notifier_retry. Should be called with
+ * the relevant locks taken (item->odp_data->umem_mutex
+ * and the ucontext umem_mutex semaphore locked for read).
+ */
+
+ /* Do not allow page faults while the new ib_umem hasn't seen a state
+ * with zero notifiers yet, and doesn't have its own valid set of
+ * private counters. */
+ if (!item->odp_data->mn_counters_active)
+ return 1;
+
+ if (unlikely(item->odp_data->notifiers_count))
+ return 1;
+ if (item->odp_data->notifiers_seq != mmu_seq)
+ return 1;
+ return 0;
+}
+
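A hedged sketch of the page-fault flow the comment above describes, loosely modeled on how ODP-capable drivers use these helpers; umem, io_virt, bcnt and the device page-table update are assumptions.

	unsigned long current_seq = umem->odp_data->notifiers_seq;
	int npages;

	npages = ib_umem_odp_map_dma_pages(umem, io_virt, bcnt,
					   ODP_READ_ALLOWED_BIT |
					   ODP_WRITE_ALLOWED_BIT,
					   current_seq);
	if (npages < 0)
		return npages;

	mutex_lock(&umem->odp_data->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(umem, current_seq)) {
		/* no invalidation raced with the fault; update the device page tables */
	}
	mutex_unlock(&umem->odp_data->umem_mutex);
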
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
+static inline int ib_umem_odp_get(struct ib_ucontext *context,
+ struct ib_umem *umem)
+{
+ return -EINVAL;
+}
+
+static inline void ib_umem_odp_release(struct ib_umem *umem) {}
+
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
+#endif /* IB_UMEM_ODP_H */
diff --git a/sys/ofed/include/rdma/ib_user_mad.h b/sys/ofed/include/rdma/ib_user_mad.h
index 3595ae26c9080..09f809f323eaa 100644
--- a/sys/ofed/include/rdma/ib_user_mad.h
+++ b/sys/ofed/include/rdma/ib_user_mad.h
@@ -191,12 +191,55 @@ struct ib_user_mad_reg_req {
__u8 rmpp_version;
};
+/**
+ * ib_user_mad_reg_req2 - MAD registration request
+ *
+ * @id - Set by the _kernel_; used by userspace to identify the
+ * registered agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @mgmt_class - Indicates which management class of MADs should be
+ * received by the caller. This field is only required if
+ * the user wishes to receive unsolicited MADs, otherwise
+ * it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ * management class to receive.
+ * @res - Ignored.
+ * @flags - additional registration flags; Must be in the set of
+ * flags defined in IB_USER_MAD_REG_FLAGS_CAP
+ * @method_mask - The caller wishes to receive unsolicited MADs for the
+ * methods whose bit(s) is(are) set.
+ * @oui - Indicates IEEE OUI to use when mgmt_class is a vendor
+ * class in the range from 0x30 to 0x4f. Otherwise not
+ * used.
+ * @rmpp_version - If set, indicates the RMPP version to use.
+ */
+enum {
+ IB_USER_MAD_USER_RMPP = (1 << 0),
+};
+#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP)
+struct ib_user_mad_reg_req2 {
+ __u32 id;
+ __u32 qpn;
+ __u8 mgmt_class;
+ __u8 mgmt_class_version;
+ __u16 res;
+ __u32 flags;
+ __u64 method_mask[2];
+ __u32 oui;
+ __u8 rmpp_version;
+ __u8 reserved[3];
+};
+
#define IB_IOCTL_MAGIC 0x1b
-#define IB_USER_MAD_REGISTER_AGENT _IO(IB_IOCTL_MAGIC, 1)
+#define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \
+ struct ib_user_mad_reg_req)
-#define IB_USER_MAD_UNREGISTER_AGENT _IO(IB_IOCTL_MAGIC, 2)
+#define IB_USER_MAD_UNREGISTER_AGENT _IOW(IB_IOCTL_MAGIC, 2, __u32)
#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3)
+#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \
+ struct ib_user_mad_reg_req2)
+
#endif /* IB_USER_MAD_H */
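For illustration only, a userspace sketch of registering through the new REGISTER_AGENT2 ioctl; the device node path, the header include path and the management class value are assumptions, and error handling is trimmed.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <rdma/ib_user_mad.h>

	int main(void)
	{
		struct ib_user_mad_reg_req2 req = {
			.qpn                = 1,	/* GSI */
			.mgmt_class         = 0x03,	/* e.g. SA class; illustrative */
			.mgmt_class_version = 2,
			.flags              = IB_USER_MAD_USER_RMPP,
		};
		int fd = open("/dev/umad0", O_RDWR);	/* device node path is an assumption */

		if (fd < 0 || ioctl(fd, IB_USER_MAD_REGISTER_AGENT2, &req) < 0)
			return 1;
		printf("registered agent id %u\n", req.id);
		return 0;
	}
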
diff --git a/sys/ofed/include/rdma/ib_user_verbs.h b/sys/ofed/include/rdma/ib_user_verbs.h
index a07de88a5d0d1..25225ebbc7d52 100644
--- a/sys/ofed/include/rdma/ib_user_verbs.h
+++ b/sys/ofed/include/rdma/ib_user_verbs.h
@@ -45,12 +45,6 @@
#define IB_USER_VERBS_ABI_VERSION 6
#define IB_USER_VERBS_CMD_THRESHOLD 50
-/*
- * To support 6 legacy commands using the old extension style
- */
-#define IB_USER_VERBS_LEGACY_CMD_FIRST 52
-#define IB_USER_VERBS_LEGACY_EX_CMD_LAST 56
-
enum {
IB_USER_VERBS_CMD_GET_CONTEXT,
IB_USER_VERBS_CMD_QUERY_DEVICE,
@@ -92,15 +86,22 @@ enum {
IB_USER_VERBS_CMD_OPEN_XRCD,
IB_USER_VERBS_CMD_CLOSE_XRCD,
IB_USER_VERBS_CMD_CREATE_XSRQ,
- IB_USER_VERBS_CMD_OPEN_QP
+ IB_USER_VERBS_CMD_OPEN_QP,
};
enum {
+ IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE,
+ IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ,
+ IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP,
IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
- IB_USER_VERBS_EX_CMD_DESTROY_FLOW
+ IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
+ IB_USER_VERBS_EX_CMD_CREATE_WQ,
+ IB_USER_VERBS_EX_CMD_MODIFY_WQ,
+ IB_USER_VERBS_EX_CMD_DESTROY_WQ,
+ IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL,
+ IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL
};
-
/*
* Make sure that all structs defined in this file remain laid out so
* that they pack the same way on 32-bit and 64-bit architectures (to
@@ -130,14 +131,6 @@ struct ib_uverbs_comp_event_desc {
* the rest of the command struct based on these value.
*/
-#define IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, field) \
- ((ibv_type *)((void *)(ex_ptr) + offsetof(ex_type, \
- field) + sizeof((ex_ptr)->field)))
-
-#define IBV_RESP_TO_VERBS_RESP_EX(ex_ptr, ex_type, ibv_type) \
- IBV_RESP_TO_VERBS_RESP_EX_RAW(ex_ptr, ex_type, ibv_type, comp_mask)
-
-
#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff
#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u
#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24
@@ -172,11 +165,6 @@ struct ib_uverbs_query_device {
__u64 driver_data[0];
};
-struct ib_uverbs_query_device_ex {
- __u64 comp_mask;
- __u64 driver_data[0];
-};
-
struct ib_uverbs_query_device_resp {
__u64 fw_ver;
__be64 node_guid;
@@ -221,6 +209,45 @@ struct ib_uverbs_query_device_resp {
__u8 reserved[4];
};
+struct ib_uverbs_ex_query_device {
+ __u32 comp_mask;
+ __u32 reserved;
+};
+
+struct ib_uverbs_odp_caps {
+ __u64 general_caps;
+ struct {
+ __u32 rc_odp_caps;
+ __u32 uc_odp_caps;
+ __u32 ud_odp_caps;
+ } per_transport_caps;
+ __u32 reserved;
+};
+
+struct ib_uverbs_rss_caps {
+ /* Corresponding bit will be set if qp type from
+ * 'enum ib_qp_type' is supported, e.g.
+ * supported_qpts |= 1 << IB_QPT_UD
+ */
+ __u32 supported_qpts;
+ __u32 max_rwq_indirection_tables;
+ __u32 max_rwq_indirection_table_size;
+ __u32 reserved;
+};
+
+struct ib_uverbs_ex_query_device_resp {
+ struct ib_uverbs_query_device_resp base;
+ __u32 comp_mask;
+ __u32 response_length;
+ struct ib_uverbs_odp_caps odp_caps;
+ __u64 timestamp_mask;
+ __u64 hca_core_clock; /* in KHZ */
+ __u64 device_cap_flags_ex;
+ struct ib_uverbs_rss_caps rss_caps;
+ __u32 max_wq_type_rq;
+ __u32 reserved;
+};
+
struct ib_uverbs_query_port {
__u64 response;
__u8 port_num;
@@ -296,6 +323,22 @@ struct ib_uverbs_reg_mr_resp {
__u32 rkey;
};
+struct ib_uverbs_rereg_mr {
+ __u64 response;
+ __u32 mr_handle;
+ __u32 flags;
+ __u64 start;
+ __u64 length;
+ __u64 hca_va;
+ __u32 pd_handle;
+ __u32 access_flags;
+};
+
+struct ib_uverbs_rereg_mr_resp {
+ __u32 lkey;
+ __u32 rkey;
+};
+
struct ib_uverbs_dereg_mr {
__u32 mr_handle;
};
@@ -334,33 +377,25 @@ struct ib_uverbs_create_cq {
__u64 driver_data[0];
};
-struct ib_uverbs_create_cq_resp {
- __u32 cq_handle;
- __u32 cqe;
-};
-
-enum ib_uverbs_create_cq_ex_comp_mask {
- IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS = (u64)1 << 0,
-};
-
-struct ib_uverbs_create_cq_ex {
- __u64 comp_mask;
+struct ib_uverbs_ex_create_cq {
__u64 user_handle;
__u32 cqe;
__u32 comp_vector;
__s32 comp_channel;
+ __u32 comp_mask;
+ __u32 flags;
__u32 reserved;
- __u64 create_flags;
- __u64 driver_data[0];
};
-struct ib_uverbs_modify_cq_ex {
- __u64 comp_mask;
+struct ib_uverbs_create_cq_resp {
__u32 cq_handle;
- __u32 attr_mask;
- __u16 cq_count;
- __u16 cq_period;
- __u32 cq_cap_flags;
+ __u32 cqe;
+};
+
+struct ib_uverbs_ex_create_cq_resp {
+ struct ib_uverbs_create_cq_resp base;
+ __u32 comp_mask;
+ __u32 response_length;
};
struct ib_uverbs_resize_cq {
@@ -502,6 +537,35 @@ struct ib_uverbs_create_qp {
__u64 driver_data[0];
};
+enum ib_uverbs_create_qp_mask {
+ IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0,
+};
+
+enum {
+ IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE,
+};
+
+struct ib_uverbs_ex_create_qp {
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 send_cq_handle;
+ __u32 recv_cq_handle;
+ __u32 srq_handle;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u8 sq_sig_all;
+ __u8 qp_type;
+ __u8 is_srq;
+ __u8 reserved;
+ __u32 comp_mask;
+ __u32 create_flags;
+ __u32 rwq_ind_tbl_handle;
+ __u32 reserved1;
+};
+
struct ib_uverbs_open_qp {
__u64 response;
__u64 user_handle;
@@ -524,6 +588,12 @@ struct ib_uverbs_create_qp_resp {
__u32 reserved;
};
+struct ib_uverbs_ex_create_qp_resp {
+ struct ib_uverbs_create_qp_resp base;
+ __u32 comp_mask;
+ __u32 response_length;
+};
+
/*
* This struct needs to remain a multiple of 8 bytes to keep the
* alignment of the modify QP parameters.
@@ -614,42 +684,6 @@ struct ib_uverbs_modify_qp {
__u64 driver_data[0];
};
-enum ib_uverbs_modify_qp_ex_comp_mask {
- IB_UVERBS_QP_ATTR_DCT_KEY = 1ULL << 0,
-};
-
-struct ib_uverbs_modify_qp_ex {
- __u32 comp_mask;
- struct ib_uverbs_qp_dest dest;
- struct ib_uverbs_qp_dest alt_dest;
- __u32 qp_handle;
- __u32 attr_mask;
- __u32 qkey;
- __u32 rq_psn;
- __u32 sq_psn;
- __u32 dest_qp_num;
- __u32 qp_access_flags;
- __u16 pkey_index;
- __u16 alt_pkey_index;
- __u8 qp_state;
- __u8 cur_qp_state;
- __u8 path_mtu;
- __u8 path_mig_state;
- __u8 en_sqd_async_notify;
- __u8 max_rd_atomic;
- __u8 max_dest_rd_atomic;
- __u8 min_rnr_timer;
- __u8 port_num;
- __u8 timeout;
- __u8 retry_cnt;
- __u8 rnr_retry;
- __u8 alt_port_num;
- __u8 alt_timeout;
- __u8 reserved[2];
- __u64 dct_key;
- __u64 driver_data[0];
-};
-
struct ib_uverbs_modify_qp_resp {
};
@@ -784,18 +818,18 @@ struct ib_uverbs_detach_mcast {
};
struct ib_uverbs_flow_spec_hdr {
- __u32 type;
+ __u32 type;
__u16 size;
__u16 reserved;
/* followed by flow_spec */
__u64 flow_spec_data[0];
};
-struct ib_kern_eth_filter {
- __u8 dst_mac[6];
- __u8 src_mac[6];
- __be16 ether_type;
- __be16 vlan_tag;
+struct ib_uverbs_flow_eth_filter {
+ __u8 dst_mac[6];
+ __u8 src_mac[6];
+ __be16 ether_type;
+ __be16 vlan_tag;
};
struct ib_uverbs_flow_spec_eth {
@@ -807,16 +841,20 @@ struct ib_uverbs_flow_spec_eth {
__u16 reserved;
};
};
- struct ib_kern_eth_filter val;
- struct ib_kern_eth_filter mask;
+ struct ib_uverbs_flow_eth_filter val;
+ struct ib_uverbs_flow_eth_filter mask;
};
-struct ib_kern_ib_filter {
- __be32 l3_type_qpn;
- __u8 dst_gid[16];
+struct ib_uverbs_flow_ipv4_filter {
+ __be32 src_ip;
+ __be32 dst_ip;
+ __u8 proto;
+ __u8 tos;
+ __u8 ttl;
+ __u8 flags;
};
-struct ib_uverbs_flow_spec_ib {
+struct ib_uverbs_flow_spec_ipv4 {
union {
struct ib_uverbs_flow_spec_hdr hdr;
struct {
@@ -825,16 +863,16 @@ struct ib_uverbs_flow_spec_ib {
__u16 reserved;
};
};
- struct ib_kern_ib_filter val;
- struct ib_kern_ib_filter mask;
+ struct ib_uverbs_flow_ipv4_filter val;
+ struct ib_uverbs_flow_ipv4_filter mask;
};
-struct ib_kern_ipv4_filter {
- __be32 src_ip;
- __be32 dst_ip;
+struct ib_uverbs_flow_tcp_udp_filter {
+ __be16 dst_port;
+ __be16 src_port;
};
-struct ib_uverbs_flow_spec_ipv4 {
+struct ib_uverbs_flow_spec_tcp_udp {
union {
struct ib_uverbs_flow_spec_hdr hdr;
struct {
@@ -843,16 +881,21 @@ struct ib_uverbs_flow_spec_ipv4 {
__u16 reserved;
};
};
- struct ib_kern_ipv4_filter val;
- struct ib_kern_ipv4_filter mask;
+ struct ib_uverbs_flow_tcp_udp_filter val;
+ struct ib_uverbs_flow_tcp_udp_filter mask;
};
-struct ib_kern_tcp_udp_filter {
- __be16 dst_port;
- __be16 src_port;
+struct ib_uverbs_flow_ipv6_filter {
+ __u8 src_ip[16];
+ __u8 dst_ip[16];
+ __be32 flow_label;
+ __u8 next_hdr;
+ __u8 traffic_class;
+ __u8 hop_limit;
+ __u8 reserved;
};
-struct ib_uverbs_flow_spec_tcp_udp {
+struct ib_uverbs_flow_spec_ipv6 {
union {
struct ib_uverbs_flow_spec_hdr hdr;
struct {
@@ -861,18 +904,18 @@ struct ib_uverbs_flow_spec_tcp_udp {
__u16 reserved;
};
};
- struct ib_kern_tcp_udp_filter val;
- struct ib_kern_tcp_udp_filter mask;
+ struct ib_uverbs_flow_ipv6_filter val;
+ struct ib_uverbs_flow_ipv6_filter mask;
};
struct ib_uverbs_flow_attr {
- __u32 type;
- __u16 size;
- __u16 priority;
- __u8 num_of_specs;
- __u8 reserved[2];
- __u8 port;
- __u32 flags;
+ __u32 type;
+ __u16 size;
+ __u16 priority;
+ __u8 num_of_specs;
+ __u8 reserved[2];
+ __u8 port;
+ __u32 flags;
/* Following are the optional layers according to user request
* struct ib_flow_spec_xxx
* struct ib_flow_spec_yyy
@@ -959,22 +1002,66 @@ struct ib_uverbs_destroy_srq_resp {
__u32 events_reported;
};
+struct ib_uverbs_ex_create_wq {
+ __u32 comp_mask;
+ __u32 wq_type;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 cq_handle;
+ __u32 max_wr;
+ __u32 max_sge;
+};
-/*
- * Legacy extended verbs related structures
- */
-struct ib_uverbs_ex_cmd_hdr_legacy {
- __u32 command;
- __u16 in_words;
- __u16 out_words;
- __u16 provider_in_words;
- __u16 provider_out_words;
- __u32 cmd_hdr_reserved;
+struct ib_uverbs_ex_create_wq_resp {
+ __u32 comp_mask;
+ __u32 response_length;
+ __u32 wq_handle;
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 wqn;
};
-struct ib_uverbs_ex_cmd_resp1_legacy {
- __u64 comp_mask;
- __u64 response;
+struct ib_uverbs_ex_destroy_wq {
+ __u32 comp_mask;
+ __u32 wq_handle;
+};
+
+struct ib_uverbs_ex_destroy_wq_resp {
+ __u32 comp_mask;
+ __u32 response_length;
+ __u32 events_reported;
+ __u32 reserved;
+};
+
+struct ib_uverbs_ex_modify_wq {
+ __u32 attr_mask;
+ __u32 wq_handle;
+ __u32 wq_state;
+ __u32 curr_wq_state;
+};
+
+/* Limit chosen to bound kernel memory allocation, not the maximum expected table size */
+#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d
+struct ib_uverbs_ex_create_rwq_ind_table {
+ __u32 comp_mask;
+ __u32 log_ind_tbl_size;
+ /* Following are the wq handles according to log_ind_tbl_size
+ * wq_handle1
+ * wq_handle2
+ */
+ __u32 wq_handles[0];
+};
+
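For reference, userspace sizes this variable-length command from log_ind_tbl_size; a minimal sketch (any additional command framing beyond the struct above is an assumption):

	unsigned int nwq = 1u << log_ind_tbl_size;
	size_t cmd_len = sizeof(struct ib_uverbs_ex_create_rwq_ind_table) +
			 nwq * sizeof(__u32);	/* one WQ handle per table entry */
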
+struct ib_uverbs_ex_create_rwq_ind_table_resp {
+ __u32 comp_mask;
+ __u32 response_length;
+ __u32 ind_tbl_handle;
+ __u32 ind_tbl_num;
+};
+
+struct ib_uverbs_ex_destroy_rwq_ind_table {
+ __u32 comp_mask;
+ __u32 ind_tbl_handle;
};
#endif /* IB_USER_VERBS_H */
diff --git a/sys/ofed/include/rdma/ib_user_verbs_exp.h b/sys/ofed/include/rdma/ib_user_verbs_exp.h
deleted file mode 100644
index 557d4ba7ceceb..0000000000000
--- a/sys/ofed/include/rdma/ib_user_verbs_exp.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications. All rights reserved.
- * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
- * Copyright (c) 2005 PathScale, Inc. All rights reserved.
- * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IB_USER_VERBS_EXP_H
-#define IB_USER_VERBS_EXP_H
-
-#include <rdma/ib_user_verbs.h>
-
-enum {
- IB_USER_VERBS_EXP_CMD_FIRST = 64
-};
-
-enum {
- IB_USER_VERBS_EXP_CMD_CREATE_QP,
- IB_USER_VERBS_EXP_CMD_MODIFY_CQ,
- IB_USER_VERBS_EXP_CMD_MODIFY_QP,
- IB_USER_VERBS_EXP_CMD_CREATE_CQ,
- IB_USER_VERBS_EXP_CMD_QUERY_DEVICE,
- IB_USER_VERBS_EXP_CMD_CREATE_DCT,
- IB_USER_VERBS_EXP_CMD_DESTROY_DCT,
- IB_USER_VERBS_EXP_CMD_QUERY_DCT,
-};
-
-/*
- * Make sure that all structs defined in this file remain laid out so
- * that they pack the same way on 32-bit and 64-bit architectures (to
- * avoid incompatibility between 32-bit userspace and 64-bit kernels).
- * Specifically:
- * - Do not use pointer types -- pass pointers in __u64 instead.
- * - Make sure that any structure larger than 4 bytes is padded to a
- * multiple of 8 bytes. Otherwise the structure size will be
- * different between 32-bit and 64-bit architectures.
- */
-
-enum ib_uverbs_exp_create_qp_comp_mask {
- IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS = (1ULL << 0),
- IB_UVERBS_EXP_CREATE_QP_INL_RECV = (1ULL << 1),
- IB_UVERBS_EXP_CREATE_QP_QPG = (1ULL << 2)
-};
-
-struct ib_uverbs_qpg_init_attrib {
- __u32 tss_child_count;
- __u32 rss_child_count;
-};
-
-struct ib_uverbs_qpg {
- __u32 qpg_type;
- union {
- struct {
- __u32 parent_handle;
- __u32 reserved;
- };
- struct ib_uverbs_qpg_init_attrib parent_attrib;
- };
- __u32 reserved2;
-};
-
-struct ib_uverbs_exp_create_qp {
- __u64 comp_mask;
- __u64 user_handle;
- __u32 pd_handle;
- __u32 send_cq_handle;
- __u32 recv_cq_handle;
- __u32 srq_handle;
- __u32 max_send_wr;
- __u32 max_recv_wr;
- __u32 max_send_sge;
- __u32 max_recv_sge;
- __u32 max_inline_data;
- __u8 sq_sig_all;
- __u8 qp_type;
- __u8 is_srq;
- __u8 reserved;
- __u64 qp_cap_flags;
- __u32 max_inl_recv;
- __u32 reserved1;
- struct ib_uverbs_qpg qpg;
- __u64 driver_data[0];
-};
-
-enum ib_uverbs_exp_create_qp_resp_comp_mask {
- IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV = (1ULL << 0),
-};
-
-struct ib_uverbs_exp_create_qp_resp {
- __u64 comp_mask;
- __u32 qp_handle;
- __u32 qpn;
- __u32 max_send_wr;
- __u32 max_recv_wr;
- __u32 max_send_sge;
- __u32 max_recv_sge;
- __u32 max_inline_data;
- __u32 max_inl_recv;
-};
-
-struct ib_uverbs_create_dct {
- __u64 comp_mask;
- __u64 user_handle;
- __u32 pd_handle;
- __u32 cq_handle;
- __u32 srq_handle;
- __u32 access_flags;
- __u32 flow_label;
- __u64 dc_key;
- __u8 min_rnr_timer;
- __u8 tclass;
- __u8 port;
- __u8 pkey_index;
- __u8 gid_index;
- __u8 hop_limit;
- __u8 mtu;
- __u8 rsvd;
- __u32 create_flags;
- __u64 driver_data[0];
-};
-
-struct ib_uverbs_create_dct_resp {
- __u32 dct_handle;
- __u32 dctn;
-};
-
-struct ib_uverbs_destroy_dct {
- __u64 comp_mask;
- __u64 user_handle;
-};
-
-struct ib_uverbs_destroy_dct_resp {
- __u64 reserved;
-};
-
-struct ib_uverbs_query_dct {
- __u64 comp_mask;
- __u64 dct_handle;
- __u64 driver_data[0];
-};
-
-struct ib_uverbs_query_dct_resp {
- __u64 dc_key;
- __u32 access_flags;
- __u32 flow_label;
- __u32 key_violations;
- __u8 port;
- __u8 min_rnr_timer;
- __u8 tclass;
- __u8 mtu;
- __u8 pkey_index;
- __u8 gid_index;
- __u8 hop_limit;
- __u8 state;
- __u32 rsvd;
- __u64 driver_data[0];
-};
-
-struct ib_uverbs_exp_query_device {
- __u64 comp_mask;
- __u64 driver_data[0];
-};
-
-struct ib_uverbs_exp_query_device_resp {
- __u64 comp_mask;
- struct ib_uverbs_query_device_resp base;
- __u64 timestamp_mask;
- __u64 hca_core_clock;
- __u64 device_cap_flags2;
- __u32 dc_rd_req;
- __u32 dc_rd_res;
- __u32 inline_recv_sz;
- __u32 max_rss_tbl_sz;
-};
-
-#endif /* IB_USER_VERBS_EXP_H */
diff --git a/sys/ofed/include/rdma/ib_verbs.h b/sys/ofed/include/rdma/ib_verbs.h
index 21fe2ffdb3bb2..c96757d27af68 100644
--- a/sys/ofed/include/rdma/ib_verbs.h
+++ b/sys/ofed/include/rdma/ib_verbs.h
@@ -48,12 +48,24 @@
#include <linux/rwsem.h>
#include <linux/scatterlist.h>
#include <linux/workqueue.h>
+#include <linux/socket.h>
#include <linux/if_ether.h>
-#include <linux/mutex.h>
-
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+#include <linux/netdevice.h>
+#include <netinet/ip.h>
+
+#include <asm/atomic.h>
#include <asm/uaccess.h>
+struct ifla_vf_info;
+struct ifla_vf_stats;
+
extern struct workqueue_struct *ib_wq;
+extern struct workqueue_struct *ib_comp_wq;
union ib_gid {
u8 raw[16];
@@ -63,50 +75,116 @@ union ib_gid {
} global;
};
+extern union ib_gid zgid;
+
+enum ib_gid_type {
+ /* If link layer is Ethernet, this is RoCE V1 */
+ IB_GID_TYPE_IB = 0,
+ IB_GID_TYPE_ROCE = 0,
+ IB_GID_TYPE_ROCE_UDP_ENCAP = 1,
+ IB_GID_TYPE_SIZE
+};
+
+#define ROCE_V2_UDP_DPORT 4791
+struct ib_gid_attr {
+ enum ib_gid_type gid_type;
+ struct net_device *ndev;
+};
+
enum rdma_node_type {
/* IB values map to NodeInfo:NodeType. */
RDMA_NODE_IB_CA = 1,
RDMA_NODE_IB_SWITCH,
RDMA_NODE_IB_ROUTER,
RDMA_NODE_RNIC,
- RDMA_NODE_MIC
+ RDMA_NODE_USNIC,
+ RDMA_NODE_USNIC_UDP,
+};
+
+enum {
+ /* set the local administered indication */
+ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2,
};
enum rdma_transport_type {
RDMA_TRANSPORT_IB,
RDMA_TRANSPORT_IWARP,
- RDMA_TRANSPORT_SCIF
+ RDMA_TRANSPORT_USNIC,
+ RDMA_TRANSPORT_USNIC_UDP
+};
+
+enum rdma_protocol_type {
+ RDMA_PROTOCOL_IB,
+ RDMA_PROTOCOL_IBOE,
+ RDMA_PROTOCOL_IWARP,
+ RDMA_PROTOCOL_USNIC_UDP
};
-enum rdma_transport_type
-rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__;
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type);
+
+enum rdma_network_type {
+ RDMA_NETWORK_IB,
+ RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB,
+ RDMA_NETWORK_IPV4,
+ RDMA_NETWORK_IPV6
+};
+
+static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
+{
+ if (network_type == RDMA_NETWORK_IPV4 ||
+ network_type == RDMA_NETWORK_IPV6)
+ return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+ /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */
+ return IB_GID_TYPE_IB;
+}
+
+static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type,
+ union ib_gid *gid)
+{
+ if (gid_type == IB_GID_TYPE_IB)
+ return RDMA_NETWORK_IB;
+
+ if (ipv6_addr_v4mapped((struct in6_addr *)gid))
+ return RDMA_NETWORK_IPV4;
+ else
+ return RDMA_NETWORK_IPV6;
+}
enum rdma_link_layer {
IB_LINK_LAYER_UNSPECIFIED,
IB_LINK_LAYER_INFINIBAND,
IB_LINK_LAYER_ETHERNET,
- IB_LINK_LAYER_SCIF
};
enum ib_device_cap_flags {
- IB_DEVICE_RESIZE_MAX_WR = 1,
- IB_DEVICE_BAD_PKEY_CNTR = (1<<1),
- IB_DEVICE_BAD_QKEY_CNTR = (1<<2),
- IB_DEVICE_RAW_MULTI = (1<<3),
- IB_DEVICE_AUTO_PATH_MIG = (1<<4),
- IB_DEVICE_CHANGE_PHY_PORT = (1<<5),
- IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6),
- IB_DEVICE_CURR_QP_STATE_MOD = (1<<7),
- IB_DEVICE_SHUTDOWN_PORT = (1<<8),
- IB_DEVICE_INIT_TYPE = (1<<9),
- IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10),
- IB_DEVICE_SYS_IMAGE_GUID = (1<<11),
- IB_DEVICE_RC_RNR_NAK_GEN = (1<<12),
- IB_DEVICE_SRQ_RESIZE = (1<<13),
- IB_DEVICE_N_NOTIFY_CQ = (1<<14),
- IB_DEVICE_LOCAL_DMA_LKEY = (1<<15),
- IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */
- IB_DEVICE_MEM_WINDOW = (1<<17),
+ IB_DEVICE_RESIZE_MAX_WR = (1 << 0),
+ IB_DEVICE_BAD_PKEY_CNTR = (1 << 1),
+ IB_DEVICE_BAD_QKEY_CNTR = (1 << 2),
+ IB_DEVICE_RAW_MULTI = (1 << 3),
+ IB_DEVICE_AUTO_PATH_MIG = (1 << 4),
+ IB_DEVICE_CHANGE_PHY_PORT = (1 << 5),
+ IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6),
+ IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7),
+ IB_DEVICE_SHUTDOWN_PORT = (1 << 8),
+ IB_DEVICE_INIT_TYPE = (1 << 9),
+ IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10),
+ IB_DEVICE_SYS_IMAGE_GUID = (1 << 11),
+ IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12),
+ IB_DEVICE_SRQ_RESIZE = (1 << 13),
+ IB_DEVICE_N_NOTIFY_CQ = (1 << 14),
+
+ /*
+ * This device supports a per-device lkey or stag that can be
+ * used without performing a memory registration for the local
+ * memory. Note that ULPs should never check this flag, but
+ * instead use the local_dma_lkey flag in the ib_pd structure,
+ * which will always contain a usable lkey.
+ */
+ IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15),
+ IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16),
+ IB_DEVICE_MEM_WINDOW = (1 << 17),
/*
* Devices should set IB_DEVICE_UD_IP_SUM if they support
* insertion of UDP and TCP checksum on outgoing UD IPoIB
@@ -114,27 +192,38 @@ enum ib_device_cap_flags {
* incoming messages. Setting this flag implies that the
* IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
*/
- IB_DEVICE_UD_IP_CSUM = (1<<18),
- IB_DEVICE_UD_TSO = (1<<19),
- IB_DEVICE_XRC = (1<<20),
- IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21),
- IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
- IB_DEVICE_MR_ALLOCATE = (1<<23),
- IB_DEVICE_SHARED_MR = (1<<24),
- IB_DEVICE_QPG = (1<<25),
- IB_DEVICE_UD_RSS = (1<<26),
- IB_DEVICE_UD_TSS = (1<<27),
- IB_DEVICE_CROSS_CHANNEL = (1<<28),
- IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29),
+ IB_DEVICE_UD_IP_CSUM = (1 << 18),
+ IB_DEVICE_UD_TSO = (1 << 19),
+ IB_DEVICE_XRC = (1 << 20),
+
/*
- * Devices can set either IB_DEVICE_MEM_WINDOW_TYPE_2A or
- * IB_DEVICE_MEM_WINDOW_TYPE_2B if it supports type 2A or type 2B
- * memory windows. It can set neither to indicate it doesn't support
- * type 2 windows at all.
+ * This device supports the IB "base memory management extension",
+ * which includes support for fast registrations (IB_WR_REG_MR,
+ * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should
+ * also be set by any iWarp device which must support FRs to comply
+ * to the iWarp verbs spec. iWarp devices also support the
+ * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the
+ * stag.
*/
- IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<30),
- IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<31),
- IB_DEVICE_SIGNATURE_HANDOVER = (1LL<<32)
+ IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21),
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22),
+ IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23),
+ IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24),
+ IB_DEVICE_RC_IP_CSUM = (1 << 25),
+ IB_DEVICE_RAW_IP_CSUM = (1 << 26),
+ /*
+ * Devices should set IB_DEVICE_CROSS_CHANNEL if they
+ * support execution of WQEs that involve synchronization
+ * of I/O operations with single completion queue managed
+ * by hardware.
+ */
+ IB_DEVICE_CROSS_CHANNEL = (1 << 27),
+ IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29),
+ IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30),
+ IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31),
+ IB_DEVICE_SG_GAPS_REG = (1ULL << 32),
+ IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33),
+ IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34),
};
enum ib_signature_prot_cap {
@@ -154,10 +243,46 @@ enum ib_atomic_cap {
IB_ATOMIC_GLOB
};
-enum ib_cq_create_flags {
- IB_CQ_CREATE_CROSS_CHANNEL = 1 << 0,
- IB_CQ_TIMESTAMP = 1 << 1,
- IB_CQ_TIMESTAMP_TO_SYS_TIME = 1 << 2
+enum ib_odp_general_cap_bits {
+ IB_ODP_SUPPORT = 1 << 0,
+};
+
+enum ib_odp_transport_cap_bits {
+ IB_ODP_SUPPORT_SEND = 1 << 0,
+ IB_ODP_SUPPORT_RECV = 1 << 1,
+ IB_ODP_SUPPORT_WRITE = 1 << 2,
+ IB_ODP_SUPPORT_READ = 1 << 3,
+ IB_ODP_SUPPORT_ATOMIC = 1 << 4,
+};
+
+struct ib_odp_caps {
+ uint64_t general_caps;
+ struct {
+ uint32_t rc_odp_caps;
+ uint32_t uc_odp_caps;
+ uint32_t ud_odp_caps;
+ } per_transport_caps;
+};
+
+struct ib_rss_caps {
+ /* Corresponding bit will be set if qp type from
+ * 'enum ib_qp_type' is supported, e.g.
+ * supported_qpts |= 1 << IB_QPT_UD
+ */
+ u32 supported_qpts;
+ u32 max_rwq_indirection_tables;
+ u32 max_rwq_indirection_table_size;
+};
+
+enum ib_cq_creation_flags {
+ IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0,
+ IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1,
+};
+
+struct ib_cq_init_attr {
+ unsigned int cqe;
+ int comp_vector;
+ u32 flags;
};
struct ib_device_attr {
@@ -199,19 +324,15 @@ struct ib_device_attr {
int max_srq_wr;
int max_srq_sge;
unsigned int max_fast_reg_page_list_len;
- int max_rss_tbl_sz;
u16 max_pkeys;
u8 local_ca_ack_delay;
- int comp_mask;
- uint64_t timestamp_mask;
- uint64_t hca_core_clock;
- unsigned int sig_prot_cap;
- unsigned int sig_guard_cap;
-};
-
-enum ib_device_attr_comp_mask {
- IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1,
- IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2
+ int sig_prot_cap;
+ int sig_guard_cap;
+ struct ib_odp_caps odp_caps;
+ uint64_t timestamp_mask;
+ uint64_t hca_core_clock; /* in KHZ */
+ struct ib_rss_caps rss_caps;
+ u32 max_wq_type_rq;
};
enum ib_mtu {
@@ -241,7 +362,7 @@ enum ib_port_state {
IB_PORT_ARMED = 3,
IB_PORT_ACTIVE = 4,
IB_PORT_ACTIVE_DEFER = 5,
- IB_PORT_DUMMY = -1 /* force enum signed */
+ IB_PORT_DUMMY = -1, /* force enum signed */
};
enum ib_port_cap_flags {
@@ -267,7 +388,8 @@ enum ib_port_cap_flags {
IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22,
IB_PORT_BOOT_MGMT_SUP = 1 << 23,
IB_PORT_LINK_LATENCY_SUP = 1 << 24,
- IB_PORT_CLIENT_REG_SUP = 1 << 25
+ IB_PORT_CLIENT_REG_SUP = 1 << 25,
+ IB_PORT_IP_BASED_GIDS = 1 << 26,
};
enum ib_port_width {
@@ -297,58 +419,101 @@ enum ib_port_speed {
IB_SPEED_EDR = 32
};
-struct ib_protocol_stats {
- /* TBD... */
-};
-
-struct iw_protocol_stats {
- u64 ipInReceives;
- u64 ipInHdrErrors;
- u64 ipInTooBigErrors;
- u64 ipInNoRoutes;
- u64 ipInAddrErrors;
- u64 ipInUnknownProtos;
- u64 ipInTruncatedPkts;
- u64 ipInDiscards;
- u64 ipInDelivers;
- u64 ipOutForwDatagrams;
- u64 ipOutRequests;
- u64 ipOutDiscards;
- u64 ipOutNoRoutes;
- u64 ipReasmTimeout;
- u64 ipReasmReqds;
- u64 ipReasmOKs;
- u64 ipReasmFails;
- u64 ipFragOKs;
- u64 ipFragFails;
- u64 ipFragCreates;
- u64 ipInMcastPkts;
- u64 ipOutMcastPkts;
- u64 ipInBcastPkts;
- u64 ipOutBcastPkts;
-
- u64 tcpRtoAlgorithm;
- u64 tcpRtoMin;
- u64 tcpRtoMax;
- u64 tcpMaxConn;
- u64 tcpActiveOpens;
- u64 tcpPassiveOpens;
- u64 tcpAttemptFails;
- u64 tcpEstabResets;
- u64 tcpCurrEstab;
- u64 tcpInSegs;
- u64 tcpOutSegs;
- u64 tcpRetransSegs;
- u64 tcpInErrs;
- u64 tcpOutRsts;
-};
-
-union rdma_protocol_stats {
- struct ib_protocol_stats ib;
- struct iw_protocol_stats iw;
-};
+/**
+ * struct rdma_hw_stats
+ * @timestamp - Used by the core code to track when the last update was
+ * @lifespan - Used by the core code to determine how old the counters
+ * should be before being updated again. Stored in jiffies, defaults
+ * to 10 milliseconds, drivers can override the default by specifying
+ * their own value during their allocation routine.
+ * @name - Array of pointers to static names used for the counters in
+ * directory.
+ * @num_counters - How many hardware counters there are. If name is
+ * shorter than this number, a kernel oops will result. Driver authors
+ * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters)
+ * in their code to prevent this.
+ * @value - Array of u64 counters that are accessed by the sysfs code and
+ * filled in by the driver's get_stats routine
+ */
+struct rdma_hw_stats {
+ unsigned long timestamp;
+ unsigned long lifespan;
+ const char * const *names;
+ int num_counters;
+ u64 value[];
+};
+
+#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10
+/**
+ * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct
+ * for drivers.
+ * @names - Array of static const char *
+ * @num_counters - How many elements in array
+ * @lifespan - How many milliseconds between updates
+ */
+static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
+ const char * const *names, int num_counters,
+ unsigned long lifespan)
+{
+ struct rdma_hw_stats *stats;
+
+ stats = kzalloc(sizeof(*stats) + num_counters * sizeof(u64),
+ GFP_KERNEL);
+ if (!stats)
+ return NULL;
+ stats->names = names;
+ stats->num_counters = num_counters;
+ stats->lifespan = msecs_to_jiffies(lifespan);
+
+ return stats;
+}
+
+
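A brief sketch of how a provider driver might use the allocator above from its stats hook; the driver prefix, counter names and hook signature are assumptions based on common usage.

	static const char * const foo_hw_stat_names[] = {
		"rx_packets",
		"tx_packets",
	};

	static struct rdma_hw_stats *foo_alloc_hw_stats(struct ib_device *ibdev,
							u8 port_num)
	{
		/* keep the names array and num_counters in sync, as warned above */
		BUILD_BUG_ON(ARRAY_SIZE(foo_hw_stat_names) < 2);
		return rdma_alloc_hw_stats_struct(foo_hw_stat_names,
						  ARRAY_SIZE(foo_hw_stat_names),
						  RDMA_HW_STATS_DEFAULT_LIFESPAN);
	}
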
+/* Define bits for the various functionality this port needs to be supported by
+ * the core.
+ */
+/* Management 0x00000FFF */
+#define RDMA_CORE_CAP_IB_MAD 0x00000001
+#define RDMA_CORE_CAP_IB_SMI 0x00000002
+#define RDMA_CORE_CAP_IB_CM 0x00000004
+#define RDMA_CORE_CAP_IW_CM 0x00000008
+#define RDMA_CORE_CAP_IB_SA 0x00000010
+#define RDMA_CORE_CAP_OPA_MAD 0x00000020
+
+/* Address format 0x000FF000 */
+#define RDMA_CORE_CAP_AF_IB 0x00001000
+#define RDMA_CORE_CAP_ETH_AH 0x00002000
+
+/* Protocol 0xFFF00000 */
+#define RDMA_CORE_CAP_PROT_IB 0x00100000
+#define RDMA_CORE_CAP_PROT_ROCE 0x00200000
+#define RDMA_CORE_CAP_PROT_IWARP 0x00400000
+#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
+
+#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \
+ | RDMA_CORE_CAP_IB_MAD \
+ | RDMA_CORE_CAP_IB_SMI \
+ | RDMA_CORE_CAP_IB_CM \
+ | RDMA_CORE_CAP_IB_SA \
+ | RDMA_CORE_CAP_AF_IB)
+#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \
+ | RDMA_CORE_CAP_IB_MAD \
+ | RDMA_CORE_CAP_IB_CM \
+ | RDMA_CORE_CAP_AF_IB \
+ | RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \
+ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \
+ | RDMA_CORE_CAP_IB_MAD \
+ | RDMA_CORE_CAP_IB_CM \
+ | RDMA_CORE_CAP_AF_IB \
+ | RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \
+ | RDMA_CORE_CAP_IW_CM)
+#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \
+ | RDMA_CORE_CAP_OPA_MAD)
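As a small illustration of how these composite masks are meant to be tested (core_cap_flags is an assumed variable standing in for a port's capability word):

	bool is_roce_v2 =
		(core_cap_flags & RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP) ==
		RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;	/* all required bits must be present */
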
struct ib_port_attr {
+ u64 subnet_prefix;
enum ib_port_state state;
enum ib_mtu max_mtu;
enum ib_mtu active_mtu;
@@ -368,6 +533,7 @@ struct ib_port_attr {
u8 active_width;
u8 active_speed;
u8 phys_state;
+ bool grh_required;
};
enum ib_device_modify_flags {
@@ -375,9 +541,11 @@ enum ib_device_modify_flags {
IB_DEVICE_MODIFY_NODE_DESC = 1 << 1
};
+#define IB_DEVICE_NODE_DESC_MAX 64
+
struct ib_device_modify {
u64 sys_image_guid;
- char node_desc[64];
+ char node_desc[IB_DEVICE_NODE_DESC_MAX];
};
enum ib_port_modify_flags {
@@ -412,14 +580,18 @@ enum ib_event_type {
IB_EVENT_QP_LAST_WQE_REACHED,
IB_EVENT_CLIENT_REREGISTER,
IB_EVENT_GID_CHANGE,
+ IB_EVENT_WQ_FATAL,
};
+const char *__attribute_const__ ib_event_msg(enum ib_event_type event);
+
struct ib_event {
struct ib_device *device;
union {
struct ib_cq *cq;
struct ib_qp *qp;
struct ib_srq *srq;
+ struct ib_wq *wq;
u8 port_num;
} element;
enum ib_event_type event;
@@ -455,11 +627,23 @@ struct ib_grh {
union ib_gid dgid;
};
+union rdma_network_hdr {
+ struct ib_grh ibgrh;
+ struct {
+ /* The IB spec states that if it's IPv4, the header
+ * is located in the last 20 bytes of the GRH.
+ */
+ u8 reserved[20];
+ struct ip roce4grh;
+ };
+};
+
enum {
IB_MULTICAST_QPN = 0xffffff
};
#define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF)
+#define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000)
enum ib_ah_flags {
IB_AH_GRH = 1
@@ -486,57 +670,48 @@ enum ib_rate {
IB_RATE_300_GBPS = 18
};
-enum ib_mr_create_flags {
- IB_MR_SIGNATURE_EN = 1,
-};
-
-/**
- * ib_mr_init_attr - Memory region init attributes passed to routine
- * ib_create_mr.
- * @max_reg_descriptors: max number of registration descriptors that
- * may be used with registration work requests.
- * @flags: MR creation flags bit mask.
- */
-struct ib_mr_init_attr {
- int max_reg_descriptors;
- u32 flags;
-};
-
/**
* ib_rate_to_mult - Convert the IB rate enum to a multiple of the
* base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be
* converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
* @rate: rate to convert.
*/
-int ib_rate_to_mult(enum ib_rate rate) __attribute_const__;
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate);
/**
* ib_rate_to_mbps - Convert the IB rate enum to Mbps.
* For example, IB_RATE_2_5_GBPS will be converted to 2500.
* @rate: rate to convert.
*/
-int ib_rate_to_mbps(enum ib_rate rate) __attribute_const__;
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
-struct ib_cq_init_attr {
- int cqe;
- int comp_vector;
- u32 flags;
-};
-enum ib_signature_type {
- IB_SIG_TYPE_T10_DIF,
+/**
+ * enum ib_mr_type - memory region type
+ * @IB_MR_TYPE_MEM_REG: memory region that is used for
+ * normal registration
+ * @IB_MR_TYPE_SIGNATURE: memory region that is used for
+ * signature operations (data-integrity
+ * capable regions)
+ * @IB_MR_TYPE_SG_GAPS: memory region that is capable to
+ * register any arbitrary sg lists (without
+ * the normal mr constraints - see
+ * ib_map_mr_sg)
+ */
+enum ib_mr_type {
+ IB_MR_TYPE_MEM_REG,
+ IB_MR_TYPE_SIGNATURE,
+ IB_MR_TYPE_SG_GAPS,
};
/**
- * T10-DIF Signature types
- * T10-DIF types are defined by SCSI
- * specifications.
+ * Signature types
+ * IB_SIG_TYPE_NONE: Unprotected.
+ * IB_SIG_TYPE_T10_DIF: Type T10-DIF
*/
-enum ib_t10_dif_type {
- IB_T10DIF_NONE,
- IB_T10DIF_TYPE1,
- IB_T10DIF_TYPE2,
- IB_T10DIF_TYPE3
+enum ib_signature_type {
+ IB_SIG_TYPE_NONE,
+ IB_SIG_TYPE_T10_DIF,
};
/**
@@ -552,24 +727,26 @@ enum ib_t10_dif_bg_type {
/**
* struct ib_t10_dif_domain - Parameters specific for T10-DIF
* domain.
- * @type: T10-DIF type (0|1|2|3)
* @bg_type: T10-DIF block guard type (CRC|CSUM)
* @pi_interval: protection information interval.
* @bg: seed of guard computation.
* @app_tag: application tag of guard block
* @ref_tag: initial guard block reference tag.
- * @type3_inc_reftag: T10-DIF type 3 does not state
- * about the reference tag, it is the user
- * choice to increment it or not.
+ * @ref_remap: Indicate whether the reftag increments with each block
+ * @app_escape: Indicate to skip block check if apptag=0xffff
+ * @ref_escape: Indicate to skip block check if reftag=0xffffffff
+ * @apptag_check_mask: check bitmask of application tag.
*/
struct ib_t10_dif_domain {
- enum ib_t10_dif_type type;
enum ib_t10_dif_bg_type bg_type;
- u32 pi_interval;
+ u16 pi_interval;
u16 bg;
u16 app_tag;
u32 ref_tag;
- bool type3_inc_reftag;
+ bool ref_remap;
+ bool app_escape;
+ bool ref_escape;
+ u16 apptag_check_mask;
};
/**
@@ -636,7 +813,7 @@ struct ib_mr_status {
* enum.
* @mult: multiple to convert.
*/
-enum ib_rate mult_to_ib_rate(int mult) __attribute_const__;
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult);
struct ib_ah_attr {
struct ib_global_route grh;
@@ -646,8 +823,7 @@ struct ib_ah_attr {
u8 static_rate;
u8 ah_flags;
u8 port_num;
- u8 dmac[6];
- u16 vlan_id;
+ u8 dmac[ETH_ALEN];
};
enum ib_wc_status {
@@ -675,16 +851,17 @@ enum ib_wc_status {
IB_WC_GENERAL_ERR
};
+const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status);
+
enum ib_wc_opcode {
IB_WC_SEND,
IB_WC_RDMA_WRITE,
IB_WC_RDMA_READ,
IB_WC_COMP_SWAP,
IB_WC_FETCH_ADD,
- IB_WC_BIND_MW,
IB_WC_LSO,
IB_WC_LOCAL_INV,
- IB_WC_FAST_REG_MR,
+ IB_WC_REG_MR,
IB_WC_MASKED_COMP_SWAP,
IB_WC_MASKED_FETCH_ADD,
/*
@@ -692,7 +869,8 @@ enum ib_wc_opcode {
* receive by testing (opcode & IB_WC_RECV).
*/
IB_WC_RECV = 1 << 7,
- IB_WC_RECV_RDMA_WITH_IMM
+ IB_WC_RECV_RDMA_WITH_IMM,
+ IB_WC_DUMMY = -1, /* force enum signed */
};
enum ib_wc_flags {
@@ -700,15 +878,16 @@ enum ib_wc_flags {
IB_WC_WITH_IMM = (1<<1),
IB_WC_WITH_INVALIDATE = (1<<2),
IB_WC_IP_CSUM_OK = (1<<3),
- IB_WC_WITH_SL = (1<<4),
- IB_WC_WITH_SLID = (1<<5),
- IB_WC_WITH_TIMESTAMP = (1<<6),
- IB_WC_WITH_SMAC = (1<<7),
- IB_WC_WITH_VLAN = (1<<8),
+ IB_WC_WITH_SMAC = (1<<4),
+ IB_WC_WITH_VLAN = (1<<5),
+ IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6),
};
struct ib_wc {
- u64 wr_id;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
enum ib_wc_status status;
enum ib_wc_opcode opcode;
u32 vendor_err;
@@ -725,12 +904,9 @@ struct ib_wc {
u8 sl;
u8 dlid_path_bits;
u8 port_num; /* valid only for DR SMPs on switches */
- int csum_ok;
- struct {
- uint64_t timestamp; /* timestamp = 0 indicates error*/
- } ts;
- u8 smac[6];
+ u8 smac[ETH_ALEN];
u16 vlan_id;
+ u8 network_hdr_type;
};
enum ib_cq_notify_flags {
@@ -776,7 +952,13 @@ struct ib_qp_cap {
u32 max_send_sge;
u32 max_recv_sge;
u32 max_inline_data;
- u32 qpg_tss_mask_sz;
+
+ /*
+ * Maximum number of rdma_rw_ctx structures in flight at a time.
+ * ib_create_qp() will calculate the right amount of needed WRs
+ * and MRs based on this.
+ */
+ u32 max_rdma_ctxs;
};
enum ib_sig_type {
@@ -801,7 +983,6 @@ enum ib_qp_type {
IB_QPT_RAW_PACKET = 8,
IB_QPT_XRC_INI = 9,
IB_QPT_XRC_TGT,
- IB_QPT_DC_INI,
IB_QPT_MAX,
/* Reserve a range for qp types internal to the low level driver.
* These qp types will not be visible at the IB core layer, so the
@@ -822,27 +1003,22 @@ enum ib_qp_type {
enum ib_qp_create_flags {
IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0,
IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
- IB_QP_CREATE_CROSS_CHANNEL = 1 << 2,
- IB_QP_CREATE_MANAGED_SEND = 1 << 3,
- IB_QP_CREATE_MANAGED_RECV = 1 << 4,
+ IB_QP_CREATE_CROSS_CHANNEL = 1 << 2,
+ IB_QP_CREATE_MANAGED_SEND = 1 << 3,
+ IB_QP_CREATE_MANAGED_RECV = 1 << 4,
IB_QP_CREATE_NETIF_QP = 1 << 5,
IB_QP_CREATE_SIGNATURE_EN = 1 << 6,
+ IB_QP_CREATE_USE_GFP_NOIO = 1 << 7,
+ IB_QP_CREATE_SCATTER_FCS = 1 << 8,
/* reserve bits 26-31 for low level drivers' internal use */
IB_QP_CREATE_RESERVED_START = 1 << 26,
IB_QP_CREATE_RESERVED_END = 1 << 31,
};
-enum ib_qpg_type {
- IB_QPG_NONE = 0,
- IB_QPG_PARENT = (1<<0),
- IB_QPG_CHILD_RX = (1<<1),
- IB_QPG_CHILD_TX = (1<<2)
-};
-
-struct ib_qpg_init_attrib {
- u32 tss_child_count;
- u32 rss_child_count;
-};
+/*
+ * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
+ * callback to destroy the passed in QP.
+ */
struct ib_qp_init_attr {
void (*event_handler)(struct ib_event *, void *);
@@ -852,52 +1028,15 @@ struct ib_qp_init_attr {
struct ib_srq *srq;
struct ib_xrcd *xrcd; /* XRC TGT QPs only */
struct ib_qp_cap cap;
- union {
- struct ib_qp *qpg_parent; /* see qpg_type */
- struct ib_qpg_init_attrib parent_attrib;
- };
enum ib_sig_type sq_sig_type;
enum ib_qp_type qp_type;
enum ib_qp_create_flags create_flags;
- enum ib_qpg_type qpg_type;
- u8 port_num; /* special QP types only */
-};
-enum {
- IB_DCT_CREATE_FLAG_RCV_INLINE = 1 << 0,
- IB_DCT_CREATE_FLAGS_MASK = IB_DCT_CREATE_FLAG_RCV_INLINE,
-};
-
-struct ib_dct_init_attr {
- struct ib_pd *pd;
- struct ib_cq *cq;
- struct ib_srq *srq;
- u64 dc_key;
- u8 port;
- u32 access_flags;
- u8 min_rnr_timer;
- u8 tclass;
- u32 flow_label;
- enum ib_mtu mtu;
- u8 pkey_index;
- u8 gid_index;
- u8 hop_limit;
- u32 create_flags;
-};
-
-struct ib_dct_attr {
- u64 dc_key;
- u8 port;
- u32 access_flags;
- u8 min_rnr_timer;
- u8 tclass;
- u32 flow_label;
- enum ib_mtu mtu;
- u8 pkey_index;
- u8 gid_index;
- u8 hop_limit;
- u32 key_violations;
- u8 state;
+ /*
+ * Only needed for special QP types, or when using the RW API.
+ */
+ u8 port_num;
+ struct ib_rwq_ind_table *rwq_ind_tbl;
};
struct ib_qp_open_attr {
@@ -964,12 +1103,10 @@ enum ib_qp_attr_mask {
IB_QP_PATH_MIG_STATE = (1<<18),
IB_QP_CAP = (1<<19),
IB_QP_DEST_QPN = (1<<20),
- IB_QP_GROUP_RSS = (1<<21),
- IB_QP_DC_KEY = (1<<22),
- IB_QP_SMAC = (1<<23),
- IB_QP_ALT_SMAC = (1<<24),
- IB_QP_VID = (1<<25),
- IB_QP_ALT_VID = (1<<26)
+ IB_QP_RESERVED1 = (1<<21),
+ IB_QP_RESERVED2 = (1<<22),
+ IB_QP_RESERVED3 = (1<<23),
+ IB_QP_RESERVED4 = (1<<24),
};
enum ib_qp_state {
@@ -980,7 +1117,7 @@ enum ib_qp_state {
IB_QPS_SQD,
IB_QPS_SQE,
IB_QPS_ERR,
- IB_QPS_DUMMY = -1 /* force enum signed */
+ IB_QPS_DUMMY = -1, /* force enum signed */
};
enum ib_mig_state {
@@ -1020,40 +1157,6 @@ struct ib_qp_attr {
u8 rnr_retry;
u8 alt_port_num;
u8 alt_timeout;
- u8 smac[ETH_ALEN];
- u8 alt_smac[ETH_ALEN];
- u16 vlan_id;
- u16 alt_vlan_id;
-
-};
-
-struct ib_qp_attr_ex {
- enum ib_qp_state qp_state;
- enum ib_qp_state cur_qp_state;
- enum ib_mtu path_mtu;
- enum ib_mig_state path_mig_state;
- u32 qkey;
- u32 rq_psn;
- u32 sq_psn;
- u32 dest_qp_num;
- int qp_access_flags;
- struct ib_qp_cap cap;
- struct ib_ah_attr ah_attr;
- struct ib_ah_attr alt_ah_attr;
- u16 pkey_index;
- u16 alt_pkey_index;
- u8 en_sqd_async_notify;
- u8 sq_draining;
- u8 max_rd_atomic;
- u8 max_dest_rd_atomic;
- u8 min_rnr_timer;
- u8 port_num;
- u8 timeout;
- u8 retry_cnt;
- u8 rnr_retry;
- u8 alt_port_num;
- u8 alt_timeout;
- u64 dct_key;
};
enum ib_wr_opcode {
@@ -1068,10 +1171,9 @@ enum ib_wr_opcode {
IB_WR_SEND_WITH_INV,
IB_WR_RDMA_READ_WITH_INV,
IB_WR_LOCAL_INV,
- IB_WR_FAST_REG_MR,
+ IB_WR_REG_MR,
IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
- IB_WR_BIND_MW,
IB_WR_REG_SIG_MR,
/* reserve values for low level drivers' internal use.
* These values will not be used at all in the ib core layer.
@@ -1086,6 +1188,7 @@ enum ib_wr_opcode {
IB_WR_RESERVED8,
IB_WR_RESERVED9,
IB_WR_RESERVED10,
+ IB_WR_DUMMY = -1, /* force enum signed */
};
enum ib_send_flags {
@@ -1098,7 +1201,6 @@ enum ib_send_flags {
/* reserve bits 26-31 for low level drivers' internal use */
IB_SEND_RESERVED_START = (1 << 26),
IB_SEND_RESERVED_END = (1 << 31),
- IB_SEND_UMR_UNREG = (1<<5)
};
struct ib_sge {
@@ -1107,32 +1209,16 @@ struct ib_sge {
u32 lkey;
};
-struct ib_fast_reg_page_list {
- struct ib_device *device;
- u64 *page_list;
- unsigned int max_page_list_len;
-};
-
-/**
- * struct ib_mw_bind_info - Parameters for a memory window bind operation.
- * @mr: A memory region to bind the memory window to.
- * @addr: The address where the memory window should begin.
- * @length: The length of the memory window, in bytes.
- * @mw_access_flags: Access flags from enum ib_access_flags for the window.
- *
- * This struct contains the shared parameters for type 1 and type 2
- * memory window bind operations.
- */
-struct ib_mw_bind_info {
- struct ib_mr *mr;
- u64 addr;
- u64 length;
- int mw_access_flags;
+struct ib_cqe {
+ void (*done)(struct ib_cq *cq, struct ib_wc *wc);
};
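The wr_cqe/done pair enables the container_of() completion style sketched below; the request structure, its callback and foo_complete() are illustrative, not part of this header.

	struct foo_request {
		struct ib_cqe	cqe;
		/* ... per-request state ... */
	};

	static void foo_send_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct foo_request *req =
			container_of(wc->wr_cqe, struct foo_request, cqe);

		foo_complete(req, wc->status);	/* foo_complete() is assumed */
	}

	/* when posting: req->cqe.done = foo_send_done; wr.wr_cqe = &req->cqe; */
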
struct ib_send_wr {
struct ib_send_wr *next;
- u64 wr_id;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
struct ib_sge *sg_list;
int num_sge;
enum ib_wr_opcode opcode;
@@ -1141,66 +1227,82 @@ struct ib_send_wr {
__be32 imm_data;
u32 invalidate_rkey;
} ex;
- union {
- struct {
- u64 remote_addr;
- u32 rkey;
- } rdma;
- struct {
- u64 remote_addr;
- u64 compare_add;
- u64 swap;
- u64 compare_add_mask;
- u64 swap_mask;
- u32 rkey;
- } atomic;
- struct {
- struct ib_ah *ah;
- void *header;
- int hlen;
- int mss;
- u32 remote_qpn;
- u32 remote_qkey;
- u16 pkey_index; /* valid for GSI only */
- u8 port_num; /* valid for DR SMPs on switch only */
- } ud;
- struct {
- u64 iova_start;
- struct ib_fast_reg_page_list *page_list;
- unsigned int page_shift;
- unsigned int page_list_len;
- u32 length;
- int access_flags;
- u32 rkey;
- } fast_reg;
- struct {
- int npages;
- int access_flags;
- u32 mkey;
- struct ib_pd *pd;
- u64 virt_addr;
- u64 length;
- int page_shift;
- } umr;
- struct {
- struct ib_mw *mw;
- /* The new rkey for the memory window. */
- u32 rkey;
- struct ib_mw_bind_info bind_info;
- } bind_mw;
- struct {
- struct ib_sig_attrs *sig_attrs;
- struct ib_mr *sig_mr;
- int access_flags;
- struct ib_sge *prot;
- } sig_handover;
- } wr;
- u32 xrc_remote_srq_num; /* XRC TGT QPs only */
};
+struct ib_rdma_wr {
+ struct ib_send_wr wr;
+ u64 remote_addr;
+ u32 rkey;
+};
+
+static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_rdma_wr, wr);
+}
+
+struct ib_atomic_wr {
+ struct ib_send_wr wr;
+ u64 remote_addr;
+ u64 compare_add;
+ u64 swap;
+ u64 compare_add_mask;
+ u64 swap_mask;
+ u32 rkey;
+};
+
+static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_atomic_wr, wr);
+}
+
+struct ib_ud_wr {
+ struct ib_send_wr wr;
+ struct ib_ah *ah;
+ void *header;
+ int hlen;
+ int mss;
+ u32 remote_qpn;
+ u32 remote_qkey;
+ u16 pkey_index; /* valid for GSI only */
+ u8 port_num; /* valid for DR SMPs on switch only */
+};
+
+static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_ud_wr, wr);
+}
+
+struct ib_reg_wr {
+ struct ib_send_wr wr;
+ struct ib_mr *mr;
+ u32 key;
+ int access;
+};
+
+static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_reg_wr, wr);
+}
+
+struct ib_sig_handover_wr {
+ struct ib_send_wr wr;
+ struct ib_sig_attrs *sig_attrs;
+ struct ib_mr *sig_mr;
+ int access_flags;
+ struct ib_sge *prot;
+};
+
+static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_sig_handover_wr, wr);
+}
+
struct ib_recv_wr {
struct ib_recv_wr *next;
- u64 wr_id;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
struct ib_sge *sg_list;
int num_sge;
};
@@ -1211,40 +1313,19 @@ enum ib_access_flags {
IB_ACCESS_REMOTE_READ = (1<<2),
IB_ACCESS_REMOTE_ATOMIC = (1<<3),
IB_ACCESS_MW_BIND = (1<<4),
- IB_ACCESS_ALLOCATE_MR = (1<<5),
- IB_ZERO_BASED = (1<<13)
-};
-
-struct ib_phys_buf {
- u64 addr;
- u64 size;
-};
-
-struct ib_mr_attr {
- struct ib_pd *pd;
- u64 device_virt_addr;
- u64 size;
- int mr_access_flags;
- u32 lkey;
- u32 rkey;
+ IB_ZERO_BASED = (1<<5),
+ IB_ACCESS_ON_DEMAND = (1<<6),
};
+/*
+ * XXX: these are apparently used for ->rereg_user_mr, no idea why they
+ * are hidden here instead of a uapi header!
+ */
enum ib_mr_rereg_flags {
IB_MR_REREG_TRANS = 1,
IB_MR_REREG_PD = (1<<1),
- IB_MR_REREG_ACCESS = (1<<2)
-};
-
-/**
- * struct ib_mw_bind - Parameters for a type 1 memory window bind operation.
- * @wr_id: Work request id.
- * @send_flags: Flags from ib_send_flags enum.
- * @bind_info: More parameters of the bind operation.
- */
-struct ib_mw_bind {
- u64 wr_id;
- int send_flags;
- struct ib_mw_bind_info bind_info;
+ IB_MR_REREG_ACCESS = (1<<2),
+ IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1)
};
struct ib_fmr_attr {
@@ -1253,6 +1334,8 @@ struct ib_fmr_attr {
u8 page_shift;
};
+struct ib_umem;
+
struct ib_ucontext {
struct ib_device *device;
struct list_head pd_list;
@@ -1264,10 +1347,27 @@ struct ib_ucontext {
struct list_head ah_list;
struct list_head xrcd_list;
struct list_head rule_list;
- struct list_head dct_list;
+ struct list_head wq_list;
+ struct list_head rwq_ind_tbl_list;
int closing;
- void *peer_mem_private_data;
- char *peer_mem_name;
+
+ pid_t tgid;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ struct rb_root umem_tree;
+ /*
+ * Protects .umem_tree, as well as odp_mrs_count and
+ * mmu notifiers registration.
+ */
+ struct rw_semaphore umem_rwsem;
+ void (*invalidate_range)(struct ib_umem *umem,
+ unsigned long start, unsigned long end);
+
+ struct mmu_notifier mn;
+ atomic_t notifier_count;
+ /* A list of umems that don't have private mmu notifier counters yet. */
+ struct list_head no_private_counters;
+ int odp_mrs_count;
+#endif
};
struct ib_uobject {
@@ -1278,36 +1378,37 @@ struct ib_uobject {
int id; /* index into kernel idr */
struct kref ref;
struct rw_semaphore mutex; /* protects .live */
+ struct rcu_head rcu; /* kfree_rcu() overhead */
int live;
};
-struct ib_udata;
-struct ib_udata_ops {
- int (*copy_from)(void *dest, struct ib_udata *udata,
- size_t len);
- int (*copy_to)(struct ib_udata *udata, void *src,
- size_t len);
-};
-
struct ib_udata {
- struct ib_udata_ops *ops;
- void __user *inbuf;
+ const void __user *inbuf;
void __user *outbuf;
size_t inlen;
size_t outlen;
};
struct ib_pd {
+ u32 local_dma_lkey;
+ u32 flags;
struct ib_device *device;
struct ib_uobject *uobject;
atomic_t usecnt; /* count all resources */
+
+ u32 unsafe_global_rkey;
+
+ /*
+ * Implementation details of the RDMA core, don't use in drivers:
+ */
+ struct ib_mr *__internal_mr;
};
struct ib_xrcd {
struct ib_device *device;
atomic_t usecnt; /* count all exposed resources */
struct inode *inode;
-
+
struct mutex tgt_qp_mutex;
struct list_head tgt_qp_list;
};
@@ -1318,25 +1419,14 @@ struct ib_ah {
struct ib_uobject *uobject;
};
-enum ib_cq_attr_mask {
- IB_CQ_MODERATION = (1 << 0),
- IB_CQ_CAP_FLAGS = (1 << 1)
-};
-
-enum ib_cq_cap_flags {
- IB_CQ_IGNORE_OVERRUN = (1 << 0)
-};
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
-struct ib_cq_attr {
- struct {
- u16 cq_count;
- u16 cq_period;
- } moderation;
- u32 cq_cap_flags;
+enum ib_poll_context {
+ IB_POLL_DIRECT, /* caller context, no hw completions */
+ IB_POLL_SOFTIRQ, /* poll from softirq context */
+ IB_POLL_WORKQUEUE, /* poll from workqueue */
};
-typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
-
struct ib_cq {
struct ib_device *device;
struct ib_uobject *uobject;
@@ -1345,6 +1435,8 @@ struct ib_cq {
void *cq_context;
int cqe;
atomic_t usecnt; /* count number of work queues */
+ enum ib_poll_context poll_ctx;
+ struct work_struct work;
};
struct ib_srq {
@@ -1365,14 +1457,77 @@ struct ib_srq {
} ext;
};
+enum ib_wq_type {
+ IB_WQT_RQ
+};
+
+enum ib_wq_state {
+ IB_WQS_RESET,
+ IB_WQS_RDY,
+ IB_WQS_ERR
+};
+
+struct ib_wq {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ void *wq_context;
+ void (*event_handler)(struct ib_event *, void *);
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+ u32 wq_num;
+ enum ib_wq_state state;
+ enum ib_wq_type wq_type;
+ atomic_t usecnt;
+};
+
+struct ib_wq_init_attr {
+ void *wq_context;
+ enum ib_wq_type wq_type;
+ u32 max_wr;
+ u32 max_sge;
+ struct ib_cq *cq;
+ void (*event_handler)(struct ib_event *, void *);
+};
+
+enum ib_wq_attr_mask {
+ IB_WQ_STATE = 1 << 0,
+ IB_WQ_CUR_STATE = 1 << 1,
+};
+
+struct ib_wq_attr {
+ enum ib_wq_state wq_state;
+ enum ib_wq_state curr_wq_state;
+};
+
+struct ib_rwq_ind_table {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ atomic_t usecnt;
+ u32 ind_tbl_num;
+ u32 log_ind_tbl_size;
+ struct ib_wq **ind_tbl;
+};
+
+struct ib_rwq_ind_table_init_attr {
+ u32 log_ind_tbl_size;
+ /* Each entry is a pointer to Receive Work Queue */
+ struct ib_wq **ind_tbl;
+};
+
+/*
+ * @max_write_sge: Maximum SGE elements per RDMA WRITE request.
+ * @max_read_sge: Maximum SGE elements per RDMA READ request.
+ */
struct ib_qp {
struct ib_device *device;
struct ib_pd *pd;
struct ib_cq *send_cq;
struct ib_cq *recv_cq;
+ spinlock_t mr_lock;
struct ib_srq *srq;
struct ib_xrcd *xrcd; /* XRC TGT QPs only */
struct list_head xrcd_list;
+
/* count times opened, mcast attaches, flow attaches */
atomic_t usecnt;
struct list_head open_list;
@@ -1381,27 +1536,25 @@ struct ib_qp {
void (*event_handler)(struct ib_event *, void *);
void *qp_context;
u32 qp_num;
+ u32 max_write_sge;
+ u32 max_read_sge;
enum ib_qp_type qp_type;
- enum ib_qpg_type qpg_type;
- u8 port_num;
-};
-
-struct ib_dct {
- struct ib_device *device;
- struct ib_uobject *uobject;
- struct ib_pd *pd;
- struct ib_cq *cq;
- struct ib_srq *srq;
- u32 dct_num;
+ struct ib_rwq_ind_table *rwq_ind_tbl;
};
struct ib_mr {
struct ib_device *device;
struct ib_pd *pd;
- struct ib_uobject *uobject;
u32 lkey;
u32 rkey;
- atomic_t usecnt; /* count number of MWs */
+ u64 iova;
+ u32 length;
+ unsigned int page_size;
+ bool need_inval;
+ union {
+ struct ib_uobject *uobject; /* user */
+ struct list_head qp_entry; /* FR */
+ };
};
struct ib_mw {
@@ -1440,14 +1593,15 @@ enum ib_flow_attr_type {
enum ib_flow_spec_type {
/* L2 headers*/
IB_FLOW_SPEC_ETH = 0x20,
- IB_FLOW_SPEC_IB = 0x21,
+ IB_FLOW_SPEC_IB = 0x22,
/* L3 header*/
IB_FLOW_SPEC_IPV4 = 0x30,
+ IB_FLOW_SPEC_IPV6 = 0x31,
/* L4 headers*/
IB_FLOW_SPEC_TCP = 0x40,
IB_FLOW_SPEC_UDP = 0x41
};
-
+#define IB_FLOW_SPEC_LAYER_MASK 0xF0
#define IB_FLOW_SPEC_SUPPORT_LAYERS 4
/* Flow steering rule priority is set according to it's domain.
@@ -1462,7 +1616,8 @@ enum ib_flow_domain {
};
enum ib_flow_flags {
- IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1
+ IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */
+ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 2 /* Must be last */
};
struct ib_flow_eth_filter {
@@ -1470,6 +1625,8 @@ struct ib_flow_eth_filter {
u8 src_mac[6];
__be16 ether_type;
__be16 vlan_tag;
+ /* Must be last */
+ u8 real_sz[0];
};
struct ib_flow_spec_eth {
@@ -1480,8 +1637,10 @@ struct ib_flow_spec_eth {
};
struct ib_flow_ib_filter {
- __be32 l3_type_qpn;
- u8 dst_gid[16];
+ __be16 dlid;
+ __u8 sl;
+ /* Must be last */
+ u8 real_sz[0];
};
struct ib_flow_spec_ib {
@@ -1491,9 +1650,22 @@ struct ib_flow_spec_ib {
struct ib_flow_ib_filter mask;
};
+/* IPv4 header flags */
+enum ib_ipv4_flags {
+ IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */
+ IB_IPV4_MORE_FRAG = 0x4 /* All fragmented packets except the
+ last one have this flag set */
+};
+
struct ib_flow_ipv4_filter {
- __be32 src_ip;
- __be32 dst_ip;
+ __be32 src_ip;
+ __be32 dst_ip;
+ u8 proto;
+ u8 tos;
+ u8 ttl;
+ u8 flags;
+ /* Must be last */
+ u8 real_sz[0];
};
struct ib_flow_spec_ipv4 {
@@ -1503,9 +1675,29 @@ struct ib_flow_spec_ipv4 {
struct ib_flow_ipv4_filter mask;
};
+struct ib_flow_ipv6_filter {
+ u8 src_ip[16];
+ u8 dst_ip[16];
+ __be32 flow_label;
+ u8 next_hdr;
+ u8 traffic_class;
+ u8 hop_limit;
+ /* Must be last */
+ u8 real_sz[0];
+};
+
+struct ib_flow_spec_ipv6 {
+ enum ib_flow_spec_type type;
+ u16 size;
+ struct ib_flow_ipv6_filter val;
+ struct ib_flow_ipv6_filter mask;
+};
+
struct ib_flow_tcp_udp_filter {
- __be16 dst_port;
+ __be16 dst_port;
__be16 src_port;
+ /* Must be last */
+ u8 real_sz[0];
};
struct ib_flow_spec_tcp_udp {
@@ -1520,19 +1712,20 @@ union ib_flow_spec {
enum ib_flow_spec_type type;
u16 size;
};
- struct ib_flow_spec_ib ib;
- struct ib_flow_spec_eth eth;
- struct ib_flow_spec_ipv4 ipv4;
- struct ib_flow_spec_tcp_udp tcp_udp;
+ struct ib_flow_spec_eth eth;
+ struct ib_flow_spec_ib ib;
+ struct ib_flow_spec_ipv4 ipv4;
+ struct ib_flow_spec_tcp_udp tcp_udp;
+ struct ib_flow_spec_ipv6 ipv6;
};
struct ib_flow_attr {
enum ib_flow_attr_type type;
u16 size;
u16 priority;
+ u32 flags;
u8 num_of_specs;
u8 port;
- u32 flags;
/* Following are the optional layers according to user request
* struct ib_flow_spec_xxx
* struct ib_flow_spec_yyy
@@ -1544,7 +1737,7 @@ struct ib_flow {
struct ib_uobject *uobject;
};
-struct ib_mad;
+struct ib_mad_hdr;
struct ib_grh;
enum ib_process_mad_flags {
@@ -1566,19 +1759,10 @@ struct ib_cache {
rwlock_t lock;
struct ib_event_handler event_handler;
struct ib_pkey_cache **pkey_cache;
- struct ib_gid_cache **gid_cache;
+ struct ib_gid_table **gid_cache;
u8 *lmc_cache;
};
-enum verbs_values_mask {
- IBV_VALUES_HW_CLOCK = 1 << 0
-};
-
-struct ib_device_values {
- int values_mask;
- uint64_t hwclock;
-};
-
struct ib_dma_mapping_ops {
int (*mapping_error)(struct ib_device *dev,
u64 dma_addr);
@@ -1601,10 +1785,14 @@ struct ib_dma_mapping_ops {
void (*unmap_sg)(struct ib_device *dev,
struct scatterlist *sg, int nents,
enum dma_data_direction direction);
- u64 (*dma_address)(struct ib_device *dev,
- struct scatterlist *sg);
- unsigned int (*dma_len)(struct ib_device *dev,
- struct scatterlist *sg);
+ int (*map_sg_attrs)(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs);
+ void (*unmap_sg_attrs)(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs);
void (*sync_single_for_cpu)(struct ib_device *dev,
u64 dma_handle,
size_t size,
@@ -1623,8 +1811,13 @@ struct ib_dma_mapping_ops {
};
struct iw_cm_verbs;
-struct ib_exp_device_attr;
-struct ib_exp_qp_init_attr;
+
+struct ib_port_immutable {
+ int pkey_tbl_len;
+ int gid_tbl_len;
+ u32 core_cap_flags;
+ u32 max_mad_size;
+};
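+
+/*
+ * Illustrative sketch, not part of this patch: a typical InfiniBand driver
+ * fills its per-port immutable data from ib_query_port() in the new
+ * get_port_immutable() callback (see struct ib_device below).  The
+ * RDMA_CORE_PORT_IBA_IB and IB_MGMT_MAD_SIZE constants are assumed to come
+ * from ib_verbs.h and ib_mad.h respectively.
+ *
+ *	static int example_port_immutable(struct ib_device *ibdev, u8 port_num,
+ *					  struct ib_port_immutable *immutable)
+ *	{
+ *		struct ib_port_attr attr;
+ *		int err;
+ *
+ *		err = ib_query_port(ibdev, port_num, &attr);
+ *		if (err)
+ *			return err;
+ *
+ *		immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ *		immutable->gid_tbl_len = attr.gid_tbl_len;
+ *		immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+ *		immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+ *		return 0;
+ *	}
+ */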
struct ib_device {
struct device *dma_device;
@@ -1636,36 +1829,92 @@ struct ib_device {
spinlock_t client_data_lock;
struct list_head core_list;
+ /* Access to the client_data_list is protected by the client_data_lock
+ * spinlock and the lists_rwsem read-write semaphore */
struct list_head client_data_list;
struct ib_cache cache;
- int *pkey_tbl_len;
- int *gid_tbl_len;
+ /**
+ * port_immutable is indexed by port number
+ */
+ struct ib_port_immutable *port_immutable;
int num_comp_vectors;
struct iw_cm_verbs *iwcm;
- int (*get_protocol_stats)(struct ib_device *device,
- union rdma_protocol_stats *stats);
+ /**
+ * alloc_hw_stats - Allocate a struct rdma_hw_stats and fill in the
+ * driver initialized data. The struct is kfree()'ed by the sysfs
+ * core when the device is removed. A lifespan of -1 in the return
+ * struct tells the core to set a default lifespan.
+ */
+ struct rdma_hw_stats *(*alloc_hw_stats)(struct ib_device *device,
+ u8 port_num);
+ /**
+ * get_hw_stats - Fill in the counter value(s) in the stats struct.
+ * @index - The index in the value array we wish to have updated, or
+ * num_counters if we want all stats updated
+ * Return codes -
+ * < 0 - Error, no counters updated
+ * index - Updated the single counter pointed to by index
+ * num_counters - Updated all counters (will reset the timestamp
+ * and prevent further calls for lifespan milliseconds)
+ * Drivers are allowed to update all counters in lieu of just the
+ * one given in index at their option
+ */
+ int (*get_hw_stats)(struct ib_device *device,
+ struct rdma_hw_stats *stats,
+ u8 port, int index);
int (*query_device)(struct ib_device *device,
- struct ib_device_attr *device_attr);
+ struct ib_device_attr *device_attr,
+ struct ib_udata *udata);
int (*query_port)(struct ib_device *device,
u8 port_num,
struct ib_port_attr *port_attr);
enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
u8 port_num);
/* When calling get_netdev, the HW vendor's driver should return the
- * net device of device @device at port @port_num. The function
- * is called in rtnl_lock. The HW vendor's device driver must guarantee
- * to return NULL before the net device has reached
+ * net device of device @device at port @port_num or NULL if such
+ * a net device doesn't exist. The vendor driver should call dev_hold
+ * on this net device. The HW vendor's device driver must guarantee
+ * that this function returns NULL before the net device reaches
* NETDEV_UNREGISTER_FINAL state.
*/
- struct net_device *(*get_netdev)(struct ib_device *device,
- u8 port_num);
+ struct net_device *(*get_netdev)(struct ib_device *device,
+ u8 port_num);
int (*query_gid)(struct ib_device *device,
u8 port_num, int index,
union ib_gid *gid);
+ /* When calling add_gid, the HW vendor's driver should
+ * add the gid of device @device at gid index @index of
+ * port @port_num to be @gid. Meta-info of that gid (for example,
+ * the network device related to this gid) is available
+ * at @attr. @context allows the HW vendor driver to store extra
+ * information together with a GID entry. The HW vendor may allocate
+ * memory to contain this information and store it in @context when a
+ * new GID entry is written to. Params are consistent until the next
+ * call of add_gid or del_gid. The function should return 0 on
+ * success or error otherwise. The function could be called
+ * concurrently for different ports. This function is only called
+ * when roce_gid_table is used.
+ */
+ int (*add_gid)(struct ib_device *device,
+ u8 port_num,
+ unsigned int index,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ void **context);
+ /* When calling del_gid, the HW vendor's driver should delete the
+ * gid of device @device at gid index @index of port @port_num.
+ * Upon the deletion of a GID entry, the HW vendor must free any
+ * allocated memory. The caller will clear @context afterwards.
+ * This function is only called when roce_gid_table is used.
+ */
+ int (*del_gid)(struct ib_device *device,
+ u8 port_num,
+ unsigned int index,
+ void **context);
int (*query_pkey)(struct ib_device *device,
u8 port_num, u16 index, u16 *pkey);
int (*modify_device)(struct ib_device *device,
@@ -1722,12 +1971,11 @@ struct ib_device {
struct ib_recv_wr *recv_wr,
struct ib_recv_wr **bad_recv_wr);
struct ib_cq * (*create_cq)(struct ib_device *device,
- struct ib_cq_init_attr *attr,
+ const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
struct ib_udata *udata);
- int (*modify_cq)(struct ib_cq *cq,
- struct ib_cq_attr *cq_attr,
- int cq_attr_mask);
+ int (*modify_cq)(struct ib_cq *cq, u16 cq_count,
+ u16 cq_period);
int (*destroy_cq)(struct ib_cq *cq);
int (*resize_cq)(struct ib_cq *cq, int cqe,
struct ib_udata *udata);
@@ -1740,40 +1988,29 @@ struct ib_device {
int wc_cnt);
struct ib_mr * (*get_dma_mr)(struct ib_pd *pd,
int mr_access_flags);
- struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
struct ib_mr * (*reg_user_mr)(struct ib_pd *pd,
u64 start, u64 length,
u64 virt_addr,
int mr_access_flags,
- struct ib_udata *udata,
- int mr_id);
- int (*query_mr)(struct ib_mr *mr,
- struct ib_mr_attr *mr_attr);
- int (*dereg_mr)(struct ib_mr *mr);
- int (*destroy_mr)(struct ib_mr *mr);
- struct ib_mr * (*create_mr)(struct ib_pd *pd,
- struct ib_mr_init_attr *mr_init_attr);
- struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd,
- int max_page_list_len);
- struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
- int page_list_len);
- void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
- int (*rereg_phys_mr)(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
+ struct ib_udata *udata);
+ int (*rereg_user_mr)(struct ib_mr *mr,
+ int flags,
+ u64 start, u64 length,
+ u64 virt_addr,
int mr_access_flags,
- u64 *iova_start);
+ struct ib_pd *pd,
+ struct ib_udata *udata);
+ int (*dereg_mr)(struct ib_mr *mr);
+ struct ib_mr * (*alloc_mr)(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg);
+ int (*map_mr_sg)(struct ib_mr *mr,
+ struct scatterlist *sg,
+ int sg_nents,
+ unsigned int *sg_offset);
struct ib_mw * (*alloc_mw)(struct ib_pd *pd,
- enum ib_mw_type type);
- int (*bind_mw)(struct ib_qp *qp,
- struct ib_mw *mw,
- struct ib_mw_bind *mw_bind);
+ enum ib_mw_type type,
+ struct ib_udata *udata);
int (*dealloc_mw)(struct ib_mw *mw);
struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd,
int mr_access_flags,
@@ -1792,10 +2029,13 @@ struct ib_device {
int (*process_mad)(struct ib_device *device,
int process_mad_flags,
u8 port_num,
- struct ib_wc *in_wc,
- struct ib_grh *in_grh,
- struct ib_mad *in_mad,
- struct ib_mad *out_mad);
+ const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in_mad,
+ size_t in_mad_size,
+ struct ib_mad_hdr *out_mad,
+ size_t *out_mad_size,
+ u16 *out_mad_pkey_index);
struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device,
struct ib_ucontext *ucontext,
struct ib_udata *udata);
@@ -1807,17 +2047,29 @@ struct ib_device {
int (*destroy_flow)(struct ib_flow *flow_id);
int (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
struct ib_mr_status *mr_status);
-
- unsigned long (*get_unmapped_area)(struct file *file,
- unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags);
- int (*ioctl)(struct ib_ucontext *context,
- unsigned int cmd,
- unsigned long arg);
- int (*query_values)(struct ib_device *device,
- int q_values,
- struct ib_device_values *values);
+ void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
+ void (*drain_rq)(struct ib_qp *qp);
+ void (*drain_sq)(struct ib_qp *qp);
+ int (*set_vf_link_state)(struct ib_device *device, int vf, u8 port,
+ int state);
+ int (*get_vf_config)(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_info *ivf);
+ int (*get_vf_stats)(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_stats *stats);
+ int (*set_vf_guid)(struct ib_device *device, int vf, u8 port, u64 guid,
+ int type);
+ struct ib_wq * (*create_wq)(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr,
+ struct ib_udata *udata);
+ int (*destroy_wq)(struct ib_wq *wq);
+ int (*modify_wq)(struct ib_wq *wq,
+ struct ib_wq_attr *attr,
+ u32 wq_attr_mask,
+ struct ib_udata *udata);
+ struct ib_rwq_ind_table * (*create_rwq_ind_table)(struct ib_device *device,
+ struct ib_rwq_ind_table_init_attr *init_attr,
+ struct ib_udata *udata);
+ int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table);
struct ib_dma_mapping_ops *dma_ops;
struct module *owner;
@@ -1835,44 +2087,61 @@ struct ib_device {
u64 uverbs_cmd_mask;
u64 uverbs_ex_cmd_mask;
- char node_desc[64];
+ char node_desc[IB_DEVICE_NODE_DESC_MAX];
__be64 node_guid;
u32 local_dma_lkey;
+ u16 is_switch:1;
u8 node_type;
u8 phys_port_cnt;
- int cmd_perf;
- u64 cmd_avg;
- u32 cmd_n;
- spinlock_t cmd_perf_lock;
-
- /*
- * Experimental data and functions
+ struct ib_device_attr attrs;
+ struct attribute_group *hw_stats_ag;
+ struct rdma_hw_stats *hw_stats;
+
+ /**
+ * The following mandatory functions are used only at device
+ * registration. Keep functions such as these at the end of this
+ * structure to avoid cache line misses when accessing struct ib_device
+ * in fast paths.
*/
- int (*exp_query_device)(struct ib_device *device,
- struct ib_exp_device_attr *device_attr);
- struct ib_qp * (*exp_create_qp)(struct ib_pd *pd,
- struct ib_exp_qp_init_attr *qp_init_attr,
- struct ib_udata *udata);
- struct ib_dct * (*exp_create_dct)(struct ib_pd *pd,
- struct ib_dct_init_attr *attr,
- struct ib_udata *udata);
- int (*exp_destroy_dct)(struct ib_dct *dct);
- int (*exp_query_dct)(struct ib_dct *dct, struct ib_dct_attr *attr);
-
- u64 uverbs_exp_cmd_mask;
+ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *);
+ void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len);
};
struct ib_client {
char *name;
void (*add) (struct ib_device *);
- void (*remove)(struct ib_device *);
-
+ void (*remove)(struct ib_device *, void *client_data);
+
+ /* Returns the net_dev belonging to this ib_client and matching the
+ * given parameters.
+ * @dev: An RDMA device that the net_dev use for communication.
+ * @port: A physical port number on the RDMA device.
+ * @pkey: P_Key that the net_dev uses if applicable.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: An IP address the net_dev is configured with.
+ * @client_data: The device's client data set by ib_set_client_data().
+ *
+ * An ib_client that implements a net_dev on top of RDMA devices
+ * (such as IP over IB) should implement this callback, allowing the
+ * rdma_cm module to find the right net_dev for a given request.
+ *
+ * The caller is responsible for calling dev_put on the returned
+ * netdev. */
+ struct net_device *(*get_net_dev_by_params)(
+ struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr,
+ void *client_data);
struct list_head list;
};
struct ib_device *ib_alloc_device(size_t size);
void ib_dealloc_device(struct ib_device *device);
+void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len);
+
int ib_register_device(struct ib_device *device,
int (*port_callback)(struct ib_device *,
u8, struct kobject *));
@@ -1887,12 +2156,32 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client,
static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
{
- return udata->ops->copy_from(dest, udata, len);
+ return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;
}
static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
{
- return udata->ops->copy_to(udata, src, len);
+ return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
+}
+
+static inline bool ib_is_udata_cleared(struct ib_udata *udata,
+ size_t offset,
+ size_t len)
+{
+ const void __user *p = (const char __user *)udata->inbuf + offset;
+ bool ret;
+ u8 *buf;
+
+ if (len > USHRT_MAX)
+ return false;
+
+ buf = memdup_user(p, len);
+ if (IS_ERR(buf))
+ return false;
+
+ ret = !memchr_inv(buf, 0, len);
+ kfree(buf);
+ return ret;
}
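+
+/*
+ * Illustrative sketch, not part of this patch: a driver verb that accepts a
+ * versioned user command would reject unknown non-zero trailing input and
+ * then use the copy helpers above.  The cmd/resp layout is assumed.
+ *
+ *	static int example_handle_udata(struct ib_udata *udata,
+ *					void *cmd, size_t cmd_len,
+ *					void *resp, size_t resp_len)
+ *	{
+ *		int err;
+ *
+ *		if (udata->inlen > cmd_len &&
+ *		    !ib_is_udata_cleared(udata, cmd_len, udata->inlen - cmd_len))
+ *			return -EOPNOTSUPP;
+ *
+ *		err = ib_copy_from_udata(cmd, udata, min(cmd_len, udata->inlen));
+ *		if (err)
+ *			return err;
+ *
+ *		return ib_copy_to_udata(udata, resp, min(resp_len, udata->outlen));
+ *	}
+ */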
/**
@@ -1919,17 +2208,330 @@ int ib_register_event_handler (struct ib_event_handler *event_handler);
int ib_unregister_event_handler(struct ib_event_handler *event_handler);
void ib_dispatch_event(struct ib_event *event);
-int ib_query_device(struct ib_device *device,
- struct ib_device_attr *device_attr);
-
int ib_query_port(struct ib_device *device,
u8 port_num, struct ib_port_attr *port_attr);
enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
u8 port_num);
+/**
+ * rdma_cap_ib_switch - Check if the device is IB switch
+ * @device: Device to check
+ *
+ * The device driver is responsible for setting the is_switch bit
+ * in the ib_device structure at init time.
+ *
+ * Return: true if the device is IB switch.
+ */
+static inline bool rdma_cap_ib_switch(const struct ib_device *device)
+{
+ return device->is_switch;
+}
+
+/**
+ * rdma_start_port - Return the first valid port number for the device
+ * specified
+ *
+ * @device: Device to be checked
+ *
+ * Return start port number
+ */
+static inline u8 rdma_start_port(const struct ib_device *device)
+{
+ return rdma_cap_ib_switch(device) ? 0 : 1;
+}
+
+/**
+ * rdma_end_port - Return the last valid port number for the device
+ * specified
+ *
+ * @device: Device to be checked
+ *
+ * Return last port number
+ */
+static inline u8 rdma_end_port(const struct ib_device *device)
+{
+ return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt;
+}
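+
+/*
+ * Illustrative sketch, not part of this patch: rdma_start_port() and
+ * rdma_end_port() replace open-coded 1..phys_port_cnt loops so that IB
+ * switches (which only expose port 0) are handled transparently.
+ *
+ *	static void example_walk_ports(struct ib_device *device)
+ *	{
+ *		struct ib_port_attr attr;
+ *		u8 port;
+ *
+ *		for (port = rdma_start_port(device);
+ *		     port <= rdma_end_port(device); port++) {
+ *			if (ib_query_port(device, port, &attr))
+ *				continue;
+ *			// inspect attr.state, attr.gid_tbl_len, ...
+ *		}
+ *	}
+ */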
+
+static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB;
+}
+
+static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags &
+ (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
+}
+
+static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
+}
+
+static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE;
+}
+
+static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP;
+}
+
+static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num)
+{
+ return rdma_protocol_ib(device, port_num) ||
+ rdma_protocol_roce(device, port_num);
+}
+
+/**
+ * rdma_cap_ib_mad - Check if the port of a device supports Infiniband
+ * Management Datagrams.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Management Datagrams (MAD) are a required part of the InfiniBand
+ * specification and are supported on all InfiniBand devices. A slightly
+ * extended version is also supported on OPA interfaces.
+ *
+ * Return: true if the port supports sending/receiving of MAD packets.
+ */
+static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD;
+}
+
+/**
+ * rdma_cap_opa_mad - Check if the port of device provides support for OPA
+ * Management Datagrams.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Intel OmniPath devices extend and/or replace the InfiniBand Management
+ * datagrams with their own versions. These OPA MADs share many but not all of
+ * the characteristics of InfiniBand MADs.
+ *
+ * OPA MADs differ in the following ways:
+ *
+ * 1) MADs are variable size up to 2K
+ * IBTA defined MADs remain fixed at 256 bytes
+ * 2) OPA SMPs must carry valid PKeys
+ * 3) OPA SMP packets are a different format
+ *
+ * Return: true if the port supports OPA MAD packet formats.
+ */
+static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num)
+{
+ return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD)
+ == RDMA_CORE_CAP_OPA_MAD;
+}
+
+/**
+ * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband
+ * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI).
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Each InfiniBand node is required to provide a Subnet Management Agent
+ * that the subnet manager can access. Prior to the fabric being fully
+ * configured by the subnet manager, the SMA is accessed via a well known
+ * interface called the Subnet Management Interface (SMI). This interface
+ * uses directed route packets to communicate with the SM to get around the
+ * chicken and egg problem of the SM needing to know what's on the fabric
+ * in order to configure the fabric, and needing to configure the fabric in
+ * order to send packets to the devices on the fabric. These directed
+ * route packets do not need the fabric fully configured in order to reach
+ * their destination. The SMI is the only method allowed to send
+ * directed route packets on an InfiniBand fabric.
+ *
+ * Return: true if the port provides an SMI.
+ */
+static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI;
+}
+
+/**
+ * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband
+ * Communication Manager.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * The InfiniBand Communication Manager is one of many pre-defined General
+ * Service Agents (GSA) that are accessed via the General Service
+ * Interface (GSI). Its role is to facilitate establishment of connections
+ * between nodes as well as other management related tasks for established
+ * connections.
+ *
+ * Return: true if the port supports an IB CM (this does not guarantee that
+ * a CM is actually running however).
+ */
+static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM;
+}
+
+/**
+ * rdma_cap_iw_cm - Check if the port of device has the capability IWARP
+ * Communication Manager.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Similar to above, but specific to iWARP connections which have a different
+ * management protocol than InfiniBand.
+ *
+ * Return: true if the port supports an iWARP CM (this does not guarantee that
+ * a CM is actually running however).
+ */
+static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM;
+}
+
+/**
+ * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband
+ * Subnet Administration.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * An InfiniBand Subnet Administration (SA) service is a pre-defined General
+ * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand
+ * fabrics, devices should resolve routes to other hosts by contacting the
+ * SA to query the proper route.
+ *
+ * Return: true if the port should act as a client to the fabric Subnet
+ * Administration interface. This does not imply that the SA service is
+ * running locally.
+ */
+static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA;
+}
+
+/**
+ * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband
+ * Multicast.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * InfiniBand multicast registration is more complex than normal IPv4 or
+ * IPv6 multicast registration. Each Host Channel Adapter must register
+ * with the Subnet Manager when it wishes to join a multicast group. It
+ * should do so only once regardless of how many queue pairs it subscribes
+ * to this group. And it should leave the group only after all queue pairs
+ * attached to the group have been detached.
+ *
+ * Return: true if the port must undertake the additional administrative
+ * overhead of registering/unregistering with the SM and tracking of the
+ * total number of queue pairs attached to the multicast group.
+ */
+static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num)
+{
+ return rdma_cap_ib_sa(device, port_num);
+}
+
+/**
+ * rdma_cap_af_ib - Check if the port of device has the capability
+ * Native Infiniband Address.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default
+ * GID. RoCE uses a different mechanism, but still generates a GID via
+ * a prescribed mechanism and port specific data.
+ *
+ * Return: true if the port uses a GID address to identify devices on the
+ * network.
+ */
+static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB;
+}
+
+/**
+ * rdma_cap_eth_ah - Check if the port of device has the capability
+ * Ethernet Address Handle.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * RoCE is InfiniBand over Ethernet, and it uses a well defined technique
+ * to fabricate GIDs over Ethernet/IP specific addresses native to the
+ * port. Normally, packet headers are generated by the sending host
+ * adapter, but when sending connectionless datagrams, we must manually
+ * inject the proper headers for the fabric we are communicating over.
+ *
+ * Return: true if we are running as a RoCE port and must force the
+ * addition of a Global Route Header built from our Ethernet Address
+ * Handle into our header list for connectionless packets.
+ */
+static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH;
+}
+
+/**
+ * rdma_max_mad_size - Return the max MAD size required by this RDMA Port.
+ *
+ * @device: Device
+ * @port_num: Port number
+ *
+ * This MAD size includes the MAD headers and MAD payload. No other headers
+ * are included.
+ *
+ * Return the max MAD size required by the Port. Will return 0 if the port
+ * does not support MADs
+ */
+static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num)
+{
+ return device->port_immutable[port_num].max_mad_size;
+}
+
+/**
+ * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * RoCE GID table mechanism manages the various GIDs for a device.
+ *
+ * NOTE: if allocating the port's GID table has failed, this call will still
+ * return true, but any RoCE GID table API will fail.
+ *
+ * Return: true if the port uses RoCE GID table mechanism in order to manage
+ * its GIDs.
+ */
+static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
+ u8 port_num)
+{
+ return rdma_protocol_roce(device, port_num) &&
+ device->add_gid && device->del_gid;
+}
+
+/*
+ * Check if the device supports READ W/ INVALIDATE.
+ */
+static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num)
+{
+ /*
+ * iWarp drivers must support READ W/ INVALIDATE. No other protocol
+ * has support for it yet.
+ */
+ return rdma_protocol_iwarp(dev, port_num);
+}
+
int ib_query_gid(struct ib_device *device,
- u8 port_num, int index, union ib_gid *gid);
+ u8 port_num, int index, union ib_gid *gid,
+ struct ib_gid_attr *attr);
+
+int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
+ int state);
+int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_info *info);
+int ib_get_vf_stats(struct ib_device *device, int vf, u8 port,
+ struct ifla_vf_stats *stats);
+int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid,
+ int type);
int ib_query_pkey(struct ib_device *device,
u8 port_num, u16 index, u16 *pkey);
@@ -1943,25 +2545,30 @@ int ib_modify_port(struct ib_device *device,
struct ib_port_modify *port_modify);
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ enum ib_gid_type gid_type, struct net_device *ndev,
u8 *port_num, u16 *index);
int ib_find_pkey(struct ib_device *device,
u8 port_num, u16 pkey, u16 *index);
-/**
- * ib_alloc_pd - Allocates an unused protection domain.
- * @device: The device on which to allocate the protection domain.
- *
- * A protection domain object provides an association between QPs, shared
- * receive queues, address handles, memory regions, and memory windows.
- */
-struct ib_pd *ib_alloc_pd(struct ib_device *device);
+enum ib_pd_flags {
+ /*
+ * Create a memory registration for all memory in the system and place
+ * the rkey for it into pd->unsafe_global_rkey. This can be used by
+ * ULPs to avoid the overhead of dynamic MRs.
+ *
+ * This flag is generally considered unsafe and must only be used in
+ * extremely trusted environments. Every use of it will log a warning
+ * in the kernel log.
+ */
+ IB_PD_UNSAFE_GLOBAL_RKEY = 0x01,
+};
-/**
- * ib_dealloc_pd - Deallocates a protection domain.
- * @pd: The protection domain to deallocate.
- */
-int ib_dealloc_pd(struct ib_pd *pd);
+struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
+ const char *caller);
+#define ib_alloc_pd(device, flags) \
+ __ib_alloc_pd((device), (flags), __func__)
+void ib_dealloc_pd(struct ib_pd *pd);
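+
+/*
+ * Illustrative sketch, not part of this patch: ib_alloc_pd() now takes a
+ * flags argument.  Most ULPs pass 0; only legacy consumers that depend on a
+ * single all-physical-memory rkey opt into IB_PD_UNSAFE_GLOBAL_RKEY and then
+ * read pd->unsafe_global_rkey.  An ERR_PTR() is returned on failure.
+ *
+ *	static struct ib_pd *example_alloc_pd(struct ib_device *device,
+ *					      bool want_global_rkey)
+ *	{
+ *		return ib_alloc_pd(device,
+ *		    want_global_rkey ? IB_PD_UNSAFE_GLOBAL_RKEY : 0);
+ *	}
+ */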
/**
* ib_create_ah - Creates an address handle for the given address vector.
@@ -1984,8 +2591,9 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
* @ah_attr: Returned attributes that can be used when creating an address
* handle for replying to the message.
*/
-int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
- struct ib_grh *grh, struct ib_ah_attr *ah_attr);
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
+ const struct ib_wc *wc, const struct ib_grh *grh,
+ struct ib_ah_attr *ah_attr);
/**
* ib_create_ah_from_wc - Creates an address handle associated with the
@@ -1999,8 +2607,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
* The address handle is used to reference a local or global destination
* in all UD QP post sends.
*/
-struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
- struct ib_grh *grh, u8 port_num);
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
+ const struct ib_grh *grh, u8 port_num);
/**
* ib_modify_ah - Modifies the address vector associated with an address
@@ -2187,6 +2795,10 @@ static inline int ib_post_recv(struct ib_qp *qp,
return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
}
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+ int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx);
+void ib_free_cq(struct ib_cq *cq);
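+/*
+ * Illustrative sketch, not part of this patch: the ib_alloc_cq() path pairs
+ * per-WR ib_cqe completion callbacks with a core-managed poll context
+ * instead of demultiplexing on wr_id.  It assumes struct ib_wc carries the
+ * matching wr_cqe pointer, as introduced by this update.
+ *
+ *	struct example_request {
+ *		struct ib_cqe cqe;	// cqe.done = example_done before posting
+ *		// ... caller state ...
+ *	};
+ *
+ *	static void example_done(struct ib_cq *cq, struct ib_wc *wc)
+ *	{
+ *		struct example_request *req =
+ *		    container_of(wc->wr_cqe, struct example_request, cqe);
+ *
+ *		// complete or free req here
+ *	}
+ *
+ *	static struct ib_cq *example_alloc_cq(struct ib_device *dev)
+ *	{
+ *		// 128 CQEs on completion vector 0, polled from a workqueue
+ *		return ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_WORKQUEUE);
+ *	}
+ */
+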
+
/**
* ib_create_cq - Creates a CQ on the specified device.
* @device: The device on which to create the CQ.
@@ -2196,16 +2808,15 @@ static inline int ib_post_recv(struct ib_qp *qp,
* asynchronous event not associated with a completion occurs on the CQ.
* @cq_context: Context associated with the CQ returned to the user via
* the associated completion and event handlers.
- * @cqe: The minimum size of the CQ.
- * @comp_vector - Completion vector used to signal completion events.
- * Must be >= 0 and < context->num_comp_vectors.
+ * @cq_attr: The attributes the CQ should be created upon.
*
* Users can examine the cq structure to determine the actual CQ size.
*/
struct ib_cq *ib_create_cq(struct ib_device *device,
ib_comp_handler comp_handler,
void (*event_handler)(struct ib_event *, void *),
- void *cq_context, int cqe, int comp_vector);
+ void *cq_context,
+ const struct ib_cq_init_attr *cq_attr);
/**
* ib_resize_cq - Modifies the capacity of the CQ.
@@ -2217,16 +2828,13 @@ struct ib_cq *ib_create_cq(struct ib_device *device,
int ib_resize_cq(struct ib_cq *cq, int cqe);
/**
- * ib_modify_cq - Modifies the attributes for the specified CQ and then
- * transitions the CQ to the given state.
+ * ib_modify_cq - Modifies moderation params of the CQ
* @cq: The CQ to modify.
- * @cq_attr: specifies the CQ attributes to modify.
- * @cq_attr_mask: A bit-mask used to specify which attributes of the CQ
- * are being modified.
+ * @cq_count: number of CQEs that will trigger an event
+ * @cq_period: max period of time in usec before triggering an event
+ *
*/
-int ib_modify_cq(struct ib_cq *cq,
- struct ib_cq_attr *cq_attr,
- int cq_attr_mask);
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
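+
+/*
+ * Illustrative sketch, not part of this patch: moderation is now expressed
+ * directly, e.g. at most one event per 16 completions or 10 usec:
+ *
+ *	err = ib_modify_cq(cq, 16, 10);
+ */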
/**
* ib_destroy_cq - Destroys the specified CQ.
@@ -2312,18 +2920,6 @@ static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
}
/**
- * ib_get_dma_mr - Returns a memory region for system memory that is
- * usable for DMA.
- * @pd: The protection domain associated with the memory region.
- * @mr_access_flags: Specifies the memory access rights.
- *
- * Note that the ib_dma_*() functions defined below must be used
- * to create/destroy addresses used with the Lkey or Rkey returned
- * by ib_get_dma_mr().
- */
-struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
-
-/**
* ib_dma_mapping_error - check a DMA addr for error
* @dev: The device for which the dma_addr was created
* @dma_addr: The DMA address to check
@@ -2371,19 +2967,19 @@ static inline void ib_dma_unmap_single(struct ib_device *dev,
static inline u64 ib_dma_map_single_attrs(struct ib_device *dev,
void *cpu_addr, size_t size,
enum dma_data_direction direction,
- struct dma_attrs *attrs)
+ struct dma_attrs *dma_attrs)
{
return dma_map_single_attrs(dev->dma_device, cpu_addr, size,
- direction, attrs);
+ direction, dma_attrs);
}
static inline void ib_dma_unmap_single_attrs(struct ib_device *dev,
u64 addr, size_t size,
enum dma_data_direction direction,
- struct dma_attrs *attrs)
+ struct dma_attrs *dma_attrs)
{
return dma_unmap_single_attrs(dev->dma_device, addr, size,
- direction, attrs);
+ direction, dma_attrs);
}
/**
@@ -2458,28 +3054,39 @@ static inline void ib_dma_unmap_sg(struct ib_device *dev,
static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
struct scatterlist *sg, int nents,
enum dma_data_direction direction,
- struct dma_attrs *attrs)
+ struct dma_attrs *dma_attrs)
{
- return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+ if (dev->dma_ops)
+ return dev->dma_ops->map_sg_attrs(dev, sg, nents, direction,
+ dma_attrs);
+ else
+ return dma_map_sg_attrs(dev->dma_device, sg, nents, direction,
+ dma_attrs);
}
static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
struct scatterlist *sg, int nents,
enum dma_data_direction direction,
- struct dma_attrs *attrs)
+ struct dma_attrs *dma_attrs)
{
- dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+ if (dev->dma_ops)
+ return dev->dma_ops->unmap_sg_attrs(dev, sg, nents, direction,
+ dma_attrs);
+ else
+ dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction,
+ dma_attrs);
}
/**
* ib_sg_dma_address - Return the DMA address from a scatter/gather entry
* @dev: The device for which the DMA addresses were created
* @sg: The scatter/gather entry
+ *
+ * Note: this function is obsolete. To do: change all occurrences of
+ * ib_sg_dma_address() into sg_dma_address().
*/
static inline u64 ib_sg_dma_address(struct ib_device *dev,
struct scatterlist *sg)
{
- if (dev->dma_ops)
- return dev->dma_ops->dma_address(dev, sg);
return sg_dma_address(sg);
}
@@ -2487,12 +3094,13 @@ static inline u64 ib_sg_dma_address(struct ib_device *dev,
* ib_sg_dma_len - Return the DMA length from a scatter/gather entry
* @dev: The device for which the DMA addresses were created
* @sg: The scatter/gather entry
+ *
+ * Note: this function is obsolete. To do: change all occurrences of
+ * ib_sg_dma_len() into sg_dma_len().
*/
static inline unsigned int ib_sg_dma_len(struct ib_device *dev,
struct scatterlist *sg)
{
- if (dev->dma_ops)
- return dev->dma_ops->dma_len(dev, sg);
return sg_dma_len(sg);
}
@@ -2574,59 +3182,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
}
/**
- * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
- * by an HCA.
- * @pd: The protection domain associated assigned to the registered region.
- * @phys_buf_array: Specifies a list of physical buffers to use in the
- * memory region.
- * @num_phys_buf: Specifies the size of the phys_buf_array.
- * @mr_access_flags: Specifies the memory access rights.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
-
-/**
- * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
- * Conceptually, this call performs the functions deregister memory region
- * followed by register physical memory region. Where possible,
- * resources are reused instead of deallocated and reallocated.
- * @mr: The memory region to modify.
- * @mr_rereg_mask: A bit-mask used to indicate which of the following
- * properties of the memory region are being modified.
- * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
- * the new protection domain to associated with the memory region,
- * otherwise, this parameter is ignored.
- * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- * field specifies a list of physical buffers to use in the new
- * translation, otherwise, this parameter is ignored.
- * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- * field specifies the size of the phys_buf_array, otherwise, this
- * parameter is ignored.
- * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
- * field specifies the new memory access rights, otherwise, this
- * parameter is ignored.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-int ib_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
-
-/**
- * ib_query_mr - Retrieves information about a specific memory region.
- * @mr: The memory region to retrieve information about.
- * @mr_attr: The attributes of the specified memory region.
- */
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
-
-/**
* ib_dereg_mr - Deregisters a memory region and removes it from the
* HCA translation table.
* @mr: The memory region to deregister.
@@ -2635,60 +3190,9 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
*/
int ib_dereg_mr(struct ib_mr *mr);
-
-/**
- * ib_create_mr - Allocates a memory region that may be used for
- * signature handover operations.
- * @pd: The protection domain associated with the region.
- * @mr_init_attr: memory region init attributes.
- */
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
- struct ib_mr_init_attr *mr_init_attr);
-
-/**
- * ib_destroy_mr - Destroys a memory region that was created using
- * ib_create_mr and removes it from HW translation tables.
- * @mr: The memory region to destroy.
- *
- * This function can fail, if the memory region has memory windows bound to it.
- */
-int ib_destroy_mr(struct ib_mr *mr);
-
-/**
- * ib_alloc_fast_reg_mr - Allocates memory region usable with the
- * IB_WR_FAST_REG_MR send work request.
- * @pd: The protection domain associated with the region.
- * @max_page_list_len: requested max physical buffer list length to be
- * used with fast register work requests for this MR.
- */
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
-
-/**
- * ib_alloc_fast_reg_page_list - Allocates a page list array
- * @device - ib device pointer.
- * @page_list_len - size of the page list array to be allocated.
- *
- * This allocates and returns a struct ib_fast_reg_page_list * and a
- * page_list array that is at least page_list_len in size. The actual
- * size is returned in max_page_list_len. The caller is responsible
- * for initializing the contents of the page_list array before posting
- * a send work request with the IB_WC_FAST_REG_MR opcode.
- *
- * The page_list array entries must be translated using one of the
- * ib_dma_*() functions just like the addresses passed to
- * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct
- * ib_fast_reg_page_list must not be modified by the caller until the
- * IB_WC_FAST_REG_MR work request completes.
- */
-struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
- struct ib_device *device, int page_list_len);
-
-/**
- * ib_free_fast_reg_page_list - Deallocates a previously allocated
- * page list array.
- * @page_list - struct ib_fast_reg_page_list pointer to be deallocated.
- */
-void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg);
/**
* ib_update_fast_reg_key - updates the key portion of the fast_reg MR
@@ -2714,42 +3218,6 @@ static inline u32 ib_inc_rkey(u32 rkey)
}
/**
- * ib_alloc_mw - Allocates a memory window.
- * @pd: The protection domain associated with the memory window.
- * @type: The type of the memory window (1 or 2).
- */
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-
-/**
- * ib_bind_mw - Posts a work request to the send queue of the specified
- * QP, which binds the memory window to the given address range and
- * remote access attributes.
- * @qp: QP to post the bind work request on.
- * @mw: The memory window to bind.
- * @mw_bind: Specifies information about the memory window, including
- * its address range, remote access rights, and associated memory region.
- *
- * If there is no immediate error, the function will update the rkey member
- * of the mw parameter to its new value. The bind operation can still fail
- * asynchronously.
- */
-static inline int ib_bind_mw(struct ib_qp *qp,
- struct ib_mw *mw,
- struct ib_mw_bind *mw_bind)
-{
- /* XXX reference counting in corresponding MR? */
- return mw->device->bind_mw ?
- mw->device->bind_mw(qp, mw, mw_bind) :
- -ENOSYS;
-}
-
-/**
- * ib_dealloc_mw - Deallocates a memory window.
- * @mw: The memory window to deallocate.
- */
-int ib_dealloc_mw(struct ib_mw *mw);
-
-/**
* ib_alloc_fmr - Allocates a unmapped fast memory region.
* @pd: The protection domain associated with the unmapped region.
* @mr_access_flags: Specifies the memory access rights.
@@ -2826,47 +3294,6 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp,
struct ib_flow_attr *flow_attr, int domain);
int ib_destroy_flow(struct ib_flow *flow_id);
-struct ib_dct *ib_create_dct(struct ib_pd *pd, struct ib_dct_init_attr *attr,
- struct ib_udata *udata);
-int ib_destroy_dct(struct ib_dct *dct);
-int ib_query_dct(struct ib_dct *dct, struct ib_dct_attr *attr);
-
-int ib_query_values(struct ib_device *device,
- int q_values, struct ib_device_values *values);
-
-static inline void ib_active_speed_enum_to_rate(u8 active_speed,
- int *rate,
- char **speed)
-{
- switch (active_speed) {
- case IB_SPEED_DDR:
- *speed = " DDR";
- *rate = 50;
- break;
- case IB_SPEED_QDR:
- *speed = " QDR";
- *rate = 100;
- break;
- case IB_SPEED_FDR10:
- *speed = " FDR10";
- *rate = 100;
- break;
- case IB_SPEED_FDR:
- *speed = " FDR";
- *rate = 140;
- break;
- case IB_SPEED_EDR:
- *speed = " EDR";
- *rate = 250;
- break;
- case IB_SPEED_SDR:
- default: /* default to SDR for invalid rates */
- *rate = 25;
- break;
- }
-
-}
-
static inline int ib_check_mr_access(int flags)
{
/*
@@ -2895,4 +3322,38 @@ static inline int ib_check_mr_access(int flags)
int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
struct ib_mr_status *mr_status);
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
+ u16 pkey, const union ib_gid *gid,
+ const struct sockaddr *addr);
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr);
+int ib_destroy_wq(struct ib_wq *wq);
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr,
+ u32 wq_attr_mask);
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+ struct ib_rwq_ind_table_init_attr*
+ wq_ind_table_init_attr);
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
+
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size);
+
+static inline int
+ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size)
+{
+ int n;
+
+ n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size);
+ mr->iova = 0;
+
+ return n;
+}
+
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+ unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64));
+
+void ib_drain_rq(struct ib_qp *qp);
+void ib_drain_sq(struct ib_qp *qp);
+void ib_drain_qp(struct ib_qp *qp);
#endif /* IB_VERBS_H */
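+
+/*
+ * Illustrative sketch, not part of this patch: the removed fast_reg and
+ * ib_create_mr interfaces are replaced by ib_alloc_mr() + ib_map_mr_sg()
+ * followed by an IB_WR_REG_MR work request.  Scatterlist setup, rkey
+ * invalidation and fuller error unwinding are assumed to live elsewhere.
+ *
+ *	static int example_fast_register(struct ib_qp *qp, struct ib_pd *pd,
+ *					 struct scatterlist *sg, int sg_nents)
+ *	{
+ *		struct ib_reg_wr reg = {};
+ *		struct ib_send_wr *bad_wr;
+ *		struct ib_mr *mr;
+ *		int n;
+ *
+ *		mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, sg_nents);
+ *		if (IS_ERR(mr))
+ *			return PTR_ERR(mr);
+ *
+ *		n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
+ *		if (n != sg_nents) {
+ *			ib_dereg_mr(mr);
+ *			return n < 0 ? n : -EINVAL;
+ *		}
+ *
+ *		reg.wr.opcode = IB_WR_REG_MR;
+ *		reg.mr = mr;
+ *		reg.key = mr->rkey;
+ *		reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
+ *
+ *		return ib_post_send(qp, &reg.wr, &bad_wr);
+ *	}
+ */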
diff --git a/sys/ofed/include/rdma/ib_verbs_exp.h b/sys/ofed/include/rdma/ib_verbs_exp.h
deleted file mode 100644
index ca5b84b5c76d7..0000000000000
--- a/sys/ofed/include/rdma/ib_verbs_exp.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
- * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
- * Copyright (c) 2004 Intel Corporation. All rights reserved.
- * Copyright (c) 2004 Topspin Corporation. All rights reserved.
- * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IB_VERBS_EXP_H
-#define IB_VERBS_EXP_H
-
-#include <rdma/ib_verbs.h>
-
-
-enum ib_exp_device_cap_flags2 {
- IB_EXP_DEVICE_DC_TRANSPORT = 1 << 0,
- IB_EXP_DEVICE_QPG = 1 << 1,
- IB_EXP_DEVICE_UD_RSS = 1 << 2,
- IB_EXP_DEVICE_UD_TSS = 1 << 3
-};
-
-enum ib_exp_device_attr_comp_mask {
- IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK = 1ULL << 1,
- IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK = 1ULL << 2,
- IB_EXP_DEVICE_ATTR_CAP_FLAGS2 = 1ULL << 3,
- IB_EXP_DEVICE_ATTR_DC_REQ_RD = 1ULL << 4,
- IB_EXP_DEVICE_ATTR_DC_RES_RD = 1ULL << 5,
- IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ = 1ULL << 6,
- IB_EXP_DEVICE_ATTR_RSS_TBL_SZ = 1ULL << 7,
-};
-
-struct ib_exp_device_attr {
- struct ib_device_attr base;
- /* Use IB_EXP_DEVICE_ATTR_... for exp_comp_mask */
- uint32_t exp_comp_mask;
- uint64_t device_cap_flags2;
- uint32_t dc_rd_req;
- uint32_t dc_rd_res;
- uint32_t inline_recv_sz;
- uint32_t max_rss_tbl_sz;
-};
-
-struct ib_exp_qp_init_attr {
- void (*event_handler)(struct ib_event *, void *);
- void *qp_context;
- struct ib_cq *send_cq;
- struct ib_cq *recv_cq;
- struct ib_srq *srq;
- struct ib_xrcd *xrcd; /* XRC TGT QPs only */
- struct ib_qp_cap cap;
- union {
- struct ib_qp *qpg_parent; /* see qpg_type */
- struct ib_qpg_init_attrib parent_attrib;
- };
- enum ib_sig_type sq_sig_type;
- enum ib_qp_type qp_type;
- enum ib_qp_create_flags create_flags;
- enum ib_qpg_type qpg_type;
- u8 port_num; /* special QP types only */
- u32 max_inl_recv;
-};
-
-
-int ib_exp_query_device(struct ib_device *device,
- struct ib_exp_device_attr *device_attr);
-
-
-
-
-#endif /* IB_VERBS_EXP_H */
diff --git a/sys/ofed/include/rdma/iw_cm.h b/sys/ofed/include/rdma/iw_cm.h
index a246e6176c6cd..1a919be827a72 100644
--- a/sys/ofed/include/rdma/iw_cm.h
+++ b/sys/ofed/include/rdma/iw_cm.h
@@ -1,7 +1,6 @@
/*
* Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -50,12 +49,11 @@ enum iw_cm_event_type {
struct iw_cm_event {
enum iw_cm_event_type event;
int status;
- struct sockaddr_in local_addr;
- struct sockaddr_in remote_addr;
+ struct sockaddr_storage local_addr;
+ struct sockaddr_storage remote_addr;
void *private_data;
void *provider_data;
u8 private_data_len;
- struct socket *so;
u8 ord;
u8 ird;
};
@@ -85,15 +83,17 @@ struct iw_cm_id {
iw_cm_handler cm_handler; /* client callback function */
void *context; /* client cb context */
struct ib_device *device;
- struct sockaddr_in local_addr;
- struct sockaddr_in remote_addr;
+ struct sockaddr_storage local_addr; /* local addr */
+ struct sockaddr_storage remote_addr;
+ struct sockaddr_storage m_local_addr; /* mapped local addr */
+ struct sockaddr_storage m_remote_addr; /* mapped rem addr */
void *provider_data; /* provider private data */
iw_event_handler event_handler; /* cb for provider
events */
/* Used by provider to add and remove refs on IW cm_id */
void (*add_ref)(struct iw_cm_id *);
void (*rem_ref)(struct iw_cm_id *);
- struct socket *so;
+ u8 tos;
};
struct iw_cm_conn_param {
@@ -121,13 +121,11 @@ struct iw_cm_verbs {
int (*reject)(struct iw_cm_id *cm_id,
const void *pdata, u8 pdata_len);
- int (*create_listen_ep)(struct iw_cm_id *cm_id,
+ int (*create_listen)(struct iw_cm_id *cm_id,
int backlog);
- void (*destroy_listen_ep)(struct iw_cm_id *cm_id);
-
- void (*newconn)(struct iw_cm_id *parent_cm_id,
- struct socket *so);
+ int (*destroy_listen)(struct iw_cm_id *cm_id);
+ char ifname[IFNAMSIZ];
};
/**
@@ -138,7 +136,7 @@ struct iw_cm_verbs {
* returned IW CM identifier.
* @context: User specified context associated with the id.
*/
-struct iw_cm_id *iw_create_cm_id(struct ib_device *device, struct socket *so,
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
iw_cm_handler cm_handler, void *context);
/**
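
The iw_cm.h hunks above drop the FreeBSD-specific struct socket plumbing and widen the connection addresses to sockaddr_storage, so both IPv4 and IPv6 fit. A minimal sketch of how a kernel consumer might use the socket-less iw_create_cm_id() prototype follows; the handler, context, port number and include lines are illustrative assumptions, shown only to demonstrate the changed signature.

#include <linux/err.h>
#include <netinet/in.h>
#include <rdma/iw_cm.h>

/* Hedged sketch, not part of the patch: create a cm_id with the new
 * prototype and fill an IPv4 listen address into the
 * sockaddr_storage-typed local_addr field. */
static int my_iw_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event)
{
	/* event->local_addr / event->remote_addr are now sockaddr_storage */
	return 0;
}

static struct iw_cm_id *my_setup_listen_id(struct ib_device *dev, void *ctx)
{
	struct iw_cm_id *cm_id;
	struct sockaddr_in *sin;

	cm_id = iw_create_cm_id(dev, my_iw_handler, ctx);	/* no socket arg */
	if (IS_ERR(cm_id))
		return cm_id;

	sin = (struct sockaddr_in *)&cm_id->local_addr;
	sin->sin_family = AF_INET;
	sin->sin_port = htons(4791);		/* hypothetical port */
	sin->sin_addr.s_addr = htonl(INADDR_ANY);
	return cm_id;
}
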
diff --git a/sys/ofed/include/rdma/iw_portmap.h b/sys/ofed/include/rdma/iw_portmap.h
new file mode 100644
index 0000000000000..9393f50b6e4e8
--- /dev/null
+++ b/sys/ofed/include/rdma/iw_portmap.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IW_PORTMAP_H
+#define _IW_PORTMAP_H
+
+#define IWPM_ULIBNAME_SIZE 32
+#define IWPM_DEVNAME_SIZE 32
+#define IWPM_IFNAME_SIZE 16
+#define IWPM_IPADDR_SIZE 16
+
+enum {
+ IWPM_INVALID_NLMSG_ERR = 10,
+ IWPM_CREATE_MAPPING_ERR,
+ IWPM_DUPLICATE_MAPPING_ERR,
+ IWPM_UNKNOWN_MAPPING_ERR,
+ IWPM_CLIENT_DEV_INFO_ERR,
+ IWPM_USER_LIB_INFO_ERR,
+ IWPM_REMOTE_QUERY_REJECT
+};
+
+struct iwpm_dev_data {
+ char dev_name[IWPM_DEVNAME_SIZE];
+ char if_name[IWPM_IFNAME_SIZE];
+};
+
+struct iwpm_sa_data {
+ struct sockaddr_storage loc_addr;
+ struct sockaddr_storage mapped_loc_addr;
+ struct sockaddr_storage rem_addr;
+ struct sockaddr_storage mapped_rem_addr;
+};
+
+/**
+ * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
+ *
+ * Returns true if the pid is greater than zero, otherwise returns false
+ */
+int iwpm_valid_pid(void);
+
+#endif /* _IW_PORTMAP_H */
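
iw_portmap.h keeps the pre- and post-mapping addresses side by side in iwpm_sa_data, and iwpm_valid_pid() gates any interaction with the userspace port mapper. A hedged sketch of how a driver might use the structure; record_mapping() is a hypothetical helper.

#include <rdma/iw_portmap.h>

/* Sketch only: remember both the original and (initially identical)
 * mapped addresses, and skip the port mapper when it is not running. */
static void record_mapping(struct iwpm_sa_data *pm,
			   const struct sockaddr_storage *loc,
			   const struct sockaddr_storage *rem)
{
	pm->loc_addr = *loc;
	pm->rem_addr = *rem;
	pm->mapped_loc_addr = *loc;	/* until the mapper answers */
	pm->mapped_rem_addr = *rem;

	if (!iwpm_valid_pid())
		return;		/* no userspace port mapper registered */
	/* ...query the port mapper and overwrite mapped_*_addr here... */
}
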
diff --git a/sys/ofed/include/rdma/opa_port_info.h b/sys/ofed/include/rdma/opa_port_info.h
new file mode 100644
index 0000000000000..9303e0e4f508d
--- /dev/null
+++ b/sys/ofed/include/rdma/opa_port_info.h
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(OPA_PORT_INFO_H)
+#define OPA_PORT_INFO_H
+
+#define OPA_PORT_LINK_MODE_NOP 0 /* No change */
+#define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */
+
+#define OPA_PORT_PACKET_FORMAT_NOP 0 /* No change */
+#define OPA_PORT_PACKET_FORMAT_8B 1 /* Format 8B */
+#define OPA_PORT_PACKET_FORMAT_9B 2 /* Format 9B */
+#define OPA_PORT_PACKET_FORMAT_10B 4 /* Format 10B */
+#define OPA_PORT_PACKET_FORMAT_16B 8 /* Format 16B */
+
+#define OPA_PORT_LTP_CRC_MODE_NONE 0 /* No change */
+#define OPA_PORT_LTP_CRC_MODE_14 1 /* 14-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_16 2 /* 16-bit LTP CRC mode */
+#define OPA_PORT_LTP_CRC_MODE_48 4 /* 48-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_PER_LANE 8 /* 12/16-bit per lane LTP CRC mode */
+
+/* Link Down / Neighbor Link Down Reason; indicated as follows: */
+#define OPA_LINKDOWN_REASON_NONE 0 /* No specified reason */
+#define OPA_LINKDOWN_REASON_RCV_ERROR_0 1
+#define OPA_LINKDOWN_REASON_BAD_PKT_LEN 2
+#define OPA_LINKDOWN_REASON_PKT_TOO_LONG 3
+#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT 4
+#define OPA_LINKDOWN_REASON_BAD_SLID 5
+#define OPA_LINKDOWN_REASON_BAD_DLID 6
+#define OPA_LINKDOWN_REASON_BAD_L2 7
+#define OPA_LINKDOWN_REASON_BAD_SC 8
+#define OPA_LINKDOWN_REASON_RCV_ERROR_8 9
+#define OPA_LINKDOWN_REASON_BAD_MID_TAIL 10
+#define OPA_LINKDOWN_REASON_RCV_ERROR_10 11
+#define OPA_LINKDOWN_REASON_PREEMPT_ERROR 12
+#define OPA_LINKDOWN_REASON_PREEMPT_VL15 13
+#define OPA_LINKDOWN_REASON_BAD_VL_MARKER 14
+#define OPA_LINKDOWN_REASON_RCV_ERROR_14 15
+#define OPA_LINKDOWN_REASON_RCV_ERROR_15 16
+#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST 17
+#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST 18
+#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST 19
+#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK 20
+#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER 21
+#define OPA_LINKDOWN_REASON_BAD_PREEMPT 22
+#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT 23
+#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT 24
+#define OPA_LINKDOWN_REASON_RCV_ERROR_24 25
+#define OPA_LINKDOWN_REASON_RCV_ERROR_25 26
+#define OPA_LINKDOWN_REASON_RCV_ERROR_26 27
+#define OPA_LINKDOWN_REASON_RCV_ERROR_27 28
+#define OPA_LINKDOWN_REASON_RCV_ERROR_28 29
+#define OPA_LINKDOWN_REASON_RCV_ERROR_29 30
+#define OPA_LINKDOWN_REASON_RCV_ERROR_30 31
+#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN 32
+#define OPA_LINKDOWN_REASON_UNKNOWN 33
+/* 34 reserved */
+#define OPA_LINKDOWN_REASON_REBOOT 35
+#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN 36
+/* 37-38 reserved */
+#define OPA_LINKDOWN_REASON_FM_BOUNCE 39
+#define OPA_LINKDOWN_REASON_SPEED_POLICY 40
+#define OPA_LINKDOWN_REASON_WIDTH_POLICY 41
+/* 42-48 reserved */
+#define OPA_LINKDOWN_REASON_DISCONNECTED 49
+#define OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED 50
+#define OPA_LINKDOWN_REASON_NOT_INSTALLED 51
+#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG 52
+/* 53 reserved */
+#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED 54
+/* 55 reserved */
+#define OPA_LINKDOWN_REASON_POWER_POLICY 56
+#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY 57
+#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY 58
+/* 59 reserved */
+#define OPA_LINKDOWN_REASON_SWITCH_MGMT 60
+#define OPA_LINKDOWN_REASON_SMA_DISABLED 61
+/* 62 reserved */
+#define OPA_LINKDOWN_REASON_TRANSIENT 63
+/* 64-255 reserved */
+
+/* OPA Link Init reason; indicated as follows: */
+/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */
+#define OPA_LINKINIT_REASON_NOP 0
+#define OPA_LINKINIT_REASON_LINKUP (1 << 4)
+#define OPA_LINKINIT_REASON_FLAPPING (2 << 4)
+#define OPA_LINKINIT_REASON_CLEAR (8 << 4)
+#define OPA_LINKINIT_OUTSIDE_POLICY (8 << 4)
+#define OPA_LINKINIT_QUARANTINED (9 << 4)
+#define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4)
+
+#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */
+#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */
+#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? Gbps (EDR) */
+
+#define OPA_LINK_WIDTH_1X 0x0001
+#define OPA_LINK_WIDTH_2X 0x0002
+#define OPA_LINK_WIDTH_3X 0x0004
+#define OPA_LINK_WIDTH_4X 0x0008
+
+#define OPA_CAP_MASK3_IsSnoopSupported (1 << 7)
+#define OPA_CAP_MASK3_IsAsyncSC2VLSupported (1 << 6)
+#define OPA_CAP_MASK3_IsAddrRangeConfigSupported (1 << 5)
+#define OPA_CAP_MASK3_IsPassThroughSupported (1 << 4)
+#define OPA_CAP_MASK3_IsSharedSpaceSupported (1 << 3)
+/* reserved (1 << 2) */
+#define OPA_CAP_MASK3_IsVLMarkerSupported (1 << 1)
+#define OPA_CAP_MASK3_IsVLrSupported (1 << 0)
+
+/**
+ * new MTU values
+ */
+enum {
+ OPA_MTU_8192 = 6,
+ OPA_MTU_10240 = 7,
+};
+
+enum {
+ OPA_PORT_PHYS_CONF_DISCONNECTED = 0,
+ OPA_PORT_PHYS_CONF_STANDARD = 1,
+ OPA_PORT_PHYS_CONF_FIXED = 2,
+ OPA_PORT_PHYS_CONF_VARIABLE = 3,
+ OPA_PORT_PHYS_CONF_SI_PHOTO = 4
+};
+
+enum port_info_field_masks {
+ /* vl.cap */
+ OPA_PI_MASK_VL_CAP = 0x1F,
+ /* port_states.ledenable_offlinereason */
+ OPA_PI_MASK_OFFLINE_REASON = 0x0F,
+ OPA_PI_MASK_LED_ENABLE = 0x40,
+ /* port_states.unsleepstate_downdefstate */
+ OPA_PI_MASK_UNSLEEP_STATE = 0xF0,
+ OPA_PI_MASK_DOWNDEF_STATE = 0x0F,
+ /* port_states.portphysstate_portstate */
+ OPA_PI_MASK_PORT_PHYSICAL_STATE = 0xF0,
+ OPA_PI_MASK_PORT_STATE = 0x0F,
+ /* port_phys_conf */
+ OPA_PI_MASK_PORT_PHYSICAL_CONF = 0x0F,
+ /* collectivemask_multicastmask */
+ OPA_PI_MASK_COLLECT_MASK = 0x38,
+ OPA_PI_MASK_MULTICAST_MASK = 0x07,
+ /* mkeyprotect_lmc */
+ OPA_PI_MASK_MKEY_PROT_BIT = 0xC0,
+ OPA_PI_MASK_LMC = 0x0F,
+ /* smsl */
+ OPA_PI_MASK_SMSL = 0x1F,
+ /* partenforce_filterraw */
+ /* Filter Raw In/Out bits 1 and 2 were removed */
+ OPA_PI_MASK_LINKINIT_REASON = 0xF0,
+ OPA_PI_MASK_PARTITION_ENFORCE_IN = 0x08,
+ OPA_PI_MASK_PARTITION_ENFORCE_OUT = 0x04,
+ /* operational_vls */
+ OPA_PI_MASK_OPERATIONAL_VL = 0x1F,
+ /* sa_qp */
+ OPA_PI_MASK_SA_QP = 0x00FFFFFF,
+ /* sm_trap_qp */
+ OPA_PI_MASK_SM_TRAP_QP = 0x00FFFFFF,
+ /* localphy_overrun_errors */
+ OPA_PI_MASK_LOCAL_PHY_ERRORS = 0xF0,
+ OPA_PI_MASK_OVERRUN_ERRORS = 0x0F,
+ /* clientrereg_subnettimeout */
+ OPA_PI_MASK_CLIENT_REREGISTER = 0x80,
+ OPA_PI_MASK_SUBNET_TIMEOUT = 0x1F,
+ /* port_link_mode */
+ OPA_PI_MASK_PORT_LINK_SUPPORTED = (0x001F << 10),
+ OPA_PI_MASK_PORT_LINK_ENABLED = (0x001F << 5),
+ OPA_PI_MASK_PORT_LINK_ACTIVE = (0x001F << 0),
+ /* port_link_crc_mode */
+ OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED = 0x0F00,
+ OPA_PI_MASK_PORT_LINK_CRC_ENABLED = 0x00F0,
+ OPA_PI_MASK_PORT_LINK_CRC_ACTIVE = 0x000F,
+ /* port_mode */
+ OPA_PI_MASK_PORT_MODE_SECURITY_CHECK = 0x0001,
+ OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY = 0x0002,
+ OPA_PI_MASK_PORT_MODE_PKEY_CONVERT = 0x0004,
+ OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING = 0x0008,
+ OPA_PI_MASK_PORT_MODE_VL_MARKER = 0x0010,
+ OPA_PI_MASK_PORT_PASS_THROUGH = 0x0020,
+ OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE = 0x0040,
+ /* flit_control.interleave */
+ OPA_PI_MASK_INTERLEAVE_DIST_SUP = (0x0003 << 12),
+ OPA_PI_MASK_INTERLEAVE_DIST_ENABLE = (0x0003 << 10),
+ OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX = (0x001F << 5),
+ OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX = (0x001F << 0),
+
+ /* port_error_action */
+ OPA_PI_MASK_EX_BUFFER_OVERRUN = 0x80000000,
+ /* 7 bits reserved */
+ OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT = 0x00800000,
+ OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT = 0x00400000,
+ OPA_PI_MASK_FM_CFG_BAD_PREEMPT = 0x00200000,
+ OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER = 0x00100000,
+ OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK = 0x00080000,
+ OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST = 0x00040000,
+ OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST = 0x00020000,
+ OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST = 0x00010000,
+ /* 2 bits reserved */
+ OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER = 0x00002000,
+ OPA_PI_MASK_PORT_RCV_PREEMPT_VL15 = 0x00001000,
+ OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR = 0x00000800,
+ /* 1 bit reserved */
+ OPA_PI_MASK_PORT_RCV_BAD_MidTail = 0x00000200,
+ /* 1 bit reserved */
+ OPA_PI_MASK_PORT_RCV_BAD_SC = 0x00000080,
+ OPA_PI_MASK_PORT_RCV_BAD_L2 = 0x00000040,
+ OPA_PI_MASK_PORT_RCV_BAD_DLID = 0x00000020,
+ OPA_PI_MASK_PORT_RCV_BAD_SLID = 0x00000010,
+ OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT = 0x00000008,
+ OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG = 0x00000004,
+ OPA_PI_MASK_PORT_RCV_BAD_PKTLEN = 0x00000002,
+ OPA_PI_MASK_PORT_RCV_BAD_LT = 0x00000001,
+
+ /* pass_through.res_drctl */
+ OPA_PI_MASK_PASS_THROUGH_DR_CONTROL = 0x01,
+
+ /* buffer_units */
+ OPA_PI_MASK_BUF_UNIT_VL15_INIT = (0x00000FFF << 11),
+ OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE = (0x0000001F << 6),
+ OPA_PI_MASK_BUF_UNIT_CREDIT_ACK = (0x00000003 << 3),
+ OPA_PI_MASK_BUF_UNIT_BUF_ALLOC = (0x00000003 << 0),
+
+ /* neigh_mtu.pvlx_to_mtu */
+ OPA_PI_MASK_NEIGH_MTU_PVL0 = 0xF0,
+ OPA_PI_MASK_NEIGH_MTU_PVL1 = 0x0F,
+
+ /* neigh_mtu.vlstall_hoq_life */
+ OPA_PI_MASK_VL_STALL = (0x03 << 5),
+ OPA_PI_MASK_HOQ_LIFE = (0x1F << 0),
+
+ /* port_neigh_mode */
+ OPA_PI_MASK_NEIGH_MGMT_ALLOWED = (0x01 << 3),
+ OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS = (0x01 << 2),
+ OPA_PI_MASK_NEIGH_NODE_TYPE = (0x03 << 0),
+
+ /* resptime_value */
+ OPA_PI_MASK_RESPONSE_TIME_VALUE = 0x1F,
+
+ /* mtucap */
+ OPA_PI_MASK_MTU_CAP = 0x0F,
+};
+
+struct opa_port_states {
+ u8 reserved;
+ u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */
+ u8 reserved2;
+ u8 portphysstate_portstate; /* 4 bits, 4 bits */
+};
+
+struct opa_port_state_info {
+ struct opa_port_states port_states;
+ __be16 link_width_downgrade_tx_active;
+ __be16 link_width_downgrade_rx_active;
+};
+
+struct opa_port_info {
+ __be32 lid;
+ __be32 flow_control_mask;
+
+ struct {
+ u8 res; /* was inittype */
+ u8 cap; /* 3 res, 5 bits */
+ __be16 high_limit;
+ __be16 preempt_limit;
+ u8 arb_high_cap;
+ u8 arb_low_cap;
+ } vl;
+
+ struct opa_port_states port_states;
+ u8 port_phys_conf; /* 4 res, 4 bits */
+ u8 collectivemask_multicastmask; /* 2 res, 3, 3 */
+ u8 mkeyprotect_lmc; /* 2 bits, 2 res, 4 bits */
+ u8 smsl; /* 3 res, 5 bits */
+
+ u8 partenforce_filterraw; /* bit fields */
+ u8 operational_vls; /* 3 res, 5 bits */
+ __be16 pkey_8b;
+ __be16 pkey_10b;
+ __be16 mkey_violations;
+
+ __be16 pkey_violations;
+ __be16 qkey_violations;
+ __be32 sm_trap_qp; /* 8 bits, 24 bits */
+
+ __be32 sa_qp; /* 8 bits, 24 bits */
+ u8 neigh_port_num;
+ u8 link_down_reason;
+ u8 neigh_link_down_reason;
+ u8 clientrereg_subnettimeout; /* 1 bit, 2 bits, 5 */
+
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ __be16 active;
+ } link_speed;
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ __be16 active;
+ } link_width;
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ __be16 tx_active;
+ __be16 rx_active;
+ } link_width_downgrade;
+ __be16 port_link_mode; /* 1 res, 5 bits, 5 bits, 5 bits */
+ __be16 port_ltp_crc_mode; /* 4 res, 4 bits, 4 bits, 4 bits */
+
+ __be16 port_mode; /* 9 res, bit fields */
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ } port_packet_format;
+ struct {
+ __be16 interleave; /* 2 res, 2,2,5,5 */
+ struct {
+ __be16 min_initial;
+ __be16 min_tail;
+ u8 large_pkt_limit;
+ u8 small_pkt_limit;
+ u8 max_small_pkt_limit;
+ u8 preemption_limit;
+ } preemption;
+ } flit_control;
+
+ __be32 reserved4;
+ __be32 port_error_action; /* bit field */
+
+ struct {
+ u8 egress_port;
+ u8 res_drctl; /* 7 res, 1 */
+ } pass_through;
+ __be16 mkey_lease_period;
+ __be32 buffer_units; /* 9 res, 12, 5, 3, 3 */
+
+ __be32 reserved5;
+ __be32 sm_lid;
+
+ __be64 mkey;
+
+ __be64 subnet_prefix;
+
+ struct {
+ u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */
+ } neigh_mtu;
+
+ struct {
+ u8 vlstall_hoqlife; /* 3 bits, 5 bits */
+ } xmit_q[OPA_MAX_VLS];
+
+ struct {
+ u8 addr[16];
+ } ipaddr_ipv6;
+
+ struct {
+ u8 addr[4];
+ } ipaddr_ipv4;
+
+ u32 reserved6;
+ u32 reserved7;
+ u32 reserved8;
+
+ __be64 neigh_node_guid;
+
+ __be32 ib_cap_mask;
+ __be16 reserved9; /* was ib_cap_mask2 */
+ __be16 opa_cap_mask;
+
+ __be32 reserved10; /* was link_roundtrip_latency */
+ __be16 overall_buffer_space;
+ __be16 reserved11; /* was max_credit_hint */
+
+ __be16 diag_code;
+ struct {
+ u8 buffer;
+ u8 wire;
+ } replay_depth;
+ u8 port_neigh_mode;
+ u8 mtucap; /* 4 res, 4 bits */
+
+ u8 resptimevalue; /* 3 res, 5 bits */
+ u8 local_port_num;
+ u8 reserved12;
+ u8 reserved13; /* was guid_cap */
+} __attribute__ ((packed));
+
+#endif /* OPA_PORT_INFO_H */
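
The port_info_field_masks above describe how several logical fields are packed into single bytes or words. A hedged sketch of unpacking the port state byte follows; the helper names are not part of the header.

#include <rdma/opa_port_info.h>

/* Sketch only: extract the two 4-bit fields packed into
 * opa_port_states.portphysstate_portstate. */
static inline u8 my_opa_port_state(const struct opa_port_states *ps)
{
	return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
}

static inline u8 my_opa_port_phys_state(const struct opa_port_states *ps)
{
	return (ps->portphysstate_portstate &
		OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
}
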
diff --git a/sys/ofed/include/rdma/opa_smi.h b/sys/ofed/include/rdma/opa_smi.h
new file mode 100644
index 0000000000000..3c21923d52092
--- /dev/null
+++ b/sys/ofed/include/rdma/opa_smi.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(OPA_SMI_H)
+#define OPA_SMI_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#define OPA_SMP_LID_DATA_SIZE 2016
+#define OPA_SMP_DR_DATA_SIZE 1872
+#define OPA_SMP_MAX_PATH_HOPS 64
+
+#define OPA_MAX_VLS 32
+#define OPA_MAX_SLS 32
+#define OPA_MAX_SCS 32
+
+#define OPA_SMI_CLASS_VERSION 0x80
+
+#define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF)
+
+struct opa_smp {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ u8 hop_ptr;
+ u8 hop_cnt;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+ __be64 mkey;
+ union {
+ struct {
+ uint8_t data[OPA_SMP_LID_DATA_SIZE];
+ } lid;
+ struct {
+ __be32 dr_slid;
+ __be32 dr_dlid;
+ u8 initial_path[OPA_SMP_MAX_PATH_HOPS];
+ u8 return_path[OPA_SMP_MAX_PATH_HOPS];
+ u8 reserved[8];
+ u8 data[OPA_SMP_DR_DATA_SIZE];
+ } dr;
+ } route;
+} __packed;
+
+
+/* Subnet management attributes */
+/* ... */
+#define OPA_ATTRIB_ID_NODE_DESCRIPTION cpu_to_be16(0x0010)
+#define OPA_ATTRIB_ID_NODE_INFO cpu_to_be16(0x0011)
+#define OPA_ATTRIB_ID_PORT_INFO cpu_to_be16(0x0015)
+#define OPA_ATTRIB_ID_PARTITION_TABLE cpu_to_be16(0x0016)
+#define OPA_ATTRIB_ID_SL_TO_SC_MAP cpu_to_be16(0x0017)
+#define OPA_ATTRIB_ID_VL_ARBITRATION cpu_to_be16(0x0018)
+#define OPA_ATTRIB_ID_SM_INFO cpu_to_be16(0x0020)
+#define OPA_ATTRIB_ID_CABLE_INFO cpu_to_be16(0x0032)
+#define OPA_ATTRIB_ID_AGGREGATE cpu_to_be16(0x0080)
+#define OPA_ATTRIB_ID_SC_TO_SL_MAP cpu_to_be16(0x0082)
+#define OPA_ATTRIB_ID_SC_TO_VLR_MAP cpu_to_be16(0x0083)
+#define OPA_ATTRIB_ID_SC_TO_VLT_MAP cpu_to_be16(0x0084)
+#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP cpu_to_be16(0x0085)
+/* ... */
+#define OPA_ATTRIB_ID_PORT_STATE_INFO cpu_to_be16(0x0087)
+/* ... */
+#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE cpu_to_be16(0x008A)
+/* ... */
+
+struct opa_node_description {
+ u8 data[64];
+} __attribute__ ((packed));
+
+struct opa_node_info {
+ u8 base_version;
+ u8 class_version;
+ u8 node_type;
+ u8 num_ports;
+ __be32 reserved;
+ __be64 system_image_guid;
+ __be64 node_guid;
+ __be64 port_guid;
+ __be16 partition_cap;
+ __be16 device_id;
+ __be32 revision;
+ u8 local_port_num;
+ u8 vendor_id[3]; /* network byte order */
+} __attribute__ ((packed));
+
+#define OPA_PARTITION_TABLE_BLK_SIZE 32
+
+static inline u8
+opa_get_smp_direction(const struct opa_smp *smp)
+{
+ return ib_get_smp_direction((const struct ib_smp *)smp);
+}
+
+static inline u8 *opa_get_smp_data(struct opa_smp *smp)
+{
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ return smp->route.dr.data;
+
+ return smp->route.lid.data;
+}
+
+static inline size_t opa_get_smp_data_size(const struct opa_smp *smp)
+{
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ return sizeof(smp->route.dr.data);
+
+ return sizeof(smp->route.lid.data);
+}
+
+static inline size_t opa_get_smp_header_size(const struct opa_smp *smp)
+{
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ return sizeof(*smp) - sizeof(smp->route.dr.data);
+
+ return sizeof(*smp) - sizeof(smp->route.lid.data);
+}
+
+#endif /* OPA_SMI_H */
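
The accessors at the end of opa_smi.h hide the difference between LID-routed and directed-route SMPs. A hedged one-liner showing the intended usage pattern; opa_clear_smp_payload() is a hypothetical helper.

#include <linux/string.h>
#include <rdma/opa_smi.h>

/* Sketch only: zero the attribute payload without caring whether the
 * SMP is LID-routed or directed-route. */
static void opa_clear_smp_payload(struct opa_smp *smp)
{
	memset(opa_get_smp_data(smp), 0, opa_get_smp_data_size(smp));
}
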
diff --git a/sys/ofed/include/rdma/peer_mem.h b/sys/ofed/include/rdma/peer_mem.h
deleted file mode 100644
index 85658831299c6..0000000000000
--- a/sys/ofed/include/rdma/peer_mem.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2013, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if !defined(PEER_MEM_H)
-#define PEER_MEM_H
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/errno.h>
-#include <linux/scatterlist.h>
-#include <linux/mutex.h>
-
-
-#define IB_PEER_MEMORY_NAME_MAX 64
-#define IB_PEER_MEMORY_VER_MAX 16
-
-struct peer_memory_client {
- char name[IB_PEER_MEMORY_NAME_MAX];
- char version[IB_PEER_MEMORY_VER_MAX];
- /* acquire return code: 1 mine, 0 - not mine */
- int (*acquire) (unsigned long addr, size_t size, void *peer_mem_private_data,
- char *peer_mem_name, void **client_context);
- int (*get_pages) (unsigned long addr,
- size_t size, int write, int force,
- struct sg_table *sg_head,
- void *client_context, void *core_context);
- int (*dma_map) (struct sg_table *sg_head, void *client_context,
- struct device *dma_device, int dmasync, int *nmap);
- int (*dma_unmap) (struct sg_table *sg_head, void *client_context,
- struct device *dma_device);
- void (*put_pages) (struct sg_table *sg_head, void *client_context);
- unsigned long (*get_page_size) (void *client_context);
- void (*release) (void *client_context);
-
-};
-
-typedef int (*invalidate_peer_memory)(void *reg_handle,
- void *core_context);
-
-void *ib_register_peer_memory_client(struct peer_memory_client *peer_client,
- invalidate_peer_memory *invalidate_callback);
-void ib_unregister_peer_memory_client(void *reg_handle);
-
-#endif
diff --git a/sys/ofed/include/rdma/rdma_cm.h b/sys/ofed/include/rdma/rdma_cm.h
index 33be95784d54b..7fee12cb36a85 100644
--- a/sys/ofed/include/rdma/rdma_cm.h
+++ b/sys/ofed/include/rdma/rdma_cm.h
@@ -1,7 +1,6 @@
/*
* Copyright (c) 2005 Voltaire Inc. All rights reserved.
* Copyright (c) 2005 Intel Corporation. All rights reserved.
- * Copyright (c) 2016 Chelsio Communications. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -60,13 +59,11 @@ enum rdma_cm_event_type {
RDMA_CM_EVENT_MULTICAST_JOIN,
RDMA_CM_EVENT_MULTICAST_ERROR,
RDMA_CM_EVENT_ADDR_CHANGE,
- RDMA_CM_EVENT_TIMEWAIT_EXIT,
- RDMA_CM_EVENT_ALT_ROUTE_RESOLVED,
- RDMA_CM_EVENT_ALT_ROUTE_ERROR,
- RDMA_CM_EVENT_LOAD_ALT_PATH,
- RDMA_CM_EVENT_ALT_PATH_LOADED,
+ RDMA_CM_EVENT_TIMEWAIT_EXIT
};
+const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event);
+
enum rdma_port_space {
RDMA_PS_SDP = 0x0001,
RDMA_PS_IPOIB = 0x0002,
@@ -75,12 +72,10 @@ enum rdma_port_space {
RDMA_PS_UDP = 0x0111,
};
-enum alt_path_type {
- RDMA_ALT_PATH_NONE,
- RDMA_ALT_PATH_PORT,
- RDMA_ALT_PATH_LID,
- RDMA_ALT_PATH_BEST
-};
+#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL
+#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL
+#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL
+#define RDMA_IB_IP_PS_IB 0x00000000013F0000ULL
struct rdma_addr {
struct sockaddr_storage src_addr;
@@ -105,6 +100,7 @@ struct rdma_conn_param {
/* Fields below ignored if a QP is created on the rdma_cm_id. */
u8 srq;
u32 qp_num;
+ u32 qkey;
};
struct rdma_ud_param {
@@ -113,7 +109,6 @@ struct rdma_ud_param {
struct ib_ah_attr ah_attr;
u32 qp_num;
u32 qkey;
- u8 alt_path_index;
};
struct rdma_cm_event {
@@ -160,19 +155,22 @@ struct rdma_cm_id {
enum rdma_port_space ps;
enum ib_qp_type qp_type;
u8 port_num;
- void *ucontext;
};
/**
* rdma_create_id - Create an RDMA identifier.
*
+ * @net: The network namespace in which to create the new id.
* @event_handler: User callback invoked to report events associated with the
* returned rdma_id.
* @context: User specified context associated with the id.
* @ps: RDMA port space.
* @qp_type: type of queue pair associated with the id.
+ *
+ * The id holds a reference on the network namespace until it is destroyed.
*/
-struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+struct rdma_cm_id *rdma_create_id(struct vnet *net,
+ rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps,
enum ib_qp_type qp_type);
@@ -223,19 +221,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms);
/**
- * rdma_enable_apm - Get ready to use APM for the given ID.
- * Actual Alternate path discovery and load will take place only
- * after a connection has been established.
- *
- * Calling this function only has an effect on the connection's client side.
- * It should be called after rdma_resolve_route and before rdma_connect.
- *
- * @id: RDMA identifier.
- * @alt_type: Alternate path type to resolve.
- */
-int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type);
-
-/**
* rdma_create_qp - Allocate a QP and associate it with the specified RDMA
* identifier.
*
@@ -348,11 +333,13 @@ int rdma_disconnect(struct rdma_cm_id *id);
* address.
* @id: Communication identifier associated with the request.
* @addr: Multicast address identifying the group to join.
+ * @join_state: Multicast JoinState bitmap requested by port.
+ * Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits.
* @context: User-defined context associated with the join request, returned
* to the user through the private_data pointer in multicast events.
*/
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
- void *context);
+ u8 join_state, void *context);
/**
* rdma_leave_multicast - Leave the multicast group specified by the given
@@ -394,14 +381,11 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
*/
int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
-/**
- * rdma_set_timeout - Set the QP timeout associated with a connection
- * identifier.
- * @id: Communication identifier to associated with service type.
- * @timeout: QP timeout
+ /**
+ * rdma_get_service_id - Return the IB service ID for a specified address.
+ * @id: Communication identifier associated with the address.
+ * @addr: Address for the service ID.
*/
-void rdma_set_timeout(struct rdma_cm_id *id, int timeout);
-int rdma_cma_any_addr(struct sockaddr *addr);
-int rdma_find_cmid_laddr(struct sockaddr_in *local_addr,
- unsigned short dev_type, void **cm_id);
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr);
+
#endif /* RDMA_CM_H */
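
The rdma_cm.h changes give rdma_create_id() a network namespace (vnet) argument and rdma_join_multicast() an explicit JoinState bitmap. A hedged sketch of the updated call sites; the handler, port space and the full-member value are illustrative assumptions.

#include <rdma/rdma_cm.h>

/* Sketch only: create an id in a given vnet and join a multicast group
 * as a full member (JoinState bit 0, per IB_SA_MCMEMBER_REC_JOIN_STATE). */
static int my_cm_handler(struct rdma_cm_id *id, struct rdma_cm_event *ev)
{
	return 0;
}

static struct rdma_cm_id *my_make_id(struct vnet *net, void *ctx)
{
	return rdma_create_id(net, my_cm_handler, ctx,
			      RDMA_PS_UDP, IB_QPT_UD);
}

static int my_join_group(struct rdma_cm_id *id, struct sockaddr *mcaddr,
			 void *ctx)
{
	return rdma_join_multicast(id, mcaddr, 1 /* full member */, ctx);
}
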
diff --git a/sys/ofed/include/rdma/rdma_user_cm.h b/sys/ofed/include/rdma/rdma_user_cm.h
index 4d9909994ddda..01923d463673e 100644
--- a/sys/ofed/include/rdma/rdma_user_cm.h
+++ b/sys/ofed/include/rdma/rdma_user_cm.h
@@ -34,6 +34,7 @@
#define RDMA_USER_CM_H
#include <linux/types.h>
+#include <linux/socket.h>
#include <linux/in6.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_user_sa.h>
@@ -45,8 +46,8 @@
enum {
RDMA_USER_CM_CMD_CREATE_ID,
RDMA_USER_CM_CMD_DESTROY_ID,
- RDMA_USER_CM_CMD_BIND_ADDR,
- RDMA_USER_CM_CMD_RESOLVE_ADDR,
+ RDMA_USER_CM_CMD_BIND_IP,
+ RDMA_USER_CM_CMD_RESOLVE_IP,
RDMA_USER_CM_CMD_RESOLVE_ROUTE,
RDMA_USER_CM_CMD_QUERY_ROUTE,
RDMA_USER_CM_CMD_CONNECT,
@@ -59,9 +60,13 @@ enum {
RDMA_USER_CM_CMD_GET_OPTION,
RDMA_USER_CM_CMD_SET_OPTION,
RDMA_USER_CM_CMD_NOTIFY,
- RDMA_USER_CM_CMD_JOIN_MCAST,
+ RDMA_USER_CM_CMD_JOIN_IP_MCAST,
RDMA_USER_CM_CMD_LEAVE_MCAST,
- RDMA_USER_CM_CMD_MIGRATE_ID
+ RDMA_USER_CM_CMD_MIGRATE_ID,
+ RDMA_USER_CM_CMD_QUERY,
+ RDMA_USER_CM_CMD_BIND,
+ RDMA_USER_CM_CMD_RESOLVE_ADDR,
+ RDMA_USER_CM_CMD_JOIN_MCAST
};
/*
@@ -95,28 +100,51 @@ struct rdma_ucm_destroy_id_resp {
__u32 events_reported;
};
-struct rdma_ucm_bind_addr {
+struct rdma_ucm_bind_ip {
__u64 response;
struct sockaddr_in6 addr;
__u32 id;
};
-struct rdma_ucm_resolve_addr {
+struct rdma_ucm_bind {
+ __u32 id;
+ __u16 addr_size;
+ __u16 reserved;
+ struct sockaddr_storage addr;
+};
+
+struct rdma_ucm_resolve_ip {
struct sockaddr_in6 src_addr;
struct sockaddr_in6 dst_addr;
__u32 id;
__u32 timeout_ms;
};
+struct rdma_ucm_resolve_addr {
+ __u32 id;
+ __u32 timeout_ms;
+ __u16 src_size;
+ __u16 dst_size;
+ __u32 reserved;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+};
+
struct rdma_ucm_resolve_route {
__u32 id;
__u32 timeout_ms;
};
-struct rdma_ucm_query_route {
+enum {
+ RDMA_USER_CM_QUERY_ADDR,
+ RDMA_USER_CM_QUERY_PATH,
+ RDMA_USER_CM_QUERY_GID
+};
+
+struct rdma_ucm_query {
__u64 response;
__u32 id;
- __u32 reserved;
+ __u32 option;
};
struct rdma_ucm_query_route_resp {
@@ -129,9 +157,26 @@ struct rdma_ucm_query_route_resp {
__u8 reserved[3];
};
+struct rdma_ucm_query_addr_resp {
+ __u64 node_guid;
+ __u8 port_num;
+ __u8 reserved;
+ __u16 pkey;
+ __u16 src_size;
+ __u16 dst_size;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+};
+
+struct rdma_ucm_query_path_resp {
+ __u32 num_paths;
+ __u32 reserved;
+ struct ib_path_rec_data path_data[0];
+};
+
struct rdma_ucm_conn_param {
__u32 qp_num;
- __u32 reserved;
+ __u32 qkey;
__u8 private_data[RDMA_MAX_PRIVATE_DATA];
__u8 private_data_len;
__u8 srq;
@@ -192,13 +237,29 @@ struct rdma_ucm_notify {
__u32 event;
};
-struct rdma_ucm_join_mcast {
+struct rdma_ucm_join_ip_mcast {
__u64 response; /* rdma_ucm_create_id_resp */
__u64 uid;
struct sockaddr_in6 addr;
__u32 id;
};
+/* Multicast join flags */
+enum {
+ RDMA_MC_JOIN_FLAG_FULLMEMBER,
+ RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER,
+ RDMA_MC_JOIN_FLAG_RESERVED,
+};
+
+struct rdma_ucm_join_mcast {
+ __u64 response; /* rdma_ucma_create_id_resp */
+ __u64 uid;
+ __u32 id;
+ __u16 addr_size;
+ __u16 join_flags;
+ struct sockaddr_storage addr;
+};
+
struct rdma_ucm_get_event {
__u64 response;
};
@@ -222,12 +283,10 @@ enum {
/* Option details */
enum {
- RDMA_OPTION_ID_TOS = 0,
+ RDMA_OPTION_ID_TOS = 0,
RDMA_OPTION_ID_REUSEADDR = 1,
RDMA_OPTION_ID_AFONLY = 2,
-
- RDMA_OPTION_IB_PATH = 1,
- RDMA_OPTION_IB_APM = 2,
+ RDMA_OPTION_IB_PATH = 1
};
struct rdma_ucm_set_option {
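
The new fixed-size commands in rdma_user_cm.h (RDMA_USER_CM_CMD_BIND, _RESOLVE_ADDR, _JOIN_MCAST) carry a sockaddr_storage plus an explicit address length, so one ABI covers IPv4, IPv6 and native IB addresses. A hedged userspace-style sketch of filling the bind payload; the include paths and helper name are assumptions.

#include <string.h>
#include <netinet/in.h>
#include <rdma/rdma_user_cm.h>

/* Sketch only: populate struct rdma_ucm_bind for an IPv4 address and
 * record the real sockaddr length in addr_size. */
static void fill_bind_cmd(struct rdma_ucm_bind *cmd, __u32 id,
			  const struct sockaddr_in *sin)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->id = id;
	cmd->addr_size = sizeof(*sin);
	memcpy(&cmd->addr, sin, sizeof(*sin));
}
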
diff --git a/sys/ofed/include/rdma/rdma_vt.h b/sys/ofed/include/rdma/rdma_vt.h
new file mode 100644
index 0000000000000..e31502107a58c
--- /dev/null
+++ b/sys/ofed/include/rdma/rdma_vt.h
@@ -0,0 +1,500 @@
+#ifndef DEF_RDMA_VT_H
+#define DEF_RDMA_VT_H
+
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Structure that low level drivers will populate in order to register with the
+ * rdmavt layer.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_mr.h>
+#include <rdma/rdmavt_qp.h>
+
+#define RVT_MAX_PKEY_VALUES 16
+
+struct rvt_ibport {
+ struct rvt_qp __rcu *qp[2];
+ struct ib_mad_agent *send_agent; /* agent for SMI (traps) */
+ struct rb_root mcast_tree;
+ spinlock_t lock; /* protect changes in this struct */
+
+ /* non-zero when timer is set */
+ unsigned long mkey_lease_timeout;
+ unsigned long trap_timeout;
+ __be64 gid_prefix; /* in network order */
+ __be64 mkey;
+ u64 tid;
+ u32 port_cap_flags;
+ u32 pma_sample_start;
+ u32 pma_sample_interval;
+ __be16 pma_counter_select[5];
+ u16 pma_tag;
+ u16 mkey_lease_period;
+ u16 sm_lid;
+ u8 sm_sl;
+ u8 mkeyprot;
+ u8 subnet_timeout;
+ u8 vl_high_limit;
+
+ /*
+ * Driver is expected to keep these up to date. These
+ * counters are informational only and not required to be
+ * completely accurate.
+ */
+ u64 n_rc_resends;
+ u64 n_seq_naks;
+ u64 n_rdma_seq;
+ u64 n_rnr_naks;
+ u64 n_other_naks;
+ u64 n_loop_pkts;
+ u64 n_pkt_drops;
+ u64 n_vl15_dropped;
+ u64 n_rc_timeouts;
+ u64 n_dmawait;
+ u64 n_unaligned;
+ u64 n_rc_dupreq;
+ u64 n_rc_seqnak;
+ u16 pkey_violations;
+ u16 qkey_violations;
+ u16 mkey_violations;
+
+ /* Hot-path per CPU counters to avoid cacheline trading to update */
+ u64 z_rc_acks;
+ u64 z_rc_qacks;
+ u64 z_rc_delayed_comp;
+ u64 __percpu *rc_acks;
+ u64 __percpu *rc_qacks;
+ u64 __percpu *rc_delayed_comp;
+
+ void *priv; /* driver private data */
+
+ /*
+ * The pkey table is allocated and maintained by the driver. Drivers
+ * need access to it before registering with rdmavt; however, rdmavt
+ * will also need access to it, so drivers must provide it during the
+ * attach port API call.
+ */
+ u16 *pkey_table;
+
+ struct rvt_ah *sm_ah;
+};
+
+#define RVT_CQN_MAX 16 /* maximum length of cq name */
+
+/*
+ * Things that are driver specific, module parameters in hfi1 and qib
+ */
+struct rvt_driver_params {
+ struct ib_device_attr props;
+
+ /*
+ * Anything driver specific that is not covered by props,
+ * for instance special module parameters, goes here.
+ */
+ unsigned int lkey_table_size;
+ unsigned int qp_table_size;
+ int qpn_start;
+ int qpn_inc;
+ int qpn_res_start;
+ int qpn_res_end;
+ int nports;
+ int npkeys;
+ char cq_name[RVT_CQN_MAX];
+ int node;
+ int psn_mask;
+ int psn_shift;
+ int psn_modify_mask;
+ u32 core_cap_flags;
+ u32 max_mad_size;
+ u8 qos_shift;
+ u8 max_rdma_atomic;
+ u8 reserved_operations;
+};
+
+/* Protection domain */
+struct rvt_pd {
+ struct ib_pd ibpd;
+ int user; /* non-zero if created from user space */
+};
+
+/* Address handle */
+struct rvt_ah {
+ struct ib_ah ibah;
+ struct ib_ah_attr attr;
+ atomic_t refcount;
+ u8 vl;
+ u8 log_pmtu;
+};
+
+struct rvt_dev_info;
+struct rvt_swqe;
+struct rvt_driver_provided {
+ /*
+ * Which functions are required depends on which verbs rdmavt is
+ * providing and which verbs the driver is overriding. See
+ * check_support() for details.
+ */
+
+ /* Passed to ib core registration. Callback to create syfs files */
+ int (*port_callback)(struct ib_device *, u8, struct kobject *);
+
+ /*
+ * Returns a string representing the device that is being
+ * registered. This is primarily used for error and debug messages on
+ * the console.
+ */
+ const char * (*get_card_name)(struct rvt_dev_info *rdi);
+
+ /*
+ * Returns a pointer to the underlying hardware's PCI device. This is
+ * used to display information about which hardware is being referenced
+ * in an output message.
+ */
+ struct pci_dev * (*get_pci_dev)(struct rvt_dev_info *rdi);
+
+ /*
+ * Allocate a private queue pair data structure for driver specific
+ * information which is opaque to rdmavt. Errors are returned via
+ * ERR_PTR(err). The driver is free to return NULL or a valid
+ * pointer.
+ */
+ void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ gfp_t gfp);
+
+ /*
+ * Free the driver's private qp structure.
+ */
+ void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+
+ /*
+ * Inform the driver that the particular qp in question has been reset so
+ * that it can clean up anything it needs to.
+ */
+ void (*notify_qp_reset)(struct rvt_qp *qp);
+
+ /*
+ * Give the driver notice that there is send work to do. It is up to
+ * the driver to push the packets out; this just queues the
+ * work with the driver. There are two variants here. The no_lock
+ * version requires the s_lock not to be held. The other assumes the
+ * s_lock is held.
+ */
+ void (*schedule_send)(struct rvt_qp *qp);
+ void (*schedule_send_no_lock)(struct rvt_qp *qp);
+
+ /*
+ * Sometimes rdmavt needs to kick the driver's send progress. That is
+ * done by this callback.
+ */
+ void (*do_send)(struct rvt_qp *qp);
+
+ /*
+ * Get a path mtu from the driver based on qp attributes.
+ */
+ int (*get_pmtu_from_attr)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_attr *attr);
+
+ /*
+ * Notify driver that it needs to flush any outstanding IO requests that
+ * are waiting on a qp.
+ */
+ void (*flush_qp_waiters)(struct rvt_qp *qp);
+
+ /*
+ * Notify the driver to stop its send queue. Nothing else
+ * should be posted to the queue pair after this has been called.
+ */
+ void (*stop_send_queue)(struct rvt_qp *qp);
+
+ /*
+ * Have the driver drain any in-progress operations.
+ */
+ void (*quiesce_qp)(struct rvt_qp *qp);
+
+ /*
+ * Inform the driver that a qp has gone to the error state.
+ */
+ void (*notify_error_qp)(struct rvt_qp *qp);
+
+ /*
+ * Get an MTU for a qp.
+ */
+ u32 (*mtu_from_qp)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ u32 pmtu);
+ /*
+ * Convert an mtu to a path mtu
+ */
+ int (*mtu_to_path_mtu)(u32 mtu);
+
+ /*
+ * Get the guid of a port in big endian byte order
+ */
+ int (*get_guid_be)(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
+ int guid_index, __be64 *guid);
+
+ /*
+ * Query driver for the state of the port.
+ */
+ int (*query_port_state)(struct rvt_dev_info *rdi, u8 port_num,
+ struct ib_port_attr *props);
+
+ /*
+ * Tell driver to shutdown a port
+ */
+ int (*shut_down_port)(struct rvt_dev_info *rdi, u8 port_num);
+
+ /* Tell driver to send a trap for changed port capabilities */
+ void (*cap_mask_chg)(struct rvt_dev_info *rdi, u8 port_num);
+
+ /*
+ * The following functions can be safely ignored completely. Any use of
+ * these is checked for NULL before blindly calling. Rdmavt should also
+ * be functional if drivers omit these.
+ */
+
+ /* Called to inform the driver that all qps should now be freed. */
+ unsigned (*free_all_qps)(struct rvt_dev_info *rdi);
+
+ /* Driver specific AH validation */
+ int (*check_ah)(struct ib_device *, struct ib_ah_attr *);
+
+ /* Inform the driver a new AH has been created */
+ void (*notify_new_ah)(struct ib_device *, struct ib_ah_attr *,
+ struct rvt_ah *);
+
+ /* Let the driver pick the next queue pair number */
+ int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
+ enum ib_qp_type type, u8 port_num, gfp_t gfp);
+
+ /* Determine if it's safe or allowed to modify the qp */
+ int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+ /* Driver specific QP modification/notification-of */
+ void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+ /* Driver specific work request checking */
+ int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+ /* Notify driver a mad agent has been created */
+ void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx);
+
+ /* Notify driver a mad agent has been removed */
+ void (*notify_free_mad_agent)(struct rvt_dev_info *rdi, int port_idx);
+
+};
+
+struct rvt_dev_info {
+ struct ib_device ibdev; /* Keep this first. Nothing above here */
+
+ /*
+ * Prior to calling for registration the driver will be responsible for
+ * allocating space for this structure.
+ *
+ * The driver will also be responsible for filling in certain members of
+ * dparms.props. The driver needs to fill in dparms exactly as it would
+ * want values reported to a ULP. This will be returned to the caller
+ * in rdmavt's device. The driver should also therefore refrain from
+ * modifying this directly after registration with rdmavt.
+ */
+
+ /* Driver specific properties */
+ struct rvt_driver_params dparms;
+
+ /* post send table */
+ const struct rvt_operation_params *post_parms;
+
+ struct rvt_mregion __rcu *dma_mr;
+ struct rvt_lkey_table lkey_table;
+
+ /* Driver specific helper functions */
+ struct rvt_driver_provided driver_f;
+
+ /* Internal use */
+ int n_pds_allocated;
+ spinlock_t n_pds_lock; /* Protect pd allocated count */
+
+ int n_ahs_allocated;
+ spinlock_t n_ahs_lock; /* Protect ah allocated count */
+
+ u32 n_srqs_allocated;
+ spinlock_t n_srqs_lock; /* Protect srqs allocated count */
+
+ int flags;
+ struct rvt_ibport **ports;
+
+ /* QP */
+ struct rvt_qp_ibdev *qp_dev;
+ u32 n_qps_allocated; /* number of QPs allocated for device */
+ u32 n_rc_qps; /* number of RC QPs allocated for device */
+ u32 busy_jiffies; /* timeout scaling based on RC QP count */
+ spinlock_t n_qps_lock; /* protect qps, rc qps and busy jiffy counts */
+
+ /* memory maps */
+ struct list_head pending_mmaps;
+ spinlock_t mmap_offset_lock; /* protect mmap_offset */
+ u32 mmap_offset;
+ spinlock_t pending_lock; /* protect pending mmap list */
+
+ /* CQ */
+ struct kthread_worker *worker; /* per device cq worker */
+ u32 n_cqs_allocated; /* number of CQs allocated for device */
+ spinlock_t n_cqs_lock; /* protect count of in use cqs */
+
+ /* Multicast */
+ u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+ spinlock_t n_mcast_grps_lock;
+
+};
+
+static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct rvt_pd, ibpd);
+}
+
+static inline struct rvt_ah *ibah_to_rvtah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct rvt_ah, ibah);
+}
+
+static inline struct rvt_dev_info *ib_to_rvt(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct rvt_dev_info, ibdev);
+}
+
+static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct rvt_srq, ibsrq);
+}
+
+static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct rvt_qp, ibqp);
+}
+
+static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
+{
+ /*
+ * All ports have the same number of pkeys.
+ */
+ return rdi->dparms.npkeys;
+}
+
+/*
+ * Return the max atomic suitable for determining
+ * the size of the ack ring buffer in a QP.
+ */
+static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
+{
+ return rdi->dparms.max_rdma_atomic + 1;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ */
+static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi,
+ int port_index,
+ unsigned index)
+{
+ if (index >= rvt_get_npkeys(rdi))
+ return 0;
+ else
+ return rdi->ports[port_index]->pkey_table[index];
+}
+
+/**
+ * rvt_lookup_qpn - return the QP with the given QPN
+ * @ibp: the ibport
+ * @qpn: the QP number to look up
+ *
+ * The caller must hold the rcu_read_lock(), and keep the lock until
+ * the returned qp is no longer in use.
+ */
+/* TODO: Remove this and put in rdmavt/qp.h when no longer needed by drivers */
+static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
+ struct rvt_ibport *rvp,
+ u32 qpn) __must_hold(RCU)
+{
+ struct rvt_qp *qp = NULL;
+
+ if (unlikely(qpn <= 1)) {
+ qp = rcu_dereference(rvp->qp[qpn]);
+ } else {
+ u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits);
+
+ for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp;
+ qp = rcu_dereference(qp->next))
+ if (qp->ibqp.qp_num == qpn)
+ break;
+ }
+ return qp;
+}
+
+struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
+void rvt_dealloc_device(struct rvt_dev_info *rdi);
+int rvt_register_device(struct rvt_dev_info *rvd);
+void rvt_unregister_device(struct rvt_dev_info *rvd);
+int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port,
+ int port_index, u16 *pkey_table);
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+ int access);
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey);
+int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
+ u32 len, u64 vaddr, u32 rkey, int acc);
+int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
+ struct rvt_sge *isge, struct ib_sge *sge, int acc);
+struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid);
+
+#endif /* DEF_RDMA_VT_H */
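
rdma_vt.h defines the contract between a low-level driver and rdmavt: the driver allocates an rvt_dev_info, fills in dparms and the driver_f callbacks it supports, attaches its ports together with their pkey tables, and then registers. A hedged sketch of that flow; MY_NPORTS, my_get_card_name() and the error handling are hypothetical, and only a small subset of the required callbacks is shown.

#include <rdma/rdma_vt.h>

#define MY_NPORTS 1	/* hypothetical single-port device */

static const char *my_get_card_name(struct rvt_dev_info *rdi)
{
	return "mydev0";
}

/* Sketch only: minimal registration sequence implied by the structures
 * above. */
static int my_register(struct rvt_ibport *port, u16 *pkey_table)
{
	struct rvt_dev_info *rdi;

	rdi = rvt_alloc_device(sizeof(*rdi), MY_NPORTS);
	if (!rdi)
		return -ENOMEM;

	rdi->dparms.nports = MY_NPORTS;
	rdi->dparms.npkeys = RVT_MAX_PKEY_VALUES;
	rdi->driver_f.get_card_name = my_get_card_name;
	/* ...the remaining required driver_f callbacks go here... */

	if (rvt_init_port(rdi, port, 0, pkey_table)) {
		rvt_dealloc_device(rdi);
		return -EINVAL;
	}
	return rvt_register_device(rdi);
}
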
diff --git a/sys/ofed/include/rdma/rdmavt_cq.h b/sys/ofed/include/rdma/rdmavt_cq.h
new file mode 100644
index 0000000000000..51fd00b243d01
--- /dev/null
+++ b/sys/ofed/include/rdma/rdmavt_cq.h
@@ -0,0 +1,99 @@
+#ifndef DEF_RDMAVT_INCCQ_H
+#define DEF_RDMAVT_INCCQ_H
+
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/kthread.h>
+#include <rdma/ib_user_verbs.h>
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct rvt_cq_wc {
+ u32 head; /* index of next entry to fill */
+ u32 tail; /* index of next ib_poll_cq() entry */
+ union {
+ /* these are actually size ibcq.cqe + 1 */
+ struct ib_uverbs_wc uqueue[0];
+ struct ib_wc kqueue[0];
+ };
+};
+
+/*
+ * The completion queue structure.
+ */
+struct rvt_cq {
+ struct ib_cq ibcq;
+ struct kthread_work comptask;
+ spinlock_t lock; /* protect changes in this struct */
+ u8 notify;
+ u8 triggered;
+ struct rvt_dev_info *rdi;
+ struct rvt_cq_wc *queue;
+ struct rvt_mmap_info *ip;
+};
+
+static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct rvt_cq, ibcq);
+}
+
+void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited);
+
+#endif /* DEF_RDMAVT_INCCQH */
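
rvt_cq_wc lays out the head index, tail index and the completion entries in one allocation so the whole ring can be mmap'ed to user space; the ring holds ibcq.cqe + 1 slots. A hedged sketch of the kernel-side consumer arithmetic (locking and the user-mapped uqueue case are omitted, and my_poll_one() is hypothetical).

#include <rdma/rdmavt_cq.h>

/* Sketch only: pop one completion from the kernel-side ring. */
static int my_poll_one(struct rvt_cq *cq, struct ib_wc *wc)
{
	struct rvt_cq_wc *wcq = cq->queue;
	u32 tail = wcq->tail;

	if (tail == wcq->head)
		return 0;			/* ring is empty */

	*wc = wcq->kqueue[tail];
	if (++tail >= (u32)cq->ibcq.cqe + 1)
		tail = 0;			/* wrap around */
	wcq->tail = tail;
	return 1;
}
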
diff --git a/sys/ofed/include/rdma/rdmavt_mr.h b/sys/ofed/include/rdma/rdmavt_mr.h
new file mode 100644
index 0000000000000..6b3c6c8b6b772
--- /dev/null
+++ b/sys/ofed/include/rdma/rdmavt_mr.h
@@ -0,0 +1,140 @@
+#ifndef DEF_RDMAVT_INCMR_H
+#define DEF_RDMAVT_INCMR_H
+
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * For Memory Regions. This stuff should probably be moved into rdmavt/mr.h once
+ * drivers no longer need access to the MR directly.
+ */
+
+/*
+ * A segment is a linear region of low physical memory.
+ * Used by the verbs layer.
+ */
+struct rvt_seg {
+ void *vaddr;
+ size_t length;
+};
+
+/* The number of rvt_segs that fit in a page. */
+#define RVT_SEGSZ (PAGE_SIZE / sizeof(struct rvt_seg))
+
+struct rvt_segarray {
+ struct rvt_seg segs[RVT_SEGSZ];
+};
+
+struct rvt_mregion {
+ struct ib_pd *pd; /* shares refcnt of ibmr.pd */
+ u64 user_base; /* User's address for this region */
+ u64 iova; /* IB start address of this region */
+ size_t length;
+ u32 lkey;
+ u32 offset; /* offset (bytes) to start of region */
+ int access_flags;
+ u32 max_segs; /* number of rvt_segs in all the arrays */
+ u32 mapsz; /* size of the map array */
+ u8 page_shift; /* 0 - non-uniform/non-power-of-2 sizes */
+ u8 lkey_published; /* in global table */
+ atomic_t lkey_invalid; /* true if current lkey is invalid */
+ struct completion comp; /* complete when refcount goes to zero */
+ atomic_t refcount;
+ struct rvt_segarray *map[0]; /* the segments */
+};
+
+#define RVT_MAX_LKEY_TABLE_BITS 23
+
+struct rvt_lkey_table {
+ spinlock_t lock; /* protect changes in this struct */
+ u32 next; /* next unused index (speeds search) */
+ u32 gen; /* generation count */
+ u32 max; /* size of the table */
+ struct rvt_mregion __rcu **table;
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct rvt_sge {
+ struct rvt_mregion *mr;
+ void *vaddr; /* kernel virtual address of segment */
+ u32 sge_length; /* length of the SGE */
+ u32 length; /* remaining length of the segment */
+ u16 m; /* current index: mr->map[m] */
+ u16 n; /* current index: mr->map[m]->segs[n] */
+};
+
+struct rvt_sge_state {
+ struct rvt_sge *sg_list; /* next SGE to be used if any */
+ struct rvt_sge sge; /* progress state for the current SGE */
+ u32 total_len;
+ u8 num_sge;
+};
+
+static inline void rvt_put_mr(struct rvt_mregion *mr)
+{
+ if (unlikely(atomic_dec_and_test(&mr->refcount)))
+ complete(&mr->comp);
+}
+
+static inline void rvt_get_mr(struct rvt_mregion *mr)
+{
+ atomic_inc(&mr->refcount);
+}
+
+static inline void rvt_put_ss(struct rvt_sge_state *ss)
+{
+ while (ss->num_sge) {
+ rvt_put_mr(ss->sge.mr);
+ if (--ss->num_sge)
+ ss->sge = *ss->sg_list++;
+ }
+}
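+
+/*
+ * Illustrative sketch: the reference pairing behind the helpers
+ * above.  Each rvt_sge that points into an MR holds one reference,
+ * taken when the SGE is set up and dropped later via rvt_put_mr() or
+ * rvt_put_ss().  sge_set_mr_example() is a hypothetical helper.
+ */
+static inline void sge_set_mr_example(struct rvt_sge *sge,
+ struct rvt_mregion *mr)
+{
+ sge->mr = mr;
+ rvt_get_mr(mr); /* paired with rvt_put_mr()/rvt_put_ss() */
+}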
+
+#endif /* DEF_RDMAVT_INCMRH */
diff --git a/sys/ofed/include/rdma/rdmavt_qp.h b/sys/ofed/include/rdma/rdmavt_qp.h
new file mode 100644
index 0000000000000..2c5183ef0243e
--- /dev/null
+++ b/sys/ofed/include/rdma/rdmavt_qp.h
@@ -0,0 +1,535 @@
+#ifndef DEF_RDMAVT_INCQP_H
+#define DEF_RDMAVT_INCQP_H
+
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/rdma_vt.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_verbs.h>
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define RVT_R_WRID_VALID 0
+#define RVT_R_REWIND_SGE 1
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define RVT_R_REUSE_SGE 0x01
+#define RVT_R_RDMAR_SEQ 0x02
+#define RVT_R_RSP_NAK 0x04
+#define RVT_R_RSP_SEND 0x08
+#define RVT_R_COMM_EST 0x10
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * RVT_S_SIGNAL_REQ_WR - set if QP send WRs individually request completion signaling
+ * RVT_S_BUSY - send tasklet is processing the QP
+ * RVT_S_TIMER - the RC retry timer is active
+ * RVT_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
+ * RVT_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * RVT_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete
+ * before processing the next SWQE
+ * RVT_S_WAIT_RNR - waiting for RNR timeout
+ * RVT_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ * next send completion entry not via send DMA
+ * RVT_S_WAIT_PIO - waiting for a send buffer to be available
+ * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets
+ * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available
+ * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
+ * RVT_S_WAIT_KMEM - waiting for kernel memory to be available
+ * RVT_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
+ * RVT_S_WAIT_ACK - waiting for an ACK packet before sending more requests
+ * RVT_S_SEND_ONE - send one packet, request ACK, then wait for ACK
+ * RVT_S_ECN - a BECN was queued to the send engine
+ */
+#define RVT_S_SIGNAL_REQ_WR 0x0001
+#define RVT_S_BUSY 0x0002
+#define RVT_S_TIMER 0x0004
+#define RVT_S_RESP_PENDING 0x0008
+#define RVT_S_ACK_PENDING 0x0010
+#define RVT_S_WAIT_FENCE 0x0020
+#define RVT_S_WAIT_RDMAR 0x0040
+#define RVT_S_WAIT_RNR 0x0080
+#define RVT_S_WAIT_SSN_CREDIT 0x0100
+#define RVT_S_WAIT_DMA 0x0200
+#define RVT_S_WAIT_PIO 0x0400
+#define RVT_S_WAIT_PIO_DRAIN 0x0800
+#define RVT_S_WAIT_TX 0x1000
+#define RVT_S_WAIT_DMA_DESC 0x2000
+#define RVT_S_WAIT_KMEM 0x4000
+#define RVT_S_WAIT_PSN 0x8000
+#define RVT_S_WAIT_ACK 0x10000
+#define RVT_S_SEND_ONE 0x20000
+#define RVT_S_UNLIMITED_CREDIT 0x40000
+#define RVT_S_AHG_VALID 0x80000
+#define RVT_S_AHG_CLEAR 0x100000
+#define RVT_S_ECN 0x200000
+
+/*
+ * Wait flags that would prevent any packet type from being sent.
+ */
+#define RVT_S_ANY_WAIT_IO \
+ (RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN | RVT_S_WAIT_TX | \
+ RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM)
+
+/*
+ * Wait flags that would prevent send work requests from making progress.
+ */
+#define RVT_S_ANY_WAIT_SEND (RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | \
+ RVT_S_WAIT_RNR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA | \
+ RVT_S_WAIT_PSN | RVT_S_WAIT_ACK)
+
+#define RVT_S_ANY_WAIT (RVT_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
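+
+/*
+ * Illustrative sketch: how a send engine might combine these masks.
+ * qp_send_ok_example() is a hypothetical helper; it only shows that
+ * new send work can make progress when neither the busy bit nor any
+ * wait bit is set in s_flags.
+ */
+static inline bool qp_send_ok_example(u32 s_flags)
+{
+ return !(s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT));
+}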
+
+/* Number of bits to pay attention to in the opcode for checking qp type */
+#define RVT_OPCODE_QP_MASK 0xE0
+
+/* Flags for checking QP state (see ib_rvt_state_ops[]) */
+#define RVT_POST_SEND_OK 0x01
+#define RVT_POST_RECV_OK 0x02
+#define RVT_PROCESS_RECV_OK 0x04
+#define RVT_PROCESS_SEND_OK 0x08
+#define RVT_PROCESS_NEXT_SEND_OK 0x10
+#define RVT_FLUSH_SEND 0x20
+#define RVT_FLUSH_RECV 0x40
+#define RVT_PROCESS_OR_FLUSH_SEND \
+ (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND)
+
+/*
+ * Internal send flags
+ */
+#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START
+#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1)
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct rvt_swqe {
+ union {
+ struct ib_send_wr wr; /* don't use wr.sg_list */
+ struct ib_ud_wr ud_wr;
+ struct ib_reg_wr reg_wr;
+ struct ib_rdma_wr rdma_wr;
+ struct ib_atomic_wr atomic_wr;
+ };
+ u32 psn; /* first packet sequence number */
+ u32 lpsn; /* last packet sequence number */
+ u32 ssn; /* send sequence number */
+ u32 length; /* total length of data in sg_list */
+ struct rvt_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct rvt_rwqe {
+ u64 wr_id;
+ u8 num_sge;
+ struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use rvt_get_rwqe_ptr() instead.
+ */
+struct rvt_rwq {
+ u32 head; /* new work requests posted to the head */
+ u32 tail; /* receives pull requests from here. */
+ struct rvt_rwqe wq[0];
+};
+
+struct rvt_rq {
+ struct rvt_rwq *wq;
+ u32 size; /* size of RWQE array */
+ u8 max_sge;
+ /* protect changes in this struct */
+ spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+/*
+ * This structure is used by rvt_mmap() to validate an offset
+ * when an mmap() request is made. The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct rvt_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ void *obj;
+ __u64 offset;
+ struct kref ref;
+ unsigned size;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct rvt_ack_entry {
+ struct rvt_sge rdma_sge;
+ u64 atomic_data;
+ u32 psn;
+ u32 lpsn;
+ u8 opcode;
+ u8 sent;
+};
+
+#define RC_QP_SCALING_INTERVAL 5
+
+#define RVT_OPERATION_PRIV 0x00000001
+#define RVT_OPERATION_ATOMIC 0x00000002
+#define RVT_OPERATION_ATOMIC_SGE 0x00000004
+#define RVT_OPERATION_LOCAL 0x00000008
+#define RVT_OPERATION_USE_RESERVE 0x00000010
+
+#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
+
+/**
+ * rvt_operation_params - op table entry
+ * @length - the length to copy into the swqe entry
+ * @qpt_support - a bit mask indicating QP type support
+ * @flags - RVT_OPERATION flags (see above)
+ *
+ * This supports table-driven post send so that
+ * different drivers can support potentially
+ * different sets of operations.
+ */
+
+struct rvt_operation_params {
+ size_t length;
+ u32 qpt_support;
+ u32 flags;
+};
+
+/*
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct rvt_qp {
+ struct ib_qp ibqp;
+ void *priv; /* Driver private data */
+ /* read mostly fields above and below */
+ struct ib_ah_attr remote_ah_attr;
+ struct ib_ah_attr alt_ah_attr;
+ struct rvt_qp __rcu *next; /* link list for QPN hash table */
+ struct rvt_swqe *s_wq; /* send work queue */
+ struct rvt_mmap_info *ip;
+
+ unsigned long timeout_jiffies; /* computed from timeout */
+
+ enum ib_mtu path_mtu;
+ int srate_mbps; /* s_srate (below) converted to Mbit/s */
+ pid_t pid; /* pid for user mode QPs */
+ u32 remote_qpn;
+ u32 qkey; /* QKEY for this QP (for UD or RD) */
+ u32 s_size; /* send work queue size */
+ u32 s_ahgpsn; /* set to the psn in the copy of the header */
+
+ u16 pmtu; /* decoded from path_mtu */
+ u8 log_pmtu; /* shift for pmtu */
+ u8 state; /* QP state */
+ u8 allowed_ops; /* high order bits of allowed opcodes */
+ u8 qp_access_flags;
+ u8 alt_timeout; /* Alternate path timeout for this QP */
+ u8 timeout; /* Timeout for this QP */
+ u8 s_srate;
+ u8 s_mig_state;
+ u8 port_num;
+ u8 s_pkey_index; /* PKEY index to use */
+ u8 s_alt_pkey_index; /* Alternate path PKEY index to use */
+ u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */
+ u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */
+ u8 s_retry_cnt; /* number of times to retry */
+ u8 s_rnr_retry_cnt;
+ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
+ u8 s_max_sge; /* size of s_wq->sg_list */
+ u8 s_draining;
+
+ /* start of read/write fields */
+ atomic_t refcount ____cacheline_aligned_in_smp;
+ wait_queue_head_t wait;
+
+ struct rvt_ack_entry *s_ack_queue;
+ struct rvt_sge_state s_rdma_read_sge;
+
+ spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */
+ u32 r_psn; /* expected rcv packet sequence number */
+ unsigned long r_aflags;
+ u64 r_wr_id; /* ID for current receive WQE */
+ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
+ u32 r_len; /* total length of r_sge */
+ u32 r_rcv_len; /* receive data len processed */
+ u32 r_msn; /* message sequence number */
+
+ u8 r_state; /* opcode of last packet received */
+ u8 r_flags;
+ u8 r_head_ack_queue; /* index into s_ack_queue[] */
+
+ struct list_head rspwait; /* link for waiting to respond */
+
+ struct rvt_sge_state r_sge; /* current receive data */
+ struct rvt_rq r_rq; /* receive work queue */
+
+ /* post send line */
+ spinlock_t s_hlock ____cacheline_aligned_in_smp;
+ u32 s_head; /* new entries added here */
+ u32 s_next_psn; /* PSN for next request */
+ u32 s_avail; /* number of entries avail */
+ u32 s_ssn; /* SSN of tail entry */
+ atomic_t s_reserved_used; /* reserved entries in use */
+
+ spinlock_t s_lock ____cacheline_aligned_in_smp;
+ u32 s_flags;
+ struct rvt_sge_state *s_cur_sge;
+ struct rvt_swqe *s_wqe;
+ struct rvt_sge_state s_sge; /* current send request data */
+ struct rvt_mregion *s_rdma_mr;
+ u32 s_cur_size; /* size of send packet in bytes */
+ u32 s_len; /* total length of s_sge */
+ u32 s_rdma_read_len; /* total length of s_rdma_read_sge */
+ u32 s_last_psn; /* last response PSN processed */
+ u32 s_sending_psn; /* lowest PSN that is being sent */
+ u32 s_sending_hpsn; /* highest PSN that is being sent */
+ u32 s_psn; /* current packet sequence number */
+ u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */
+ u32 s_ack_psn; /* PSN for acking sends and RDMA writes */
+ u32 s_tail; /* next entry to process */
+ u32 s_cur; /* current work queue entry */
+ u32 s_acked; /* last un-ACK'ed entry */
+ u32 s_last; /* last completed entry */
+ u32 s_lsn; /* limit sequence number (credit) */
+ u16 s_hdrwords; /* size of s_hdr in 32 bit words */
+ u16 s_rdma_ack_cnt;
+ s8 s_ahgidx;
+ u8 s_state; /* opcode of last packet sent */
+ u8 s_ack_state; /* opcode of packet to ACK */
+ u8 s_nak_state; /* non-zero if NAK is pending */
+ u8 r_nak_state; /* non-zero if NAK is pending */
+ u8 s_retry; /* requester retry counter */
+ u8 s_rnr_retry; /* requester RNR retry counter */
+ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
+ u8 s_tail_ack_queue; /* index into s_ack_queue[] */
+
+ struct rvt_sge_state s_ack_rdma_sge;
+ struct timer_list s_timer;
+
+ atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */
+
+ /*
+ * This sge list MUST be last. Do not add anything below here.
+ */
+ struct rvt_sge r_sg_list[0] /* verified SGEs */
+ ____cacheline_aligned_in_smp;
+};
+
+struct rvt_srq {
+ struct ib_srq ibsrq;
+ struct rvt_rq rq;
+ struct rvt_mmap_info *ip;
+ /* send signal when number of RWQEs < limit */
+ u32 limit;
+};
+
+#define RVT_QPN_MAX BIT(24)
+#define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+#define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
+#define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1)
+#define RVT_QPN_MASK 0xFFFFFF
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct rvt_qpn_map {
+ void *page;
+};
+
+struct rvt_qpn_table {
+ spinlock_t lock; /* protect changes to the qp table */
+ unsigned flags; /* flags for QP0/1 allocated for each port */
+ u32 last; /* last QP number allocated */
+ u32 nmaps; /* size of the map table */
+ u16 limit;
+ u8 incr;
+ /* bit map of free QP numbers other than 0/1 */
+ struct rvt_qpn_map map[RVT_QPNMAP_ENTRIES];
+};
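+
+/*
+ * Illustrative sketch: locating the map page and bit that track a
+ * given QPN.  qpn_to_map_bit_example() is a hypothetical helper; a
+ * QPN allocator would use arithmetic of this form together with the
+ * lazily allocated pages described above.
+ */
+static inline void qpn_to_map_bit_example(u32 qpn, u32 *map_nr, u32 *bit)
+{
+ *map_nr = qpn / RVT_BITS_PER_PAGE;
+ *bit = qpn & RVT_BITS_PER_PAGE_MASK;
+}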
+
+struct rvt_qp_ibdev {
+ u32 qp_table_size;
+ u32 qp_table_bits;
+ struct rvt_qp __rcu **qp_table;
+ spinlock_t qpt_lock; /* qptable lock */
+ struct rvt_qpn_table qpn_table;
+};
+
+/*
+ * There is one struct rvt_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct rvt_mcast_qp.
+ */
+struct rvt_mcast_qp {
+ struct list_head list;
+ struct rvt_qp *qp;
+};
+
+struct rvt_mcast {
+ struct rb_node rb_node;
+ union ib_gid mgid;
+ struct list_head qp_list;
+ wait_queue_head_t wait;
+ atomic_t refcount;
+ int n_attached;
+};
+
+/*
+ * Since struct rvt_swqe is not a fixed size, we can't simply index into
+ * struct rvt_qp.s_wq. This function does the array index computation.
+ */
+static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp,
+ unsigned n)
+{
+ return (struct rvt_swqe *)((char *)qp->s_wq +
+ (sizeof(struct rvt_swqe) +
+ qp->s_max_sge *
+ sizeof(struct rvt_sge)) * n);
+}
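+
+/*
+ * Illustrative sketch: walking the posted send work requests from
+ * s_acked up to s_head with rvt_get_swqe_ptr().  Real code would
+ * hold the appropriate QP lock; pending_send_bytes_example() is a
+ * hypothetical helper.
+ */
+static inline u32 pending_send_bytes_example(struct rvt_qp *qp)
+{
+ u32 i = qp->s_acked;
+ u32 bytes = 0;
+
+ while (i != qp->s_head) {
+ bytes += rvt_get_swqe_ptr(qp, i)->length;
+ if (++i == qp->s_size)
+ i = 0;
+ }
+ return bytes;
+}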
+
+/*
+ * Since struct rvt_rwqe is not a fixed size, we can't simply index into
+ * struct rvt_rwq.wq. This function does the array index computation.
+ */
+static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
+{
+ return (struct rvt_rwqe *)
+ ((char *)rq->wq->wq +
+ (sizeof(struct rvt_rwqe) +
+ rq->max_sge * sizeof(struct ib_sge)) * n);
+}
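+
+/*
+ * Illustrative sketch: peeking at the next receive WQE.  Real code
+ * must hold r_rq.lock and validate head/tail, since the rvt_rwq can
+ * be mapped into user space.  peek_rwqe_example() is hypothetical.
+ */
+static inline struct rvt_rwqe *peek_rwqe_example(struct rvt_rq *rq)
+{
+ struct rvt_rwq *wq = rq->wq;
+
+ if (wq->tail == wq->head)
+ return NULL; /* no receive work requests posted */
+ return rvt_get_rwqe_ptr(rq, wq->tail);
+}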
+
+/**
+ * rvt_get_qp - get a QP reference
+ * @qp - the QP to hold
+ */
+static inline void rvt_get_qp(struct rvt_qp *qp)
+{
+ atomic_inc(&qp->refcount);
+}
+
+/**
+ * rvt_put_qp - release a QP reference
+ * @qp - the QP to release
+ */
+static inline void rvt_put_qp(struct rvt_qp *qp)
+{
+ if (qp && atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
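+
+/*
+ * Illustrative sketch: keeping a QP alive while it waits on a list
+ * for deferred processing.  queue_qp_rsp_example() is a hypothetical
+ * helper; the worker that removes the QP from the list drops the
+ * reference with rvt_put_qp().
+ */
+static inline void queue_qp_rsp_example(struct rvt_qp *qp,
+ struct list_head *work_list)
+{
+ rvt_get_qp(qp); /* released by the worker via rvt_put_qp() */
+ list_add_tail(&qp->rspwait, work_list);
+}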
+
+/**
+ * rvt_qp_wqe_reserve - reserve operation
+ * @qp - the rvt qp
+ * @wqe - the send wqe
+ *
+ * This routine is used in post send to record
+ * that a WQE uses a reserved operation.
+ */
+static inline void rvt_qp_wqe_reserve(
+ struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
+ atomic_inc(&qp->s_reserved_used);
+}
+
+/**
+ * rvt_qp_wqe_unreserve - clean reserved operation
+ * @qp - the rvt qp
+ * @wqe - the send wqe
+ *
+ * This decrements the reserve use count.
+ *
+ * This call MUST precede the change to
+ * s_last to ensure that post send sees a stable
+ * s_avail.
+ *
+ * An smp_mb__after_atomic() is used to ensure
+ * the compiler does not reorder the s_last
+ * ring index update and the decrement of s_reserved_used.
+ */
+static inline void rvt_qp_wqe_unreserve(
+ struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) {
+ wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
+ atomic_dec(&qp->s_reserved_used);
+ /* ensure no compiler re-ordering up to the s_last change */
+ smp_mb__after_atomic();
+ }
+}
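+
+/*
+ * Illustrative sketch: where the unreserve fits in a completion path.
+ * complete_swqe_example() is a hypothetical helper; the point is only
+ * that rvt_qp_wqe_unreserve() runs before s_last is advanced.
+ */
+static inline void complete_swqe_example(struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ rvt_qp_wqe_unreserve(qp, wqe);
+ /* ... advance qp->s_last and generate the completion here ... */
+}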
+
+extern const int ib_rvt_state_ops[];
+
+struct rvt_dev_info;
+int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err);
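+
+/*
+ * Illustrative sketch: gating a post send on the QP state using
+ * ib_rvt_state_ops[] and the RVT_POST_SEND_OK flag defined earlier.
+ * qp_post_send_ok_example() is a hypothetical helper.
+ */
+static inline bool qp_post_send_ok_example(const struct rvt_qp *qp)
+{
+ return !!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK);
+}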
+
+#endif /* DEF_RDMAVT_INCQP_H */
diff --git a/sys/ofed/include/rdma/sdp_socket.h b/sys/ofed/include/rdma/sdp_socket.h
deleted file mode 100644
index 6b075892968e3..0000000000000
--- a/sys/ofed/include/rdma/sdp_socket.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Stuff that should go into include/linux/socket.h */
-
-#ifndef SDP_SOCKET_H
-#define SDP_SOCKET_H
-
-#ifndef __FreeBSD__
-#ifndef AF_INET_SDP
-#define AF_INET_SDP 27
-#define PF_INET_SDP AF_INET_SDP
-#endif
-#endif
-
-#ifndef SDP_ZCOPY_THRESH
-#define SDP_ZCOPY_THRESH 80
-#endif
-
-#ifndef SDP_LAST_BIND_ERR
-#define SDP_LAST_BIND_ERR 81
-#endif
-
-/* TODO: AF_INET6_SDP ? */
-
-#endif
diff --git a/sys/ofed/include/uapi/rdma/mlx4-abi.h b/sys/ofed/include/uapi/rdma/mlx4-abi.h
new file mode 100644
index 0000000000000..af431752655c5
--- /dev/null
+++ b/sys/ofed/include/uapi/rdma/mlx4-abi.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ABI_USER_H
+#define MLX4_ABI_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+
+#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3
+#define MLX4_IB_UVERBS_ABI_VERSION 4
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx4_ib_alloc_ucontext_resp_v3 {
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+};
+
+struct mlx4_ib_alloc_ucontext_resp {
+ __u32 dev_caps;
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+ __u32 cqe_size;
+};
+
+struct mlx4_ib_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_cq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
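+
+/*
+ * Illustrative sketch (userspace side): buffer pointers are passed in
+ * __u64 fields so the layout is identical for 32-bit and 64-bit
+ * processes, as noted above.  mlx4_fill_create_cq_example() is a
+ * hypothetical helper a provider library might use.
+ */
+static inline void mlx4_fill_create_cq_example(struct mlx4_ib_create_cq *cmd,
+ void *buf, void *db)
+{
+ cmd->buf_addr = (__u64)(uintptr_t)buf;
+ cmd->db_addr = (__u64)(uintptr_t)db;
+}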
+
+struct mlx4_ib_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_resize_cq {
+ __u64 buf_addr;
+};
+
+struct mlx4_ib_create_srq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_ib_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_qp {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch;
+ __u8 reserved[5];
+};
+
+#endif /* MLX4_ABI_USER_H */
diff --git a/sys/ofed/include/uapi/rdma/mlx5-abi.h b/sys/ofed/include/uapi/rdma/mlx5-abi.h
new file mode 100644
index 0000000000000..f5d0f4e83b59f
--- /dev/null
+++ b/sys/ofed/include/uapi/rdma/mlx5-abi.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_ABI_USER_H
+#define MLX5_ABI_USER_H
+
+#include <linux/types.h>
+
+enum {
+ MLX5_QP_FLAG_SIGNATURE = 1 << 0,
+ MLX5_QP_FLAG_SCATTER_CQE = 1 << 1,
+};
+
+enum {
+ MLX5_SRQ_FLAG_SIGNATURE = 1 << 0,
+};
+
+enum {
+ MLX5_WQ_FLAG_SIGNATURE = 1 << 0,
+};
+
+/* Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX5_IB_UVERBS_ABI_VERSION 1
+
+/* Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx5_ib_alloc_ucontext_req {
+ __u32 total_num_uuars;
+ __u32 num_low_latency_uuars;
+};
+
+struct mlx5_ib_alloc_ucontext_req_v2 {
+ __u32 total_num_uuars;
+ __u32 num_low_latency_uuars;
+ __u32 flags;
+ __u32 comp_mask;
+ __u8 max_cqe_version;
+ __u8 reserved0;
+ __u16 reserved1;
+ __u32 reserved2;
+};
+
+enum mlx5_ib_alloc_ucontext_resp_mask {
+ MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
+};
+
+enum mlx5_user_cmds_supp_uhw {
+ MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0,
+};
+
+struct mlx5_ib_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u32 bf_reg_size;
+ __u32 tot_uuars;
+ __u32 cache_line_size;
+ __u16 max_sq_desc_sz;
+ __u16 max_rq_desc_sz;
+ __u32 max_send_wqebb;
+ __u32 max_recv_wr;
+ __u32 max_srq_recv_wr;
+ __u16 num_ports;
+ __u16 reserved1;
+ __u32 comp_mask;
+ __u32 response_length;
+ __u8 cqe_version;
+ __u8 cmds_supp_uhw;
+ __u16 reserved2;
+ __u64 hca_core_clock_offset;
+};
+
+struct mlx5_ib_alloc_pd_resp {
+ __u32 pdn;
+};
+
+struct mlx5_ib_tso_caps {
+ __u32 max_tso; /* Maximum tso payload size in bytes */
+
+ /* Corresponding bit will be set if qp type from
+ * 'enum ib_qp_type' is supported, e.g.
+ * supported_qpts |= 1 << IB_QPT_UD
+ */
+ __u32 supported_qpts;
+};
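+
+/*
+ * Illustrative sketch: testing the supported_qpts convention noted
+ * above.  mlx5_tso_supported_example() is a hypothetical helper;
+ * qp_type is a value from enum ib_qp_type, e.g. IB_QPT_UD.
+ */
+static inline int mlx5_tso_supported_example(const struct mlx5_ib_tso_caps *caps,
+ int qp_type)
+{
+ return caps->max_tso != 0 && (caps->supported_qpts & (1u << qp_type));
+}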
+
+struct mlx5_ib_rss_caps {
+ __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */
+ __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */
+ __u8 reserved[7];
+};
+
+struct mlx5_ib_query_device_resp {
+ __u32 comp_mask;
+ __u32 response_length;
+ struct mlx5_ib_tso_caps tso_caps;
+ struct mlx5_ib_rss_caps rss_caps;
+};
+
+struct mlx5_ib_create_cq {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 cqe_size;
+ __u32 reserved; /* explicit padding (optional on i386) */
+};
+
+struct mlx5_ib_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx5_ib_resize_cq {
+ __u64 buf_addr;
+ __u16 cqe_size;
+ __u16 reserved0;
+ __u32 reserved1;
+};
+
+struct mlx5_ib_create_srq {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 flags;
+ __u32 reserved0; /* explicit padding (optional on i386) */
+ __u32 uidx;
+ __u32 reserved1;
+};
+
+struct mlx5_ib_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx5_ib_create_qp {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 sq_wqe_count;
+ __u32 rq_wqe_count;
+ __u32 rq_wqe_shift;
+ __u32 flags;
+ __u32 uidx;
+ __u32 reserved0;
+ __u64 sq_buf_addr;
+};
+
+/* RX Hash function flags */
+enum mlx5_rx_hash_function_flags {
+ MLX5_RX_HASH_FUNC_TOEPLITZ = 1 << 0,
+};
+
+/*
+ * RX Hash flags.  These flags select which fields of an incoming
+ * packet participate in the RX Hash.  Each flag represents one
+ * packet field; when the flag is set, that field is included in the
+ * RX Hash calculation.
+ * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP
+ * and *TCP and *UDP flags can't be enabled together on the same QP.
+ */
+enum mlx5_rx_hash_fields {
+ MLX5_RX_HASH_SRC_IPV4 = 1 << 0,
+ MLX5_RX_HASH_DST_IPV4 = 1 << 1,
+ MLX5_RX_HASH_SRC_IPV6 = 1 << 2,
+ MLX5_RX_HASH_DST_IPV6 = 1 << 3,
+ MLX5_RX_HASH_SRC_PORT_TCP = 1 << 4,
+ MLX5_RX_HASH_DST_PORT_TCP = 1 << 5,
+ MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6,
+ MLX5_RX_HASH_DST_PORT_UDP = 1 << 7
+};
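+
+/*
+ * Illustrative sketch: a rx_hash_fields_mask value that hashes on the
+ * IPv4/TCP 4-tuple.  The macro name is an example only; note that it
+ * respects the IPV4-vs-IPV6 and TCP-vs-UDP restrictions above.
+ */
+#define MLX5_RX_HASH_FIELDS_EXAMPLE_IPV4_TCP \
+ (MLX5_RX_HASH_SRC_IPV4 | MLX5_RX_HASH_DST_IPV4 | \
+ MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP)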
+
+struct mlx5_ib_create_qp_rss {
+ __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */
+ __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */
+ __u8 rx_key_len; /* valid only for Toeplitz */
+ __u8 reserved[6];
+ __u8 rx_hash_key[128]; /* valid only for Toeplitz */
+ __u32 comp_mask;
+ __u32 reserved1;
+};
+
+struct mlx5_ib_create_qp_resp {
+ __u32 uuar_index;
+};
+
+struct mlx5_ib_alloc_mw {
+ __u32 comp_mask;
+ __u8 num_klms;
+ __u8 reserved1;
+ __u16 reserved2;
+};
+
+struct mlx5_ib_create_wq {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 rq_wqe_count;
+ __u32 rq_wqe_shift;
+ __u32 user_index;
+ __u32 flags;
+ __u32 comp_mask;
+ __u32 reserved;
+};
+
+struct mlx5_ib_create_wq_resp {
+ __u32 response_length;
+ __u32 reserved;
+};
+
+struct mlx5_ib_create_rwq_ind_tbl_resp {
+ __u32 response_length;
+ __u32 reserved;
+};
+
+struct mlx5_ib_modify_wq {
+ __u32 comp_mask;
+ __u32 reserved;
+};
+#endif /* MLX5_ABI_USER_H */